In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from scipy.spatial.distance import euclidean
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import clear_output
from datetime import datetime
from urllib.parse import urlparse

In [3]:
#targets:
#    0: porn
#    1: propaganda
#    2: spam
#    3: fake followers
#    4: genuine accounts

def _read_users(category):
    """Load the ``users.csv`` table stored under ``data/<category>/``."""
    return pd.read_csv('data/' + category + '/users.csv', encoding='utf-8-sig')


# One user table per account category (see the target legend above).
porn_users = _read_users('porn')
prop_users = _read_users('propaganda')
spam_users = _read_users('spam')
fake_users = _read_users('fake_followers')
genuine_users = _read_users('genuine')

# The 'id' column of every table; later cells iterate over these ids.
porn_ids, prop_ids, spam_ids, fake_ids, gen_ids = (
    users['id']
    for users in (porn_users, prop_users, spam_users, fake_users, genuine_users)
)

# TF-IDF

In [47]:
def compute_centroid(tf_idf):
    """Return the centroid (mean row vector) of a document-term matrix.

    Parameters
    ----------
    tf_idf : array-like of shape (n_documents, n_terms)
        One row per document. NumPy arrays, ``np.matrix`` objects and SciPy
        sparse matrices are all accepted.

    Returns
    -------
    numpy.ndarray of shape (n_terms,)
        Mean of the rows. A matrix with zero rows yields NaNs.
    """
    # Sum over the documents (axis 0). Summing over axis 1 would add up the
    # terms of each document instead, which is not a centroid.
    column_totals = np.asarray(tf_idf.sum(axis=0)).ravel()
    return column_totals / tf_idf.shape[0]

In [48]:
def dist_from_centroid(tf_idf, centroid):
    """Return the Euclidean distance of every row of ``tf_idf`` from ``centroid``.

    Parameters
    ----------
    tf_idf : array-like of shape (n_documents, n_terms)
        Dense document-term matrix (NumPy array or ``np.matrix``).
    centroid : array-like
        Reference vector with one entry per term.

    Returns
    -------
    list
        One distance per row, in row order.
    """
    # Work on plain ndarray rows so each row is a 1-D vector.
    rows = np.asarray(tf_idf)
    # The distance is measured for each individual row, not for the whole
    # matrix at once.
    return [np.linalg.norm(row - centroid) for row in rows]

In [49]:
def wss(id, tweets_df, is_tweet = 1):
    """Average distance of one user's TF-IDF vectors from their centroid.

    Parameters
    ----------
    id : user id matched against ``tweets_df.user_id``.
    tweets_df : pandas.DataFrame
        Tweets table with ``user_id`` plus ``full_text`` (``is_tweet == 1``)
        or ``url`` (``is_tweet == 0``) columns.
    is_tweet : int, default 1
        1 vectorises the tweet texts, 0 vectorises the domains of the URLs.

    Returns
    -------
    float
        Sum of the per-document distances divided by the number of documents.

    Raises
    ------
    ValueError
        If ``is_tweet`` is neither 0 nor 1. ``TfidfVectorizer`` also raises
        ``ValueError`` when no vocabulary can be built (e.g. a user with no
        usable text).
    """
    user_rows = tweets_df[tweets_df.user_id == id]

    if is_tweet == 1:
        # One document per tweet text.
        vector = user_rows['full_text']
    elif is_tweet == 0:
        # One document per tweet: the network location (domain) of its URL.
        # Missing URLs become empty strings, whose netloc is also empty.
        vector = user_rows['url'].fillna('').astype(str).map(lambda url: urlparse(url).netloc)
    else:
        # Fail explicitly instead of printing and then hitting a NameError on
        # the undefined `vector` below.
        raise ValueError('Invalid Input')
    n_vectors = len(vector)

    transformer = TfidfVectorizer(smooth_idf=True)
    tf_idf = transformer.fit_transform(vector).todense()

    centroid = compute_centroid(tf_idf)
    distances = dist_from_centroid(tf_idf, centroid)
    avg_dist = np.asarray(distances).sum()/n_vectors

    return avg_dist

In [50]:
def intradistance(bot_ids, bot_type, is_tweet=1):
    """Compute ``wss`` for every user of a category and save the result as CSV.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), evaluates ``wss``
    for each id in ``bot_ids`` and writes a ``user_id`` / distance table next
    to it. A user whose computation raises gets a distance of 0 and the
    exception message is printed.

    Parameters
    ----------
    bot_ids : pandas.Series
        User ids to process (``.values`` is used for the output column).
    bot_type : str
        Name of the sub-directory of ``data/`` holding the category.
    is_tweet : int, default 1
        1 -> tweet texts (``tweet_intradistance.csv``),
        0 -> URL domains (``url_intradistance.csv``).
        Any other value prints 'Invalid Input' and writes nothing.
    """
    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    distances = []
    for processed, user in enumerate(bot_ids, start=1):
        try:
            user_distance = wss(user, tweets_df, is_tweet)
        except Exception as e:
            # Best effort: report the problem and fall back to a zero distance.
            print(e)
            user_distance = 0
        distances.append(user_distance)

        # Refresh the progress line every ten users.
        if processed % 10 == 0:
            clear_output()
            print(str(processed) +  " " + bot_type + " bots processed")

    dist = pd.DataFrame()
    dist['user_id'] = bot_ids.values

    # Pick the output column and file from the mode.
    if is_tweet == 1:
        column, file_name = 'tweet_intradistance', '/tweet_intradistance.csv'
    elif is_tweet == 0:
        column, file_name = 'url_intradistance', '/url_intradistance.csv'
    else:
        column, file_name = None, None

    if column is None:
        print ('Invalid Input')
    else:
        dist[column] = distances
        dist.to_csv('data/' + bot_type + file_name, index=False)

    print(bot_type + " done!")

In [51]:
# Tweet-text intradistance for the porn accounts (writes data/porn/tweet_intradistance.csv).
intradistance(porn_ids, 'porn', is_tweet=1)

KeyboardInterrupt: 

In [None]:
# URL-domain intradistance for the porn accounts (writes data/porn/url_intradistance.csv).
intradistance(porn_ids, 'porn', is_tweet=0)

In [None]:
# Tweet-text intradistance for the spam accounts (writes data/spam/tweet_intradistance.csv).
intradistance(spam_ids, 'spam', is_tweet=1)

In [None]:
# URL-domain intradistance for the spam accounts (writes data/spam/url_intradistance.csv).
intradistance(spam_ids, 'spam', is_tweet=0)

In [None]:
# Tweet-text intradistance for the fake followers (writes data/fake_followers/tweet_intradistance.csv).
intradistance(fake_ids, 'fake_followers', is_tweet=1)

In [12]:
# URL-domain intradistance for the fake followers (writes data/fake_followers/url_intradistance.csv).
intradistance(fake_ids, 'fake_followers', is_tweet=0)

9050 fake_followers bots processed
empty vocabulary; perhaps the documents only contain stop words
fake_followers done!


In [13]:
# Tweet-text intradistance for the propaganda accounts (writes data/propaganda/tweet_intradistance.csv).
intradistance(prop_ids, 'propaganda', is_tweet=1)

3370 propaganda bots processed
propaganda done!


In [14]:
# URL-domain intradistance for the propaganda accounts (writes data/propaganda/url_intradistance.csv).
intradistance(prop_ids, 'propaganda', is_tweet=0)

3370 propaganda bots processed
propaganda done!


In [15]:
# Tweet-text intradistance for the genuine accounts (writes data/genuine/tweet_intradistance.csv).
intradistance(gen_ids, 'genuine', is_tweet=1)

3660 genuine bots processed
empty vocabulary; perhaps the documents only contain stop words
genuine done!


In [16]:
# URL-domain intradistance for the genuine accounts (writes data/genuine/url_intradistance.csv).
intradistance(gen_ids, 'genuine', is_tweet=0)

3660 genuine bots processed
empty vocabulary; perhaps the documents only contain stop words
genuine done!


# Context Score

In [54]:
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, normalize

# Tab-separated tweet tables, one per account category, loaded in the same
# order as the names on the left-hand side.
(porn_tweets_df, prop_tweets_df, spam_tweets_df, fake_tweets_df, genuine_tweets_df) = (
    pd.read_csv('data/' + category + '/tweets.csv', encoding='utf-8-sig', sep='\t')
    for category in ('porn', 'propaganda', 'spam', 'fake_followers', 'genuine')
)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [55]:
from nltk.corpus import stopwords
import string
def get_main_words(tweets_df):
    """Rank the most frequent non-stop-word tokens of a tweets table.

    Every ``full_text`` value is cast to ``str``, lower-cased and split on
    whitespace. English, Italian, French and Spanish NLTK stop words,
    single punctuation characters and a few hand-picked tokens are removed
    from the counts.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Table with a ``full_text`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``score`` (raw occurrence count), most frequent
        first. The 300 most common tokens are taken and the last one is
        dropped, so at most 299 rows are returned.
    """
    n_words = 300
    texts = tweets_df['full_text'].values.astype('str')

    # Everything that must not appear in the ranking.
    ignored_tokens = []
    for language in ('english', 'italian', 'french', 'spanish'):
        ignored_tokens.extend(stopwords.words(language))
    ignored_tokens.extend(string.punctuation)
    ignored_tokens.extend(['...', '"the', "i'm", 'go', 'time', 'get', 'rt', 'via', '&amp;', "it's"])

    # Count whitespace-separated, lower-cased tokens over all tweets.
    frequencies = Counter(token for text in texts for token in text.lower().split())

    for token in ignored_tokens:
        frequencies.pop(token, None)

    ranking = pd.DataFrame(frequencies.most_common(n_words), columns=['word', 'score'])

    return ranking.iloc[:n_words - 1]

In [56]:
def unique_vocabularies():
    """Build per-category vocabularies that exclude the other categories' top words.

    For each category the function reads ``data/<category>/main_words.csv``,
    removes every word that is among the first 50 words of any *other*
    category, min-max scales the remaining scores to [0, 1], drops the last
    remaining row and writes the result to
    ``data/<category>/filtered_main_words.csv``.

    The number of words kept per category (before the last row is dropped)
    is printed. Nothing is returned.
    """
    # (sub-directory of data/, label used in the printed summary)
    categories = [
        ('porn', 'porn'),
        ('propaganda', 'prop'),
        ('spam', 'spam'),
        ('fake_followers', 'fake'),
        ('genuine', 'genuine'),
    ]
    n_top = 50  # size of the "main words" head that other categories must not share

    vocabularies = {
        folder: pd.read_csv('data/' + folder + '/main_words.csv', sep=',')
        for folder, _ in categories
    }
    top_words = {folder: set(voc['word'][:n_top]) for folder, voc in vocabularies.items()}

    # Words of a category minus the top words of all the other categories.
    kept_words = {}
    for folder, _ in categories:
        foreign_top = set()
        for other, _ in categories:
            if other != folder:
                foreign_top |= top_words[other]
        kept_words[folder] = set(vocabularies[folder]['word']) - foreign_top

    for folder, label in categories:
        print(str(len(kept_words[folder])) + ' ' + label + ' words')

    # normalize scores
    filtered = {}
    for folder, _ in categories:
        voc = vocabularies[folder]
        # .copy() so the score assignment below writes to an independent
        # frame instead of a slice of the original (no chained assignment).
        voc = voc[voc['word'].isin(kept_words[folder])].copy()
        voc['score'] = MinMaxScaler().fit_transform(voc['score'].values.reshape(-1, 1)).ravel()
        # 'Unnamed: 0' is the index column written by to_csv in the export cell.
        voc = voc.drop(columns='Unnamed: 0', errors='ignore')
        # Drop the last remaining row, as the original pipeline did.
        filtered[folder] = voc.iloc[:-1].reset_index(drop=True)

    # Write only after every category was processed successfully.
    for folder, _ in categories:
        filtered[folder].to_csv('data/' + folder + '/filtered_main_words.csv', encoding='utf-8-sig')

In [57]:
# Rank the main words of every category...
porn_vocabulary = get_main_words(porn_tweets_df)
prop_vocabulary = get_main_words(prop_tweets_df)
spam_vocabulary = get_main_words(spam_tweets_df)
fake_vocabulary = get_main_words(fake_tweets_df)
genuine_vocabulary = get_main_words(genuine_tweets_df)

# ...and store each ranking next to the category's data.
for _folder, _vocabulary in (
    ('porn', porn_vocabulary),
    ('propaganda', prop_vocabulary),
    ('spam', spam_vocabulary),
    ('fake_followers', fake_vocabulary),
    ('genuine', genuine_vocabulary),
):
    _vocabulary.to_csv('data/' + _folder + '/main_words.csv', encoding='utf-8-sig')

In [58]:
# Filter the main_words.csv files written above into filtered_main_words.csv (one per category).
unique_vocabularies()

210 porn words
238 prop words
231 spam words
234 fake words
224 genuine words




### Compute Context Score

In [59]:
# Filtered vocabularies (word + scaled score); compute_score reads these
# module-level tables.
porn_words, prop_words, spam_words, fake_words, genuine_words = (
    pd.read_csv('data/' + category + '/filtered_main_words.csv', sep=',')
    for category in ('porn', 'propaganda', 'spam', 'fake_followers', 'genuine')
)

In [60]:
def compute_score(tweets):
    """Score every tweet against the five filtered category vocabularies.

    A tweet's score for a category is the sum of the ``score`` values of the
    vocabulary words that occur among its whitespace-separated tokens.

    Parameters
    ----------
    tweets : iterable of str
        Tweet texts. Non-string entries (e.g. NaN) score 0 everywhere.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``porn_words_score``,
        ``prop_words_score``, ``spam_words_score``, ``fake_words_score`` and
        ``genuine_words_score`` (all five are present even for empty input).
    """
    # Module-level vocabularies loaded from the filtered_main_words.csv files.
    vocabularies = (
        ('porn_words_score', porn_words),
        ('prop_words_score', prop_words),
        ('spam_words_score', spam_words),
        ('fake_words_score', fake_words),
        ('genuine_words_score', genuine_words),
    )

    rows = []
    for tweet in tweets:
        # The vocabularies were built from lower-cased tokens, so the tweet is
        # lower-cased before matching; tokenise once per tweet.
        tokens = set(tweet.lower().split()) if isinstance(tweet, str) else set()
        rows.append({
            column: vocabulary.loc[vocabulary['word'].isin(tokens), 'score'].sum()
            for column, vocabulary in vocabularies
        })

    # Build the frame once instead of appending row by row.
    return pd.DataFrame(rows, columns=[column for column, _ in vocabularies])

In [61]:
def score(tweets_df, id):
    """Return the average per-tweet vocabulary scores of one user.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Tweets table with ``user_id`` and ``full_text`` columns.
    id : user id matched against ``tweets_df.user_id``.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        For a user with tweets: the column sums of ``compute_score`` divided
        by the number of tweets (a Series). For a user without tweets: a
        one-row DataFrame of zeros.
    """
    user_tweets = tweets_df.loc[tweets_df.user_id == id, 'full_text']
    n_tweets = len(user_tweets)

    # No tweets: every category scores zero.
    if n_tweets == 0:
        return pd.DataFrame({'porn_words_score': 0, 'prop_words_score': 0, 'spam_words_score': 0, 'fake_words_score': 0, 'genuine_words_score': 0}, index=[0])

    # Total score per category over all tweets, averaged over the tweets.
    totals = compute_score(user_tweets).sum()
    return np.divide(totals, n_tweets)

In [64]:
def context_score(bot_ids, bot_type):
    """Compute the context scores of every user of a category and save them.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), calls ``score`` for
    each id in ``bot_ids`` and writes the five score columns plus ``user_id``
    to ``data/<bot_type>/context_score.csv``.

    Parameters
    ----------
    bot_ids : pandas.Series
        User ids to process (``.values`` is used for the output column).
    bot_type : str
        Name of the sub-directory of ``data/`` holding the category.
    """
    score_columns = ['porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score', 'genuine_words_score']

    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    # Collect one record per user and build the frame once; DataFrame.append
    # in a loop is quadratic and no longer exists in pandas 2.
    records = []
    for processed, user_id in enumerate(bot_ids, start=1):
        scores = score(tweets_df, user_id)
        # score() yields a Series for users with tweets and a one-row
        # DataFrame for users without; normalise both to a Series.
        if isinstance(scores, pd.DataFrame):
            scores = scores.iloc[0]
        records.append(scores.to_dict())
        if processed % 10 == 0:
            clear_output()
            print(str(processed) +  " " + bot_type + " bots processed")

    score_df = pd.DataFrame(records, columns=score_columns)
    score_df['user_id'] = bot_ids.values

    score_df.to_csv('data/' + bot_type + '/context_score.csv', index=False)

    print(bot_type + " done!")

In [66]:
# Context scores for the porn accounts (writes data/porn/context_score.csv).
context_score(porn_ids, 'porn')

6930 porn bots processed
porn done!


In [67]:
# Context scores for the propaganda accounts (writes data/propaganda/context_score.csv).
context_score(prop_ids, 'propaganda')

3370 propaganda bots processed
propaganda done!


In [68]:
# Context scores for the spam accounts (writes data/spam/context_score.csv).
context_score(spam_ids, 'spam')

5360 spam bots processed
spam done!


In [69]:
# Context scores for the fake followers (writes data/fake_followers/context_score.csv).
context_score(fake_ids, 'fake_followers')

9050 fake_followers bots processed
fake_followers done!


In [70]:
# Context scores for the genuine accounts (writes data/genuine/context_score.csv).
context_score(gen_ids, 'genuine')

3660 genuine bots processed
genuine done!


# Descriptive features

In [71]:
def describe_tweets(tweets):
    """Summarise one user's tweets as a single row of descriptive features.

    Parameters
    ----------
    tweets : pandas.DataFrame
        The tweets of one user (must not be empty); forwarded to
        ``tweet_perc``, ``tweet_desc`` and ``tweet_freq``.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with average/maximum/minimum text length, retweet
        count, favourite count and hashtag count, the tweet frequency and the
        retweet/media/url/quote shares.
    """
    ret_perc, media_perc, url_perc, quote_perc = tweet_perc(tweets)

    avg_len, avg_ret, avg_fav, avg_hash = tweet_desc(tweets, 'avg')
    max_len, max_ret, max_fav, max_hash = tweet_desc(tweets, 'max')
    min_len, min_ret, min_fav, min_hash = tweet_desc(tweets, 'min')

    freq = tweet_freq(tweets)

    desc_features = pd.DataFrame({
        'avg_len': avg_len, 'max_len': max_len, 'min_len': min_len,
        'avg_ret': avg_ret, 'max_ret': max_ret, 'min_ret': min_ret,
        'avg_fav': avg_fav, 'max_fav': max_fav, 'min_fav': min_fav,
        # 'min_hash' must carry the minimum, not a second copy of the maximum.
        'avg_hash': avg_hash, 'max_hash': max_hash, 'min_hash': min_hash,
        'freq': freq,
        'ret_perc': ret_perc, 'media_perc': media_perc, 'url_perc': url_perc, 'quote_perc': quote_perc,
    }, index=[0])

    return desc_features

In [72]:
def tweet_perc(tweets):
    """Return the shares of retweets, media tweets, URL tweets and quotes.

    A tweet counts as retweet / media / URL tweet when its
    ``retweeted_status`` / ``extended_entities`` / ``url`` value is not null;
    the quote share is the sum of ``is_quote_status`` over the tweet count.

    Returns
    -------
    tuple
        ``(ret_perc, media_perc, url_perc, quote_perc)``.
    """
    total = len(tweets)

    def share_present(column):
        # Fraction of rows in which the column holds a value.
        return column.notnull().sum() / total

    return (
        share_present(tweets.retweeted_status),
        share_present(tweets.extended_entities),
        share_present(tweets.url),
        tweets.is_quote_status.sum() / total,
    )

In [73]:
def hashtag_count(tweets):
    """Return, for every tweet text, how many '#' characters it contains.

    Parameters
    ----------
    tweets : iterable of str

    Returns
    -------
    list of int
        One count per tweet, in input order.
    """
    return [text.count('#') for text in tweets]

In [74]:
def tweet_desc(tweets, metric):
    """Aggregate text length, retweets, favourites and hashtags of some tweets.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Table with ``full_text`` (strings), ``retweet_count`` and
        ``favorite_count`` columns.
    metric : {'avg', 'max', 'min'}
        Aggregation applied to every quantity.

    Returns
    -------
    tuple
        ``(length, retweets, favourites, hashtags)`` aggregated with ``metric``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    reducers = {'avg': np.mean, 'max': max, 'min': min}
    if metric not in reducers:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError("metric must be one of 'avg', 'max' or 'min', got " + repr(metric))
    reduce = reducers[metric]

    texts = tweets['full_text']
    text_lengths = texts.apply(len)
    # Number of '#' characters per tweet.
    hashtags = [text.count('#') for text in texts]

    return (
        reduce(text_lengths),
        reduce(tweets.retweet_count),
        reduce(tweets.favorite_count),
        reduce(hashtags),
    )

In [75]:
def tweet_freq(tweets):
    """Return the number of tweets per day over the span the tweets cover.

    The span is the number of calendar days between the first and the last
    ``created_at`` entry, both included, regardless of which of the two is
    the more recent one.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Table with a ``created_at`` column of strings. The day is read from
        characters 8-9, the month abbreviation from characters 4-6 and the
        year from the last four characters.
        NOTE(review): this matches timestamps such as
        'Wed Oct 10 20:19:24 +0000 2018' - confirm against the data.

    Returns
    -------
    float
        ``len(tweets) / span_in_days``; 0.0 when there are no tweets.
    """
    def _calendar_day(created_at):
        # Keep only day, month and year; the time of day is ignored.
        stamp = created_at[8:10] + ' ' + created_at[4:7] + ' ' + created_at[-4:]
        return datetime.strptime(stamp, '%d %b %Y')

    dates = list(tweets.created_at)
    if not dates:
        return 0.0

    # abs() keeps the span positive when the rows are ordered oldest-first,
    # which would otherwise give a negative frequency or a division by zero.
    span_days = abs((_calendar_day(dates[0]) - _calendar_day(dates[-1])).days) + 1

    return len(tweets) / span_days

In [76]:
def describe(tweets_df, id):
    """Return the descriptive features of one user as a one-row DataFrame.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Tweets table with a ``user_id`` column.
    id : user id matched against ``tweets_df.user_id``.

    Returns
    -------
    pandas.DataFrame
        ``describe_tweets`` of the user's tweets, or a row of zeros with the
        same 17 feature columns when the user has no tweets.
    """
    user_tweets = tweets_df[tweets_df.user_id == id]

    # Users with at least one tweet get the real statistics.
    if len(user_tweets) > 0:
        return describe_tweets(user_tweets)

    # No tweets: every feature defaults to zero.
    feature_names = [
        'avg_len', 'max_len', 'min_len',
        'avg_ret', 'max_ret', 'min_ret',
        'avg_fav', 'max_fav', 'min_fav',
        'avg_hash', 'max_hash', 'min_hash',
        'freq', 'ret_perc', 'media_perc', 'url_perc', 'quote_perc',
    ]
    return pd.DataFrame(dict.fromkeys(feature_names, 0), index=[0])

In [77]:
def descriptive_features(bot_ids, bot_type):
    """Compute the descriptive features of every user of a category and save them.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), calls ``describe``
    for each id in ``bot_ids`` and writes the feature columns plus
    ``user_id`` to ``data/<bot_type>/descriptive_features.csv``.

    Parameters
    ----------
    bot_ids : pandas.Series
        User ids to process (``.values`` is used for the output column).
    bot_type : str
        Name of the sub-directory of ``data/`` holding the category.
    """
    feature_columns = ['avg_len', 'max_len', 'min_len', 'avg_ret', 'max_ret', 'min_ret', 'avg_fav', 'max_fav', 'min_fav', 'avg_hash', 'max_hash', 'min_hash', 'freq', 'ret_perc', 'media_perc', 'url_perc', 'quote_perc']

    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    # Collect the one-row frames and concatenate once; DataFrame.append in a
    # loop is quadratic and no longer exists in pandas 2.
    frames = []
    for processed, user_id in enumerate(bot_ids, start=1):
        frames.append(describe(tweets_df, user_id))
        if processed % 10 == 0:
            clear_output()
            print(str(processed) +  " " + bot_type + " bots processed")

    if frames:
        desc_df = pd.concat(frames, ignore_index=True).reindex(columns=feature_columns)
    else:
        desc_df = pd.DataFrame(columns=feature_columns)

    desc_df['user_id'] = bot_ids.values

    desc_df.to_csv('data/' + bot_type + '/descriptive_features.csv', index=False)

    print(bot_type + " done!")

In [78]:
# Descriptive features for the porn accounts (writes data/porn/descriptive_features.csv).
descriptive_features(porn_ids, 'porn')

6930 porn bots processed
porn done!


In [79]:
# Descriptive features for the propaganda accounts (writes data/propaganda/descriptive_features.csv).
descriptive_features(prop_ids, 'propaganda')

3370 propaganda bots processed
propaganda done!


In [80]:
# Descriptive features for the spam accounts (writes data/spam/descriptive_features.csv).
descriptive_features(spam_ids, 'spam')

5360 spam bots processed
spam done!


In [81]:
# Descriptive features for the fake followers (writes data/fake_followers/descriptive_features.csv).
descriptive_features(fake_ids, 'fake_followers')

9050 fake_followers bots processed
fake_followers done!


In [82]:
# Descriptive features for the genuine accounts (writes data/genuine/descriptive_features.csv).
descriptive_features(gen_ids, 'genuine')

3660 genuine bots processed
genuine done!


# Semantic

In [4]:
def clusterize_tweets(tweets_df, k=5):
    """Cluster tweets with k-means on their TF-IDF representation.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Table with a ``full_text`` column; missing texts are treated as
        empty strings.
    k : int, default 5
        Number of clusters.

    Returns
    -------
    numpy.ndarray
        Cluster index of every tweet, in row order.
    """
    tweets = tweets_df['full_text'].fillna('').astype(str)

    transformer = TfidfVectorizer(smooth_idf=True)
    # Keep the matrix sparse: KMeans accepts sparse input, whereas a dense
    # copy of a whole corpus can exhaust memory.
    tf_idf = transformer.fit_transform(tweets)

    kmeans = KMeans(n_clusters=k, random_state=0).fit(tf_idf)

    # The fitted assignments live in `labels_` (with trailing underscore).
    return kmeans.labels_

In [5]:
# Load the tab-separated porn tweets that are clustered below.
tweets_df = pd.read_csv('data/porn/tweets.csv', encoding='utf-8-sig', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Cluster the loaded tweets into five groups; `labels` holds one cluster index per tweet.
labels = clusterize_tweets(tweets_df, k=5)

In [1]:
# Load the tab-separated propaganda tweets. The category is stored under
# data/propaganda/ (the directory every other cell uses), not data/prop/.
tweets_df = pd.read_csv('data/propaganda/tweets.csv', encoding='utf-8-sig', sep='\t')

NameError: name 'pd' is not defined

In [2]:
# Display the cluster labels (requires the clustering cell to have been run in this kernel).
labels

NameError: name 'labels' is not defined