In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.spatial.distance import euclidean
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import clear_output
from datetime import datetime
from urllib.parse import urlparse

In [2]:
# Class labels (targets) used for the account categories:
#    0: porn
#    1: propaganda
#    2: spam
#    3: fake followers
#    4: genuine accounts

# Load the user table of every category. Generator expressions are used so
# that no loop variable leaks into the notebook namespace.
porn_users, prop_users, spam_users, fake_users, genuine_users = (
    pd.read_csv('data/' + folder + '/users.csv', encoding='utf-8-sig')
    for folder in ('porn', 'propaganda', 'spam', 'fake_followers', 'genuine')
)

# Keep the `id` column of each table; these Series drive the per-user loops.
porn_ids, prop_ids, spam_ids, fake_ids, gen_ids = (
    users['id']
    for users in (porn_users, prop_users, spam_users, fake_users, genuine_users)
)

# TF-IDF

In [9]:
def compute_centroid(tf_idf):
    """Return the centroid (mean row vector) of a document-term matrix.

    Parameters
    ----------
    tf_idf : 2-D array or matrix
        One row per document, one column per term.

    Returns
    -------
    The column-wise mean of the rows: shape ``(n_terms,)`` for an ndarray,
    ``(1, n_terms)`` for an ``np.matrix``.
    """
    # Reduce over axis=0 (the documents). Summing over axis=1 would return
    # one row-sum per document, which is not a point in term space.
    center = tf_idf.sum(axis=0)/tf_idf.shape[0]
    return center

In [10]:
def dist_from_centroid(tf_idf, centroid):
    """Return the Euclidean distance of every row of `tf_idf` from `centroid`.

    Parameters
    ----------
    tf_idf : 2-D array or matrix
        One row per document.
    centroid : array-like
        Vector broadcastable against a single row of `tf_idf`.

    Returns
    -------
    list
        One distance per row, in row order.
    """
    # Use the current row (`elem`), not the whole matrix: subtracting the
    # centroid from `tf_idf` itself yields the same whole-matrix norm for
    # every row.
    return [np.linalg.norm(elem - centroid) for elem in tf_idf]

In [11]:
def wss(id, tweets_df, is_tweet = 1):
    """Average distance of one user's documents from their TF-IDF centroid.

    Parameters
    ----------
    id : user id matched against ``tweets_df.user_id``.
    tweets_df : DataFrame with a ``user_id`` column and a ``full_text``
        column (``is_tweet == 1``) or a ``url`` column (``is_tweet == 0``).
    is_tweet : 1 to use the tweet texts, 0 to use the URL domains.

    Returns
    -------
    The sum of the per-document distances from the centroid divided by the
    number of documents.

    Raises
    ------
    ValueError
        If `is_tweet` is neither 0 nor 1.
    Any exception raised by ``TfidfVectorizer.fit_transform`` (for example
    when the user has no usable documents) propagates to the caller.
    """
    user_rows = tweets_df[tweets_df.user_id == id]

    if is_tweet == 1:
        # get tweets per id
        vector = user_rows['full_text']
    elif is_tweet == 0:
        # get domains per id. urlparse() handles a single string, so it is
        # mapped over the column instead of being called on the Series.
        # NOTE(review): assumes each non-null `url` cell holds one URL
        # string - confirm against the dataset.
        vector = user_rows['url'].dropna().astype(str).map(lambda url: urlparse(url).netloc)
    else:
        # Fail explicitly instead of printing and then crashing on an
        # unbound local further down.
        raise ValueError('Invalid Input')
    n_vectors = len(vector)

    transformer = TfidfVectorizer(smooth_idf=True)
    tf_idf = transformer.fit_transform(vector).todense()

    centroid = compute_centroid(tf_idf)
    distances = dist_from_centroid(tf_idf, centroid)
    avg_dist = np.asarray(distances).sum()/n_vectors

    return avg_dist

In [12]:
def intradistance(bot_ids, bot_type, is_tweet=1):
    """Compute the intradistance of every user and store it as a CSV.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), calls ``wss`` for
    each id in `bot_ids` and writes ``tweet_intradistance.csv``
    (``is_tweet == 1``) or ``url_intradistance.csv`` (``is_tweet == 0``) in
    the same folder. Any other `is_tweet` value prints 'Invalid Input' and
    writes nothing.

    A user whose computation raises is reported on stdout and recorded with
    a distance of 0.
    """
    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    per_user = []
    for processed, user in enumerate(bot_ids, start=1):
        try:
            value = wss(user, tweets_df, is_tweet)
        except Exception as e:
            # Best effort: report the problem and fall back to 0 for this user.
            print(e)
            value = 0
        per_user.append(value)
        if processed % 10 == 0:
            clear_output()
            print(str(processed) +  " " + bot_type + " bots processed")

    dist = pd.DataFrame({'user_id': bot_ids.values})

    # The mode selects both the column name and the output file name.
    if is_tweet == 1:
        column = 'tweet_intradistance'
    elif is_tweet == 0:
        column = 'url_intradistance'
    else:
        column = None

    if column is None:
        print ('Invalid Input')
    else:
        dist[column] = per_user
        dist.to_csv('data/' + bot_type + '/' + column + '.csv')

    print(bot_type + " done!")

In [15]:
intradistance(porn_ids, 'porn', is_tweet=1)

6910 porn bots processed
empty vocabulary; perhaps the documents only contain stop words
porn done!


In [None]:
intradistance(porn_ids, 'porn', is_tweet=0)

In [16]:
intradistance(spam_ids, 'spam', is_tweet=1)

4940 spam bots processed
spam done!


In [None]:
intradistance(spam_ids, 'spam', is_tweet=0)

In [17]:
intradistance(fake_ids, 'fake_followers', is_tweet=1)

8820 fake_followers bots processed
empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words
fake_followers done!


In [None]:
intradistance(fake_ids, 'fake_followers', is_tweet=0)

In [18]:
intradistance(prop_ids, 'propaganda', is_tweet=1)

3300 propaganda bots processed
propaganda done!


In [None]:
intradistance(prop_ids, 'propaganda', is_tweet=0)

In [None]:
# Genuine accounts must be looked up with their own ids (gen_ids); passing
# the propaganda ids would match the wrong users in the genuine tweets.
intradistance(gen_ids, 'genuine', is_tweet=1)

In [None]:
# Genuine accounts must be looked up with their own ids (gen_ids); passing
# the propaganda ids would match the wrong users in the genuine tweets.
intradistance(gen_ids, 'genuine', is_tweet=0)

# Context Score

In [19]:
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, normalize

# Raw tweet tables of the four bot categories, used to build the vocabularies.
porn_tweets_df = pd.read_csv('data/porn/tweets.csv', encoding='utf-8-sig', sep='\t')
prop_tweets_df = pd.read_csv('data/propaganda/tweets.csv', encoding='utf-8-sig', sep='\t')
spam_tweets_df = pd.read_csv('data/spam/tweets.csv', encoding='utf-8-sig', sep='\t')
# Read tab separated like the other three tables; the functions in this
# notebook that load data/<bot_type>/tweets.csv all pass sep='\t'.
fake_tweets_df = pd.read_csv('data/fake_followers/tweets.csv', encoding='utf-8-sig', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
from nltk.corpus import stopwords
import string
def get_main_words(tweets_df):
    """Return the most frequent non-stop-word tokens of ``tweets_df['full_text']``.

    Tweets are lower-cased and split on whitespace; stop words (English,
    Italian, French, Spanish), punctuation marks and a few hand-picked
    tokens are discarded.

    Returns
    -------
    DataFrame with columns ``word`` and ``score`` (the occurrence count),
    most frequent first. Note that the top ``n_words`` entries are selected
    and the last one is then dropped, so at most ``n_words - 1`` rows are
    returned.
    """
    n_words = 300
    texts = tweets_df['full_text'].values.astype('str')

    # Everything that must not appear in the vocabulary.
    excluded = (
        stopwords.words('english')
        + stopwords.words('italian')
        + stopwords.words('french')
        + stopwords.words('spanish')
        + list(string.punctuation)
        + ['...', '"the', "i'm", 'go', 'time', 'get', 'rt', 'via', '&amp;']
        + ["it's"]
    )

    # Count every whitespace-separated, lower-cased token.
    counts = Counter(token for text in texts for token in text.lower().split())

    # Drop the excluded tokens after counting.
    for token in excluded:
        counts.pop(token, None)

    top = counts.most_common(n_words)
    ranking = pd.DataFrame(data=top, index=None, columns=['word', 'score'])

    return ranking[:n_words-1]

In [21]:
def _filter_and_normalize(voc, keep_words):
    """Keep the rows of `voc` whose word is in `keep_words` and rescale their scores.

    Scores are min-max scaled, the ``Unnamed: 0`` column is removed, the
    last remaining row is dropped and the index is reset.
    """
    # Work on an explicit copy so the column assignment below does not
    # write into a slice of the original frame.
    filtered = voc[voc['word'].isin(list(keep_words))].copy()
    filtered['score'] = MinMaxScaler().fit_transform(filtered.score.values.reshape(-1, 1))
    filtered = filtered.drop(columns='Unnamed: 0')
    filtered = filtered.drop(filtered.tail(1).index)
    return filtered.reset_index(drop=True)


def unique_vocabularies():
    """Build category-specific vocabularies and write them to disk.

    Reads ``data/<category>/main_words.csv`` for the four bot categories,
    removes from each vocabulary the first 50 words of every other
    category, prints the remaining vocabulary sizes, normalizes the scores
    and writes ``data/<category>/filtered_main_words.csv``.
    """
    top_n = 50
    # (label used in the printed summary, data folder)
    categories = [('porn', 'porn'), ('prop', 'propaganda'), ('spam', 'spam'), ('fake', 'fake_followers')]

    vocabularies = {
        label: pd.read_csv('data/' + folder + '/main_words.csv', sep=',')
        for label, folder in categories
    }
    main_words = {label: set(voc['word'][:top_n]) for label, voc in vocabularies.items()}

    # A word stays in a category unless it is a main word of another category.
    unique_words = {}
    for label, _ in categories:
        others = set().union(*(main_words[other] for other in main_words if other != label))
        unique_words[label] = set(vocabularies[label]['word']) - others

    for label, _ in categories:
        print(str(len(unique_words[label])) + ' ' + label + ' words')

    # normalize scores
    filtered = {
        label: _filter_and_normalize(vocabularies[label], unique_words[label])
        for label, _ in categories
    }

    for label, folder in categories:
        filtered[label].to_csv('data/' + folder + '/filtered_main_words.csv', encoding='utf-8-sig')

In [25]:
# Build the frequency vocabulary of each category and store it next to the
# category's data; unique_vocabularies() reads these main_words.csv files.
porn_vocabulary = get_main_words(porn_tweets_df)
porn_vocabulary.to_csv('data/porn/main_words.csv', encoding='utf-8-sig')
prop_vocabulary = get_main_words(prop_tweets_df)
prop_vocabulary.to_csv('data/propaganda/main_words.csv', encoding='utf-8-sig')
spam_vocabulary = get_main_words(spam_tweets_df)
spam_vocabulary.to_csv('data/spam/main_words.csv', encoding='utf-8-sig')

In [23]:
# Same as for the other categories: build the fake-followers vocabulary and
# write it to the main_words.csv file that unique_vocabularies() reads.
fake_vocabulary = get_main_words(fake_tweets_df)
fake_vocabulary.to_csv('data/fake_followers/main_words.csv', encoding='utf-8-sig')

In [26]:
unique_vocabularies()

225 porn words
248 prop words
244 spam words
239 fake words




### Compute Context Score

In [27]:
# Category-specific vocabularies (word + normalized score) used as
# module-level lookups by compute_score().
porn_words, prop_words, spam_words, fake_words = (
    pd.read_csv('data/' + folder + '/filtered_main_words.csv', sep=',')
    for folder in ('porn', 'propaganda', 'spam', 'fake_followers')
)

In [28]:
def compute_score(tweets):
    """Score every tweet against the four category vocabularies.

    For each tweet and each category, the score is the sum of the
    vocabulary scores of the vocabulary words that occur among the tweet's
    whitespace-separated tokens.

    Parameters
    ----------
    tweets : iterable of str

    Returns
    -------
    DataFrame with one row per tweet and the columns ``porn_words_score``,
    ``prop_words_score``, ``spam_words_score`` and ``fake_words_score``.
    """
    # Reads the module-level vocabularies porn_words / prop_words /
    # spam_words / fake_words.
    vocabularies = (
        ('porn_words_score', porn_words),
        ('prop_words_score', prop_words),
        ('spam_words_score', spam_words),
        ('fake_words_score', fake_words),
    )

    # Collect plain records and build the frame once: DataFrame.append is
    # quadratic in a loop and no longer exists in pandas >= 2.0.
    records = []
    for tweet in tweets:
        # The vocabularies are built from lower-cased tokens, so the tweet
        # is lower-cased too; otherwise capitalized words never match.
        tokens = tweet.lower().split()
        records.append({
            column: vocabulary.loc[vocabulary['word'].isin(tokens), 'score'].values.sum()
            for column, vocabulary in vocabularies
        })

    return pd.DataFrame(records, columns=[column for column, _ in vocabularies])

In [29]:
def score(tweets_df, id):
    """Average context scores of the user `id`.

    Returns the per-category mean of ``compute_score`` over the user's
    tweets (the result of dividing the column sums by the tweet count).
    When the user has no tweets, a one-row DataFrame of zeros with the
    same four score columns is returned instead.
    """
    user_tweets = tweets_df.loc[tweets_df.user_id == id, 'full_text']
    n_tweets = len(user_tweets)

    # No tweets: every category scores 0.
    if not n_tweets > 0:
        zero_scores = {
            name: 0
            for name in ('porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score')
        }
        return pd.DataFrame(zero_scores, index=[0])

    # Column totals over all tweets, turned into per-tweet averages.
    totals = compute_score(user_tweets).sum()
    return np.divide(totals, n_tweets)

In [30]:
def context_score(bot_ids, bot_type):
    """Compute the context scores of every user and store them as a CSV.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), calls ``score``
    for each id in `bot_ids` and writes ``data/<bot_type>/context_score.csv``
    with the four score columns plus ``user_id``.
    """
    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    columns = ['porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score', 'user_id']

    # Gather one plain dict per user and build the frame once at the end:
    # DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
    records = []
    for i, user_id in enumerate(bot_ids, start=1):
        scores = score(tweets_df, user_id)
        # score() may hand back a one-row DataFrame; reduce it to that row.
        if isinstance(scores, pd.DataFrame):
            scores = scores.iloc[0]
        record = scores.to_dict()
        # The id is stored in the dict, not inserted into the float scores,
        # so large integer ids are not converted to float.
        record['user_id'] = user_id
        records.append(record)
        if (i%10 == 0):
            clear_output()
            print(str(i) +  " " + bot_type + " bots processed")

    score_df = pd.DataFrame(records, columns=columns)
    score_df.to_csv('data/' + bot_type + '/context_score.csv')

    print(bot_type + " done!")

In [31]:
context_score(porn_ids, 'porn')

6910 porn bots processed
porn done!


In [32]:
context_score(prop_ids, 'propaganda')

3300 propaganda bots processed
propaganda done!


In [33]:
context_score(spam_ids, 'spam')

4940 spam bots processed
spam done!


In [34]:
context_score(fake_ids, 'fake_followers')

8820 fake_followers bots processed
fake_followers done!


# Descriptive features

In [6]:
def describe_tweets(tweets):
    """Summarize one user's tweets as a one-row feature DataFrame.

    Combines the shares returned by ``tweet_perc``, the average / maximum /
    minimum values returned by ``tweet_desc`` and the frequency returned by
    ``tweet_freq``.

    Returns
    -------
    DataFrame with index ``[0]`` and the columns avg_len, max_len, min_len,
    avg_ret, max_ret, min_ret, avg_fav, max_fav, min_fav, freq, ret_perc,
    media_perc, url_perc, quote_perc.
    """
    ret_perc, media_perc, url_perc, quote_perc = tweet_perc(tweets)

    avg_len, avg_ret, avg_fav = tweet_desc(tweets, 'avg')
    max_len, max_ret, max_fav = tweet_desc(tweets, 'max')
    min_len, min_ret, min_fav = tweet_desc(tweets, 'min')

    freq = tweet_freq(tweets)

    # The features go straight into the frame; the intermediate numpy array
    # that used to be built here was never read.
    desc_features = pd.DataFrame({'avg_len': avg_len, 'max_len': max_len, 'min_len': min_len, 'avg_ret': avg_ret, 'max_ret': max_ret, 'min_ret': min_ret, 'avg_fav': avg_fav, 'max_fav': max_fav, 'min_fav': min_fav, 'freq': freq, 'ret_perc': ret_perc, 'media_perc': media_perc, 'url_perc': url_perc, 'quote_perc': quote_perc}, index=[0])

    return desc_features

In [40]:
def tweet_perc(tweets):
    """Return the shares of retweets, media tweets, URL tweets and quotes.

    The first three values are the fraction of rows whose
    ``retweeted_status``, ``extended_entities`` and ``url`` cell is not
    null; the last one is the sum of ``is_quote_status`` divided by the
    number of rows.

    Returns
    -------
    tuple ``(ret_perc, media_perc, url_perc, quote_perc)``
    """
    total = len(tweets)

    def share_present(column):
        # Fraction of rows in which `column` holds a non-null value.
        return column.notnull().sum()/total

    return (
        share_present(tweets.retweeted_status),
        share_present(tweets.extended_entities),
        share_present(tweets.url),
        tweets.is_quote_status.sum()/total,
    )

In [9]:
def tweet_desc(tweets, metric):
    """Aggregate tweet length, retweet count and favorite count.

    Parameters
    ----------
    tweets : DataFrame with ``full_text``, ``retweet_count`` and
        ``favorite_count`` columns.
    metric : ``'avg'``, ``'max'`` or ``'min'``.

    Returns
    -------
    tuple ``(length, retweets, favorites)`` aggregated with the chosen metric.

    Raises
    ------
    ValueError
        If `metric` is not one of the supported names.
    """
    reducers = {'avg': np.mean, 'max': max, 'min': min}
    if metric not in reducers:
        # Fail with a clear message instead of reaching the return
        # statement with unassigned locals.
        raise ValueError("Unknown metric: " + repr(metric) + " (expected 'avg', 'max' or 'min')")
    reduce_ = reducers[metric]

    tweet_lengths = tweets['full_text'].apply(len)

    return reduce_(tweet_lengths), reduce_(tweets.retweet_count), reduce_(tweets.favorite_count)

In [38]:
def _created_at_date(created_at):
    """Parse the calendar date out of a ``created_at`` string."""
    # Format example: 'Fri Dec 05 02:19:38 +0000 2014' - the day is at
    # [8:10], the month abbreviation at [4:7] and the year is the last four
    # characters. The time of day is deliberately ignored.
    day, month, year = created_at[8:10], created_at[4:7], created_at[-4:]
    return datetime.strptime(day + ' ' + month + ' ' + year, '%d %b %Y')


def tweet_freq(tweets):
    """Return the number of tweets per day over the covered period.

    The period runs from the date of the first row of ``tweets.created_at``
    to the date of the last row, both days included.

    Parameters
    ----------
    tweets : non-empty DataFrame with a ``created_at`` column of strings.

    Returns
    -------
    float : ``len(tweets)`` divided by the number of days in the period.
    """
    dates = list(tweets.created_at)

    start = _created_at_date(dates[0])
    end = _created_at_date(dates[-1])

    # abs() makes the result independent of the row order: with
    # oldest-first rows the raw difference is negative, which produced a
    # negative frequency or a division by zero.
    delta = abs((start - end).days) + 1
    freq = len(tweets)/delta

    return freq

In [11]:
def describe(tweets_df, id):
    """Return the descriptive features of the user `id`.

    The rows of `tweets_df` whose ``user_id`` equals `id` are passed to
    ``describe_tweets``. When the user has no rows, a one-row DataFrame
    with every feature set to 0 is returned instead.
    """
    user_tweets = tweets_df[tweets_df.user_id == id]

    if len(user_tweets) > 0:
        return describe_tweets(user_tweets)

    # No tweets for this user: all features default to 0.
    feature_names = (
        'avg_len', 'max_len', 'min_len',
        'avg_ret', 'max_ret', 'min_ret',
        'avg_fav', 'max_fav', 'min_fav',
        'freq', 'ret_perc', 'media_perc', 'url_perc', 'quote_perc',
    )
    return pd.DataFrame({name: 0 for name in feature_names}, index=[0])

In [12]:
def descriptive_features(bot_ids, bot_type):
    """Compute the descriptive features of every user and store them as a CSV.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), calls ``describe``
    for each id in `bot_ids`, tags the result with ``user_id`` and writes
    ``data/<bot_type>/descriptive_features.csv``.
    """
    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    columns = ['avg_len', 'max_len', 'min_len', 'avg_ret', 'max_ret', 'min_ret', 'avg_fav', 'max_fav', 'min_fav', 'freq', 'ret_perc', 'media_perc', 'url_perc', 'quote_perc']

    # Collect the one-row frames and concatenate once: DataFrame.append in a
    # loop is quadratic and was removed in pandas 2.0.
    frames = []
    for i, user_id in enumerate(bot_ids, start=1):
        features = describe(tweets_df, user_id)
        features['user_id'] = user_id
        frames.append(features)
        if (i%10 == 0):
            clear_output()
            print(str(i) +  " " + bot_type + " bots processed")

    if frames:
        desc_df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat rejects an empty list; keep the header-only frame.
        desc_df = pd.DataFrame(columns=columns)
    desc_df.to_csv('data/' + bot_type + '/descriptive_features.csv')

    print(bot_type + " done!")

In [53]:
# NOTE(review): the recorded output of this cell is an AttributeError
# ('DataFrame' object has no attribute 'retweeted_status'), so no
# descriptive features were produced for fake_followers in this run.
descriptive_features(fake_ids, 'fake_followers')

  if self.run_code(code, result):


AttributeError: 'DataFrame' object has no attribute 'retweeted_status'

In [44]:
desc = pd.read_csv('data/propaganda/descriptive_features.csv')

In [50]:
desc.drop('Unnamed: 0', inplace=True, axis=1)

In [51]:
desc.describe()

Unnamed: 0,avg_len,avg_ret,freq,max_len,max_ret,media_perc,min_len,min_ret,ret_perc,url_perc,user_id
count,3301.0,3301.0,3301.0,3301.0,3301.0,3301.0,3301.0,3301.0,3301.0,3301.0,3301.0
mean,126.399118,3087.114962,36.373379,189.455922,39965.5,0.103343,39.523478,10.879128,0.86217,0.233608,3.267065e+17
std,17.37748,3738.590667,29.886725,89.970273,91091.44,0.115992,14.340845,150.004112,0.265045,0.203501,4.099872e+17
min,23.0,0.0,0.012635,23.0,0.0,0.0,1.0,0.0,0.0,0.0,755113.0
25%,122.15,1059.8,12.375,144.0,18069.0,0.04,30.0,0.0,0.89,0.12,512625200.0
50%,127.56,2193.050505,33.333333,148.0,29751.0,0.08,40.0,0.0,0.989899,0.171717,3035315000.0
75%,131.57,4042.33,50.0,209.0,35421.0,0.13,46.0,1.0,1.0,0.25,8.003112e+17
max,373.39,113761.066667,100.0,977.0,3369452.0,1.0,235.0,6053.0,1.0,1.0,9.852123e+17


In [52]:
desc.head()

Unnamed: 0,avg_len,avg_ret,freq,max_len,max_ret,media_perc,min_len,min_ret,ret_perc,url_perc,user_id
0,135.01,1268.85,50.0,147,17340,0.05,61,1,1.0,0.13,8.010638e+17
1,126.93,2402.32,100.0,307,36501,0.06,42,0,0.92,0.15,7.204254e+17
2,161.79,1703.37,10.0,316,28404,0.05,30,0,0.47,0.57,9.22837e+17
3,124.84,201.85,50.0,144,4185,0.09,51,0,0.99,0.11,8.182995e+17
4,120.89899,2675.30303,8.25,185,29984,0.090909,23,0,0.757576,0.080808,8.377794e+17


In [4]:
tweets_df = pd.read_csv('data/spam/tweets.csv', encoding='utf-8-sig', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
tweets_df.loc[0]

Unnamed: 0                                                                   0
contributors                                                               NaN
coordinates                                                                NaN
created_at                                      Fri Dec 05 02:19:38 +0000 2014
display_text_range                                                     [0, 55]
extended_entities                                                          NaN
favorite_count                                                               2
favorited                                                                False
full_text                    Read the History of David Gates  http://t.co/Y...
geo                                                                        NaN
id                                                          540691938639630336
id_str                                                      540691938639630336
in_reply_to_screen_name                             