In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from scipy.spatial.distance import euclidean
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import clear_output
from datetime import datetime
from urllib.parse import urlparse

In [26]:
# Tab-separated tweet dump of the bot accounts in the bot-or-not dataset.
tweets_df = pd.read_csv(
    'data/bot_or_not/bot/tweets.csv',
    sep='\t',
    encoding='utf-8-sig',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# User tables of the bot-or-not dataset (one row per account).
bot_users = pd.read_csv(
    'data/bot_or_not/bot_users.csv',
    encoding='utf-8-sig',
)
gen_users = pd.read_csv(
    'data/bot_or_not/gen_users.csv',
    encoding='utf-8-sig',
)

In [8]:
# Account-id columns of the two bot-or-not user tables.
bot_ids, gen_ids = bot_users['id'], gen_users['id']

In [2]:
# Target label of each account category:
#    0: porn
#    1: propaganda
#    2: spam
#    3: fake followers
#    4: genuine accounts

# One users table per category.
porn_users = pd.read_csv(
    'data/porn/users.csv', encoding='utf-8-sig')
prop_users = pd.read_csv(
    'data/propaganda/users.csv', encoding='utf-8-sig')
spam_users = pd.read_csv(
    'data/spam/users.csv', encoding='utf-8-sig')
fake_users = pd.read_csv(
    'data/fake_followers/users.csv', encoding='utf-8-sig')
genuine_users = pd.read_csv(
    'data/genuine/users.csv', encoding='utf-8-sig')

# Account ids of every category.
porn_ids, prop_ids, spam_ids, fake_ids = (
    porn_users['id'],
    prop_users['id'],
    spam_users['id'],
    fake_users['id'],
)
# NOTE(review): gen_ids is also assigned from gen_users['id'] in the bot-or-not
# cell, so this line rebinds it to the genuine-category ids.
gen_ids = genuine_users['id']

# TF-IDF

In [3]:
def compute_centroid(tf_idf):
    """Return the centroid (mean row) of a document-term matrix.

    Parameters
    ----------
    tf_idf : numpy array or matrix of shape (n_documents, n_terms)
        One row per document.

    Returns
    -------
    The column-wise sum of ``tf_idf`` divided by the number of rows, i.e. one
    value per term.
    """
    # Sum over the documents (axis 0). Summing over axis 1 would add up the
    # terms of each document and yield one value per document instead of a
    # point in term space.
    center = tf_idf.sum(axis=0)/tf_idf.shape[0]
    return center

In [4]:
def dist_from_centroid(tf_idf, centroid):
    """Return the Euclidean distance of every row of ``tf_idf`` from ``centroid``.

    Parameters
    ----------
    tf_idf : iterable of row vectors (e.g. a 2-D numpy array or matrix)
    centroid : vector that can be subtracted from a single row

    Returns
    -------
    list
        One ``np.linalg.norm(row - centroid)`` value per row, in row order.
    """
    # Measure each row on its own; subtracting the centroid from the whole
    # matrix would produce the same number for every row.
    return [np.linalg.norm(row - centroid) for row in tf_idf]

In [40]:
def wss(id, tweets_df, is_tweet = 1):
    """Average distance of one user's documents from their TF-IDF centroid.

    Parameters
    ----------
    id : value convertible with ``int()``
        User whose rows are selected from ``tweets_df``.
    tweets_df : pandas.DataFrame
        Must provide ``user_id`` plus ``full_text`` (``is_tweet == 1``) or
        ``url`` (``is_tweet == 0``).
    is_tweet : {1, 0}
        1 vectorizes the tweet texts, 0 vectorizes the network location
        (domain) of each tweet's URL.

    Returns
    -------
    Sum of the per-document distances divided by the number of documents.

    Raises
    ------
    ValueError
        If ``is_tweet`` is neither 0 nor 1. ``TfidfVectorizer`` errors (for
        example an empty vocabulary) propagate to the caller.
    """
    # Fail with a clear error instead of printing and then crashing on an
    # unbound local further down.
    if is_tweet not in (0, 1):
        raise ValueError('Invalid Input')

    user_rows = tweets_df[tweets_df.user_id.astype(int) == int(id)]
    if is_tweet == 1:
        # get tweets per id
        vector = user_rows['full_text']
    else:
        # get domains per id; map() builds a new Series, so nothing is written
        # back into a slice of tweets_df
        vector = user_rows['url'].fillna('').astype(str).map(
            lambda link: urlparse(link).netloc)
    n_vectors = len(vector)

    transformer = TfidfVectorizer(smooth_idf=True)
    tf_idf = transformer.fit_transform(vector).todense()

    centroid = compute_centroid(tf_idf)
    distances = dist_from_centroid(tf_idf, centroid)
    avg_dist = np.asarray(distances).sum()/n_vectors

    return avg_dist

In [6]:
def intradistance(bot_ids, bot_type, is_tweet=1):
    """Compute ``wss`` for every id and store the results as a CSV.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), calls ``wss`` once per
    id in ``bot_ids`` and writes ``tweet_intradistance.csv`` (``is_tweet == 1``)
    or ``url_intradistance.csv`` (``is_tweet == 0``) into ``data/<bot_type>/``.
    A user whose ``wss`` call raises gets the value 0 and the error is printed.
    """
    folder = 'data/' + bot_type
    tweets_df = pd.read_csv(folder + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    distances = []
    for processed, user in enumerate(bot_ids, start=1):
        try:
            user_distance = wss(user, tweets_df, is_tweet)
        except Exception as e:
            print(e)
            user_distance = 0
        distances.append(user_distance)

        # refresh the progress line every ten users
        if processed % 10 == 0:
            clear_output()
            print(str(processed) +  " " + bot_type + " bots processed")

    dist = pd.DataFrame()
    dist['user_id'] = bot_ids.values
    if is_tweet == 1:
        dist['tweet_intradistance'] = distances
        dist.to_csv(folder + '/tweet_intradistance.csv', index=False)
    elif is_tweet == 0:
        dist['url_intradistance'] = distances
        dist.to_csv(folder + '/url_intradistance.csv', index=False)
    else:
        print ('Invalid Input')

    print(bot_type + " done!")

In [41]:
# Tweet-text intradistance for the bot-or-not bot accounts.
intradistance(bot_ids, bot_type='bot_or_not/bot', is_tweet=1)

5290 bot_or_not/bot bots processed
bot_or_not/bot done!


In [43]:
# URL-domain intradistance for the bot-or-not bot accounts.
intradistance(bot_ids, bot_type='bot_or_not/bot', is_tweet=0)

5290 bot_or_not/bot bots processed
empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words
empty vocabulary; perhaps the documents only contain stop words
bot_or_not/bot done!


In [44]:
# Pass the bot-or-not genuine ids explicitly: gen_ids is also rebound to
# genuine_users['id'] in another cell, so its content depends on run order.
intradistance(gen_users['id'], 'bot_or_not/gen', is_tweet=1)

5160 bot_or_not/gen bots processed
bot_or_not/gen done!


In [45]:
# Pass the bot-or-not genuine ids explicitly: gen_ids is also rebound to
# genuine_users['id'] in another cell, so its content depends on run order.
intradistance(gen_users['id'], 'bot_or_not/gen', is_tweet=0)

5160 bot_or_not/gen bots processed
bot_or_not/gen done!


In [None]:
# Tweet-text intradistance for the spam accounts.
intradistance(spam_ids, bot_type='spam', is_tweet=1)

In [None]:
# URL-domain intradistance for the spam accounts.
intradistance(spam_ids, bot_type='spam', is_tweet=0)

In [None]:
# Tweet-text intradistance for the fake-follower accounts.
intradistance(fake_ids, bot_type='fake_followers', is_tweet=1)

In [12]:
# URL-domain intradistance for the fake-follower accounts.
intradistance(fake_ids, bot_type='fake_followers', is_tweet=0)

9050 fake_followers bots processed
empty vocabulary; perhaps the documents only contain stop words
fake_followers done!


In [13]:
# Tweet-text intradistance for the propaganda accounts.
intradistance(prop_ids, bot_type='propaganda', is_tweet=1)

3370 propaganda bots processed
propaganda done!


In [14]:
# URL-domain intradistance for the propaganda accounts.
intradistance(prop_ids, bot_type='propaganda', is_tweet=0)

3370 propaganda bots processed
propaganda done!


In [15]:
# Pass the genuine-category ids explicitly: gen_ids is also bound to
# gen_users['id'] in another cell, so its content depends on run order.
intradistance(genuine_users['id'], 'genuine', is_tweet=1)

3660 genuine bots processed
empty vocabulary; perhaps the documents only contain stop words
genuine done!


In [16]:
# Pass the genuine-category ids explicitly: gen_ids is also bound to
# gen_users['id'] in another cell, so its content depends on run order.
intradistance(genuine_users['id'], 'genuine', is_tweet=0)

3660 genuine bots processed
empty vocabulary; perhaps the documents only contain stop words
genuine done!


# Context Score

In [14]:
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, normalize

# Tab-separated tweet dumps of the two bot-or-not groups.
bot_tweets_df = pd.read_csv(
    'data/bot_or_not/bot/tweets.csv',
    sep='\t',
    encoding='utf-8-sig',
)
gen_tweets_df = pd.read_csv(
    'data/bot_or_not/gen/tweets.csv',
    sep='\t',
    encoding='utf-8-sig',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [54]:
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, normalize

# Tab-separated tweet dumps, one per account category.
porn_tweets_df = pd.read_csv(
    'data/porn/tweets.csv', sep='\t', encoding='utf-8-sig')
prop_tweets_df = pd.read_csv(
    'data/propaganda/tweets.csv', sep='\t', encoding='utf-8-sig')
spam_tweets_df = pd.read_csv(
    'data/spam/tweets.csv', sep='\t', encoding='utf-8-sig')
fake_tweets_df = pd.read_csv(
    'data/fake_followers/tweets.csv', sep='\t', encoding='utf-8-sig')
genuine_tweets_df = pd.read_csv(
    'data/genuine/tweets.csv', sep='\t', encoding='utf-8-sig')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
from nltk.corpus import stopwords
import string
def get_main_words(tweets_df, n_words=300):
    """Return the most frequent non-stop-word tokens of a tweet collection.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Must provide a ``full_text`` column; values are converted to ``str``.
    n_words : int, default 300
        Number of most common tokens requested from the counter.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``score`` (the token count), most frequent first.
        Tokens are lower-cased and split on whitespace only.
    """
    tweets = tweets_df['full_text'].values.astype('str')

    # NLTK stop words of four languages, single punctuation characters and a
    # few hand-picked tokens
    stop_words = (
        stopwords.words('english')
        + stopwords.words('italian')
        + stopwords.words('french')
        + stopwords.words('spanish')
        + list(string.punctuation)
        + ['...', '"the', "i'm", 'go', 'time', 'get', 'rt', 'via', '&amp;', "it's"]
    )

    word_counter = Counter()
    for text in tweets:
        word_counter.update(text.lower().split())

    # drop the stop words before ranking so they do not take up top slots
    for word in stop_words:
        word_counter.pop(word, None)

    main_words = pd.DataFrame(data=word_counter.most_common(n_words), index=None, columns=['word', 'score'])

    # NOTE(review): the slice keeps n_words - 1 rows, exactly as the original
    # notebook did; confirm whether dropping the last word is intended.
    return main_words[:n_words-1]

In [56]:
def unique_vocabularies():
    """Filter each category vocabulary against the other categories' top words.

    For porn, propaganda, spam, fake followers and genuine accounts this reads
    ``main_words.csv`` from the category folder, removes every word that is
    among the first 50 words of any *other* category, min-max scales the
    remaining scores, drops the ``Unnamed: 0`` column and the last row, and
    writes ``filtered_main_words.csv`` into the same folder. The number of
    remaining words per category is printed before normalization.
    """
    # (label used in the printed summary, dataset folder)
    categories = [
        ('porn', 'data/porn'),
        ('prop', 'data/propaganda'),
        ('spam', 'data/spam'),
        ('fake', 'data/fake_followers'),
        ('genuine', 'data/genuine'),
    ]

    def _filter_and_normalize(voc, keep_words):
        # Keep only keep_words, rescale the scores to [0, 1], then drop the
        # stale index column and the last row.
        # .copy() makes the result independent of voc, so the column assignment
        # below does not write into a slice.
        kept = voc[voc['word'].isin(list(keep_words))].copy()
        kept['score'] = MinMaxScaler().fit_transform(kept.score.values.reshape(-1, 1)).ravel()
        kept = kept.drop(columns='Unnamed: 0')
        kept = kept.drop(kept.tail(1).index)
        return kept.reset_index(drop=True)

    vocabularies = {}
    all_words = {}
    top_words = {}
    for label, folder in categories:
        voc = pd.read_csv(folder + '/main_words.csv', sep=',')
        vocabularies[label] = voc
        all_words[label] = set(voc['word'])
        top_words[label] = set(voc['word'][:50])

    # words of a category that are not a top-50 word of any other category
    unique_words = {}
    for label, _ in categories:
        competitors = set()
        for other, _ in categories:
            if other != label:
                competitors |= top_words[other]
        unique_words[label] = all_words[label] - competitors

    for label, _ in categories:
        print(str(len(unique_words[label])) + ' ' + label + ' words')

    # normalize every vocabulary before writing anything, so a failure leaves
    # no partially updated set of files
    filtered = {}
    for label, _ in categories:
        filtered[label] = _filter_and_normalize(vocabularies[label], unique_words[label])

    for label, folder in categories:
        filtered[label].to_csv(folder + '/filtered_main_words.csv', encoding='utf-8-sig')

In [16]:
def unique_bot_not_vocabularies():
    """Filter the bot and genuine vocabularies against each other's top words.

    Reads ``main_words.csv`` from ``data/bot_or_not/bot`` and
    ``data/bot_or_not/gen``, removes from each vocabulary the first 50 words of
    the other one, min-max scales the remaining scores, drops the
    ``Unnamed: 0`` column and the last row, and writes
    ``filtered_main_words.csv`` into the same folder. The number of remaining
    words per group is printed before normalization.
    """
    # (label used in the printed summary, dataset folder)
    groups = [
        ('bot', 'data/bot_or_not/bot'),
        ('gen', 'data/bot_or_not/gen'),
    ]

    def _filter_and_normalize(voc, keep_words):
        # Keep only keep_words, rescale the scores to [0, 1], then drop the
        # stale index column and the last row.
        # .copy() makes the result independent of voc, so the column assignment
        # below does not write into a slice.
        kept = voc[voc['word'].isin(list(keep_words))].copy()
        kept['score'] = MinMaxScaler().fit_transform(kept.score.values.reshape(-1, 1)).ravel()
        kept = kept.drop(columns='Unnamed: 0')
        kept = kept.drop(kept.tail(1).index)
        return kept.reset_index(drop=True)

    vocabularies = {}
    all_words = {}
    top_words = {}
    for label, folder in groups:
        voc = pd.read_csv(folder + '/main_words.csv', sep=',')
        vocabularies[label] = voc
        all_words[label] = set(voc['word'])
        top_words[label] = set(voc['word'][:50])

    # words of a group that are not a top-50 word of the other group
    unique_words = {}
    for label, _ in groups:
        competitors = set()
        for other, _ in groups:
            if other != label:
                competitors |= top_words[other]
        unique_words[label] = all_words[label] - competitors

    for label, _ in groups:
        print(str(len(unique_words[label])) + ' ' + label + ' words')

    # normalize both vocabularies before writing anything
    filtered = {}
    for label, _ in groups:
        filtered[label] = _filter_and_normalize(vocabularies[label], unique_words[label])

    for label, folder in groups:
        filtered[label].to_csv(folder + '/filtered_main_words.csv', encoding='utf-8-sig')

In [18]:
# Rank the words of each bot-or-not group and store the ranking next to its tweets.
bot_vocabulary = get_main_words(bot_tweets_df)
bot_vocabulary.to_csv(
    'data/bot_or_not/bot/main_words.csv',
    encoding='utf-8-sig',
)
gen_vocabulary = get_main_words(gen_tweets_df)
gen_vocabulary.to_csv(
    'data/bot_or_not/gen/main_words.csv',
    encoding='utf-8-sig',
)

In [57]:
# Rank the words of each account category and store the ranking in its folder.
porn_vocabulary = get_main_words(porn_tweets_df)
porn_vocabulary.to_csv(
    'data/porn/main_words.csv', encoding='utf-8-sig')

prop_vocabulary = get_main_words(prop_tweets_df)
prop_vocabulary.to_csv(
    'data/propaganda/main_words.csv', encoding='utf-8-sig')

spam_vocabulary = get_main_words(spam_tweets_df)
spam_vocabulary.to_csv(
    'data/spam/main_words.csv', encoding='utf-8-sig')

fake_vocabulary = get_main_words(fake_tweets_df)
fake_vocabulary.to_csv(
    'data/fake_followers/main_words.csv', encoding='utf-8-sig')

genuine_vocabulary = get_main_words(genuine_tweets_df)
genuine_vocabulary.to_csv(
    'data/genuine/main_words.csv', encoding='utf-8-sig')

In [19]:
# Filter the two bot-or-not vocabularies against each other; this reads the
# main_words.csv files and writes filtered_main_words.csv for both groups.
unique_bot_not_vocabularies()

251 bot words
252 gen words




### Compute Context Score

In [20]:
# Filtered vocabularies read by compute_bot_not_score as module-level globals.
bot_words = pd.read_csv(
    'data/bot_or_not/bot/filtered_main_words.csv',
    sep=',',
)
gen_words = pd.read_csv(
    'data/bot_or_not/gen/filtered_main_words.csv',
    sep=',',
)

In [59]:
# Filtered vocabularies read by compute_score as module-level globals.
porn_words = pd.read_csv(
    'data/porn/filtered_main_words.csv', sep=',')
prop_words = pd.read_csv(
    'data/propaganda/filtered_main_words.csv', sep=',')
spam_words = pd.read_csv(
    'data/spam/filtered_main_words.csv', sep=',')
fake_words = pd.read_csv(
    'data/fake_followers/filtered_main_words.csv', sep=',')
genuine_words = pd.read_csv(
    'data/genuine/filtered_main_words.csv', sep=',')

In [21]:
def compute_bot_not_score(tweets):
    """Score every tweet against the bot and genuine vocabularies.

    Parameters
    ----------
    tweets : iterable of str
        Tweet texts; each one is split on whitespace.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with columns ``bot_words_score`` and
        ``gen_words_score``: the summed ``score`` of the vocabulary words that
        occur among the tweet's tokens. Reads the module-level ``bot_words``
        and ``gen_words`` frames.
    """
    vocabularies = [
        ('bot_words_score', bot_words),
        ('gen_words_score', gen_words),
    ]

    # Collect plain records and build the frame once; growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.
    records = []
    for tweet in tweets:
        # NOTE(review): tokens keep their original case while get_main_words
        # lower-cases the vocabulary - confirm whether that is intended.
        tokens = tweet.split()
        records.append({
            column: voc.loc[np.isin(voc.word, tokens), 'score'].values.sum()
            for column, voc in vocabularies
        })

    return pd.DataFrame(records, columns=[column for column, _ in vocabularies])

In [60]:
def compute_score(tweets):
    """Score every tweet against the five category vocabularies.

    Parameters
    ----------
    tweets : iterable of str
        Tweet texts; each one is split on whitespace.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with columns ``porn_words_score``,
        ``prop_words_score``, ``spam_words_score``, ``fake_words_score`` and
        ``genuine_words_score``: the summed ``score`` of the vocabulary words
        that occur among the tweet's tokens. Reads the module-level
        ``*_words`` frames.
    """
    vocabularies = [
        ('porn_words_score', porn_words),
        ('prop_words_score', prop_words),
        ('spam_words_score', spam_words),
        ('fake_words_score', fake_words),
        ('genuine_words_score', genuine_words),
    ]

    # Collect plain records and build the frame once; growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.
    records = []
    for tweet in tweets:
        # NOTE(review): tokens keep their original case while get_main_words
        # lower-cases the vocabulary - confirm whether that is intended.
        tokens = tweet.split()
        records.append({
            column: voc.loc[np.isin(voc.word, tokens), 'score'].values.sum()
            for column, voc in vocabularies
        })

    # All five columns are declared up front, so an empty input still yields
    # the genuine_words_score column.
    return pd.DataFrame(records, columns=[column for column, _ in vocabularies])

In [42]:
def bot_not_score(tweets_df, id):
    """Average bot / genuine vocabulary score over one user's tweets.

    Selects the ``full_text`` of the rows whose ``user_id`` (cast to int)
    equals ``int(id)``. With at least one tweet the per-tweet scores from
    ``compute_bot_not_score`` are summed and divided by the tweet count;
    without tweets a one-row DataFrame of zeros is returned.
    """
    owner_mask = tweets_df.user_id.astype(int) == int(id)
    tweets = tweets_df[owner_mask]['full_text']
    n_tweets = len(tweets)

    # no tweets: nothing to average
    if not n_tweets > 0:
        return pd.DataFrame({'bot_words_score': 0, 'gen_words_score': 0}, index=[0])

    # column totals divided by the number of tweets
    totals = compute_bot_not_score(tweets).sum()
    return np.divide(totals, n_tweets)

In [61]:
def score(tweets_df, id):
    """Average category vocabulary scores over one user's tweets.

    Selects the ``full_text`` of the rows whose ``user_id`` equals ``id``
    (compared as-is, without casting). With at least one tweet the per-tweet
    scores from ``compute_score`` are summed and divided by the tweet count;
    without tweets a one-row DataFrame of zeros is returned.
    """
    tweets = tweets_df[tweets_df.user_id == id]['full_text']
    n_tweets = len(tweets)

    # no tweets: nothing to average
    if not n_tweets > 0:
        return pd.DataFrame({'porn_words_score': 0, 'prop_words_score': 0, 'spam_words_score': 0, 'fake_words_score': 0, 'genuine_words_score': 0}, index=[0])

    # column totals divided by the number of tweets
    totals = compute_score(tweets).sum()
    return np.divide(totals, n_tweets)

In [23]:
def context_bot_not_score(bot_ids, bot_type):
    """Compute the bot / genuine context score of every user and save it.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), calls
    ``bot_not_score`` once per id in ``bot_ids`` and writes
    ``data/<bot_type>/context_score.csv`` with the columns
    ``bot_words_score``, ``gen_words_score`` and ``user_id``.

    Parameters
    ----------
    bot_ids : pandas.Series
        User ids; ``bot_ids.values`` becomes the ``user_id`` column.
    bot_type : str
        Dataset folder below ``data/``.
    """
    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    columns = ['bot_words_score', 'gen_words_score', 'user_id']

    # Collect one single-row frame per user and concatenate once; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.
    rows = []
    for processed, user_id in enumerate(bot_ids, start=1):
        scores = bot_not_score(tweets_df, user_id)
        # bot_not_score yields a Series for users with tweets and a one-row
        # DataFrame otherwise; bring both to the one-row frame shape
        if isinstance(scores, pd.Series):
            scores = scores.to_frame().T
        rows.append(scores)
        if processed % 10 == 0:
            clear_output()
            print(str(processed) +  " " + bot_type + " bots processed")

    score_df = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()
    # fix the column order (and create the columns when there were no users)
    score_df = score_df.reindex(columns=columns)
    score_df['user_id'] = bot_ids.values

    score_df.to_csv('data/' + bot_type + '/context_score.csv', index=False)

    print(bot_type + " done!")

In [64]:
def context_score(bot_ids, bot_type):
    """Compute the category context score of every user and save it.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), calls ``score`` once
    per id in ``bot_ids`` and writes ``data/<bot_type>/context_score.csv`` with
    the five ``*_words_score`` columns followed by ``user_id``.

    Parameters
    ----------
    bot_ids : pandas.Series
        User ids; ``bot_ids.values`` becomes the ``user_id`` column.
    bot_type : str
        Dataset folder below ``data/``.
    """
    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    columns = ['porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score', 'genuine_words_score', 'user_id']

    # Collect one single-row frame per user and concatenate once; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.
    rows = []
    for processed, user_id in enumerate(bot_ids, start=1):
        scores = score(tweets_df, user_id)
        # score yields a Series for users with tweets and a one-row DataFrame
        # otherwise; bring both to the one-row frame shape
        if isinstance(scores, pd.Series):
            scores = scores.to_frame().T
        rows.append(scores)
        if processed % 10 == 0:
            clear_output()
            print(str(processed) +  " " + bot_type + " bots processed")

    score_df = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()
    # fix the column order (and create the columns when there were no users)
    score_df = score_df.reindex(columns=columns)
    score_df['user_id'] = bot_ids.values

    score_df.to_csv('data/' + bot_type + '/context_score.csv', index=False)

    print(bot_type + " done!")

In [None]:
# Pass the bot-or-not genuine ids explicitly: gen_ids is also rebound to
# genuine_users['id'] in another cell, so its content depends on run order.
context_bot_not_score(gen_users['id'], 'bot_or_not/gen')

In [25]:
# Context scores for the bot-or-not bot accounts.
context_bot_not_score(bot_ids, bot_type='bot_or_not/bot')

  if self.run_code(code, result):
  result = method(y)


TypeError: invalid type comparison

In [67]:
# Context scores for the propaganda accounts.
context_score(prop_ids, bot_type='propaganda')

3370 propaganda bots processed
propaganda done!


In [68]:
# Context scores for the spam accounts.
context_score(spam_ids, bot_type='spam')

5360 spam bots processed
spam done!


In [69]:
# Context scores for the fake-follower accounts.
context_score(fake_ids, bot_type='fake_followers')

9050 fake_followers bots processed
fake_followers done!


In [70]:
# Pass the genuine-category ids explicitly: gen_ids is also bound to
# gen_users['id'] in another cell, so its content depends on run order.
context_score(genuine_users['id'], 'genuine')

3660 genuine bots processed
genuine done!


# Descriptive features

In [46]:
def describe_tweets(tweets):
    """Collect the descriptive features of one user's tweets in a one-row frame.

    Parameters
    ----------
    tweets : pandas.DataFrame
        The rows of a single user; passed unchanged to ``tweet_perc``,
        ``tweet_desc`` and ``tweet_freq``.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with avg / max / min of tweet length, retweets,
        favorites and hashtag count, plus ``freq`` and the four ``*_perc``
        shares.
    """
    ret_perc, media_perc, url_perc, quote_perc = tweet_perc(tweets)

    avg_len, avg_ret, avg_fav, avg_hash = tweet_desc(tweets, 'avg')
    max_len, max_ret, max_fav, max_hash = tweet_desc(tweets, 'max')
    min_len, min_ret, min_fav, min_hash = tweet_desc(tweets, 'min')

    freq = tweet_freq(tweets)

    desc_features = pd.DataFrame({
        'avg_len': avg_len, 'max_len': max_len, 'min_len': min_len,
        'avg_ret': avg_ret, 'max_ret': max_ret, 'min_ret': min_ret,
        'avg_fav': avg_fav, 'max_fav': max_fav, 'min_fav': min_fav,
        # min_hash must carry the minimum, not a second copy of max_hash
        'avg_hash': avg_hash, 'max_hash': max_hash, 'min_hash': min_hash,
        'freq': freq,
        'ret_perc': ret_perc, 'media_perc': media_perc,
        'url_perc': url_perc, 'quote_perc': quote_perc,
    }, index=[0])

    return desc_features

In [47]:
def tweet_perc(tweets):
    """Return the shares of retweets, media tweets, URL tweets and quote tweets.

    The first three values are the fraction of rows whose ``retweeted_status``,
    ``extended_entities`` and ``url`` are not null; the last one is the sum of
    ``is_quote_status`` divided by the number of rows.
    """
    total = len(tweets)

    def _share_present(column):
        # fraction of rows with a non-null value in the given column
        return tweets[column].notnull().sum()/total

    return (
        _share_present('retweeted_status'),
        _share_present('extended_entities'),
        _share_present('url'),
        tweets['is_quote_status'].sum()/total,
    )

In [48]:
def hashtag_count(tweets):
    """Return, for every text in ``tweets``, how many ``#`` characters it contains."""
    return [text.count('#') for text in tweets]

In [49]:
def tweet_desc(tweets, metric):
    """Aggregate tweet length, retweets, favorites and hashtag count.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must provide ``full_text``, ``retweet_count`` and ``favorite_count``.
    metric : {'avg', 'max', 'min'}
        ``'avg'`` uses ``np.mean``; ``'max'`` / ``'min'`` use the builtins.

    Returns
    -------
    tuple
        ``(length, retweets, favorites, hashtags)`` aggregated with ``metric``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the three supported names.
    """
    aggregators = {'avg': np.mean, 'max': max, 'min': min}
    # Reject unknown metrics explicitly instead of failing later on locals
    # that were never assigned.
    if metric not in aggregators:
        raise ValueError("metric must be 'avg', 'max' or 'min', got " + repr(metric))
    aggregate = aggregators[metric]

    tweet_lengths = tweets['full_text'].apply(len)

    length = aggregate(tweet_lengths)
    ret = aggregate(tweets.retweet_count)
    fav = aggregate(tweets.favorite_count)
    hashtag = aggregate(hashtag_count(tweets['full_text']))

    return length, ret, fav, hashtag

In [50]:
def tweet_freq(tweets):
    """Return the number of tweets per day between the first and last row.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must provide a non-empty ``created_at`` column of strings with the day
        of month at ``[8:10]``, the month abbreviation at ``[4:7]`` and the
        year in the last four characters (presumably Twitter's
        ``'Wed Oct 10 20:19:24 +0000 2018'`` layout - confirm against the data).

    Returns
    -------
    float
        ``len(tweets)`` divided by the inclusive number of calendar days
        spanned by the first and the last row.
    """
    def _calendar_day(stamp):
        # rebuild 'DD Mon YYYY' from the fixed character positions
        return datetime.strptime(stamp[8:10] + ' ' + stamp[4:7] + ' ' + stamp[-4:], '%d %b %Y')

    dates = list(tweets.created_at)
    edge_a = _calendar_day(dates[0])
    edge_b = _calendar_day(dates[-1])

    # abs() makes the span independent of whether the rows are sorted newest
    # first or oldest first; a reversed order would otherwise give a negative
    # or zero day count.
    delta = abs((edge_a - edge_b).days) + 1
    freq = len(tweets)/delta

    return freq

In [95]:
def describe(tweets_df, id):
    """Return the descriptive features of one user, or zeros when unavailable.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Must provide a ``user_id`` column convertible to int.
    id : value convertible with ``int()``
        User whose rows are described.

    Returns
    -------
    pandas.DataFrame
        The one-row frame from ``describe_tweets`` when the user has tweets; a
        one-row frame of zeros when the user has none or when any ``Exception``
        is raised while selecting or describing the tweets.
    """
    feature_names = [
        'avg_len', 'max_len', 'min_len',
        'avg_ret', 'max_ret', 'min_ret',
        'avg_fav', 'max_fav', 'min_fav',
        'avg_hash', 'max_hash', 'min_hash',
        'freq', 'ret_perc', 'media_perc', 'url_perc', 'quote_perc',
    ]

    def _zero_features():
        # fallback row shared by the "no tweets" and the error path
        return pd.DataFrame({name: 0 for name in feature_names}, index=[0])

    try:
        tweets = tweets_df[tweets_df.user_id.astype(int) == int(id)]
        if len(tweets) > 0:
            features = describe_tweets(tweets)
        else:
            features = _zero_features()
    # Exception instead of a bare except: the best-effort fallback stays, but
    # KeyboardInterrupt / SystemExit can still stop a long run.
    except Exception:
        features = _zero_features()

    return features

In [53]:
def descriptive_features(bot_ids, bot_type):
    """Compute the descriptive features of every user and save them.

    Reads ``data/<bot_type>/tweets.csv`` (tab separated), calls ``describe``
    once per id in ``bot_ids`` and writes
    ``data/<bot_type>/descriptive_features.csv`` with the 17 feature columns
    followed by ``user_id``.

    Parameters
    ----------
    bot_ids : pandas.Series
        User ids; ``bot_ids.values`` becomes the ``user_id`` column.
    bot_type : str
        Dataset folder below ``data/``.
    """
    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')

    columns = ['avg_len', 'max_len', 'min_len', 'avg_ret', 'max_ret', 'min_ret', 'avg_fav', 'max_fav', 'min_fav', 'avg_hash', 'max_hash', 'min_hash', 'freq', 'ret_perc', 'media_perc', 'url_perc', 'quote_perc']

    # Collect the one-row frames and concatenate once; growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.
    frames = []
    for processed, user_id in enumerate(bot_ids, start=1):
        frames.append(describe(tweets_df, user_id))
        if processed % 10 == 0:
            clear_output()
            print(str(processed) +  " " + bot_type + " bots processed")

    desc_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    # fix the column order (and create the columns when there were no users)
    desc_df = desc_df.reindex(columns=columns)
    desc_df['user_id'] = bot_ids.values

    desc_df.to_csv('data/' + bot_type + '/descriptive_features.csv', index=False)

    print(bot_type + " done!")

In [56]:
# Tab-separated tweet dump of the bot accounts in the bot-or-not dataset.
tweets_df = pd.read_csv(
    'data/bot_or_not/bot/tweets.csv',
    sep='\t',
    encoding='utf-8-sig',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Debugging scan: try to convert every bot user id to int and print how many
# ids converted so far. When int() raises, the last printed count is the
# position of the offending row.
i=0
for elem in bot_users.id:
    int(elem)  # raises on the first id that is not int-convertible
    i+=1
    clear_output()
    print(i)

3575


In [93]:
# Show the row at position 3575 of bot_users (presumably the row the id scan
# stopped at - confirm).
bot_users.iloc[3575]

id                                                                             45372197
name                                                                            Etherea
screen_name                                                                 etherearock
statuses_count                                                                      140
followers_count                                                                     122
friends_count                                                                        30
favourites_count                                                                      0
listed_count                                                                          8
url                                                              http://t.co/MeKV7HVI8t
lang                                                                                 en
time_zone                                                                           NaN
location                        

In [91]:
# Remove the row at position 2596 from bot_users in place.
# NOTE(review): position 2596 differs from the 3575 inspected in the previous
# cell - confirm which row is meant. Re-running this cell drops a further row,
# and bot_ids is not affected if it was taken from bot_users before the drop.
bot_users.drop(bot_users.index[2596], inplace=True)

In [96]:
# Descriptive features for the bot-or-not bot accounts.
descriptive_features(bot_ids, bot_type='bot_or_not/bot')

5290 bot_or_not/bot bots processed
bot_or_not/bot done!


In [55]:
# Pass the bot-or-not genuine ids explicitly: gen_ids is also rebound to
# genuine_users['id'] in another cell, so its content depends on run order.
descriptive_features(gen_users['id'], 'bot_or_not/gen')

5160 bot_or_not/gen bots processed
bot_or_not/gen done!


In [79]:
# Descriptive features for the propaganda accounts.
descriptive_features(prop_ids, bot_type='propaganda')

3370 propaganda bots processed
propaganda done!


In [80]:
# Descriptive features for the spam accounts.
descriptive_features(spam_ids, bot_type='spam')

5360 spam bots processed
spam done!


In [81]:
# Descriptive features for the fake-follower accounts.
descriptive_features(fake_ids, bot_type='fake_followers')

9050 fake_followers bots processed
fake_followers done!


In [82]:
# Pass the genuine-category ids explicitly: gen_ids is also bound to
# gen_users['id'] in another cell, so its content depends on run order.
descriptive_features(genuine_users['id'], 'genuine')

3660 genuine bots processed
genuine done!


# NSFW Detection

In [3]:
import os, sys
import tensorflow as tf
import urllib.request
from ast import literal_eval

# Loads label file, strips off trailing whitespace (including the line break)
# from every line. label_lines is not used by nsfw() below, which reads the
# score at a fixed index.
label_lines = [line.rstrip() for line 
                   in tf.gfile.GFile("../scripts/NSFW-detection/retrained_labels.txt")]

# Unpersists graph from file: parse the serialized GraphDef and import it with
# an empty name prefix, so nsfw() can look tensors up by their plain names
# ('final_result:0', 'DecodeJpeg/contents:0').
with tf.gfile.FastGFile("../scripts/NSFW-detection/retrained_graph.pb", 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')


def nsfw(url):
    """Download an image and return the classifier score at index 1.

    Parameters
    ----------
    url : str
        Image location; it is downloaded to ``local-filename.jpg`` in the
        working directory, overwriting any previous download.

    Returns
    -------
    ``predictions[0][1]`` of the ``final_result:0`` tensor (presumably the
    NSFW probability - confirm against retrained_labels.txt), or 0 when the
    download or the prediction raises an ``Exception``.
    """
    try:
        urllib.request.urlretrieve(url, "local-filename.jpg")
        image_path = 'local-filename.jpg'
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

        # Read in the image_data; the with-block closes the file handle
        with tf.gfile.FastGFile(image_path, 'rb') as image_file:
            image_data = image_file.read()

        with tf.Session() as sess:
            # Feed the image_data as input to the graph and get first prediction
            softmax_tensor = sess.graph.get_tensor_by_name('final_result:0')
            predictions = sess.run(softmax_tensor, {'DecodeJpeg/contents:0': image_data})

            return predictions[0][1]
    # Best effort: an unreachable or undecodable image counts as score 0, but
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    except Exception:
        return 0

  from ._conv import register_converters as _register_converters


In [8]:
def nsfw_detection(bot_ids, bot_type):
    """Score the tweet media and the profile picture of every user with ``nsfw``.

    Reads ``users.csv`` and ``tweets.csv`` (tab separated) from
    ``data/<bot_type>/``. For each id in ``bot_ids`` the first ten non-null
    ``extended_entities`` values are evaluated and the share of images scoring
    above 0.5 is stored as ``nsfw_avg``; the profile image (with ``'normal'``
    replaced by ``'400x400'`` in its URL) gives ``nsfw_profile``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``nsfw_avg`` and ``nsfw_profile``, one row per id.
        Nothing is written to disk here.
    """
    users_df = pd.read_csv('data/' + bot_type + '/users.csv', encoding='utf-8-sig')
    tweets_df = pd.read_csv('data/' + bot_type + '/tweets.csv', encoding='utf-8-sig', sep='\t')
    columns = ['user_id', 'nsfw_avg', 'nsfw_profile']

    # Collect plain records and build the frame once; growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.
    records = []
    for processed, user_id in enumerate(bot_ids, start=1):
        tweets = tweets_df[tweets_df.user_id == user_id]
        media_entries = tweets.extended_entities[tweets.extended_entities.notnull()][:10]
        tot = len(media_entries)

        porn = 0
        for media in media_entries:
            try:
                python_dict = literal_eval(media)
                url = python_dict['media'][0]['media_url_https']
                if nsfw(url) > 0.5:
                    porn += 1
            # best effort: an entry that cannot be parsed still counts in tot
            # but not as a hit
            except Exception:
                pass

        nudity = porn/tot if tot > 0 else 0

        try:
            profile = users_df[users_df.id == user_id]['profile_image_url_https'].iloc[0].replace('normal', '400x400')
            profile_nudity = nsfw(profile)
        # missing user row or missing / non-string URL
        except Exception:
            profile_nudity = 0

        records.append({'user_id': user_id, 'nsfw_avg': nudity, 'nsfw_profile': profile_nudity})

        if processed % 10 == 0:
            clear_output()
            print(str(processed) +  " / " + str(len(bot_ids)) + " " + bot_type + " bots processed")

    df = pd.DataFrame(records, columns=columns)

    print(bot_type + " done!")
    return df

In [9]:
# NSFW scores for the porn accounts.
porn_nsfw = nsfw_detection(porn_ids, bot_type='porn')

6930 / 6935 porn bots processed
porn done!


In [10]:
# NSFW scores for the propaganda accounts.
prop_nsfw = nsfw_detection(prop_ids, bot_type='propaganda')

3370 / 3372 propaganda bots processed
propaganda done!


In [11]:
# NSFW scores for the spam accounts.
spam_nsfw = nsfw_detection(spam_ids, bot_type='spam')

5360 / 5361 spam bots processed
spam done!


In [12]:
# NSFW scores for the fake-follower accounts.
ff_nsfw = nsfw_detection(fake_ids, bot_type='fake_followers')

9050 / 9053 fake_followers bots processed
fake_followers done!


In [13]:
# Pass the genuine-category ids explicitly: gen_ids is also bound to
# gen_users['id'] in another cell, so its content depends on run order.
gen_nsfw = nsfw_detection(genuine_users['id'], 'genuine')

3660 / 3661 genuine bots processed
genuine done!


In [16]:
# Persist the porn NSFW scores without the row index.
porn_nsfw.to_csv(path_or_buf='data/porn/nsfw.csv', index=False)

In [18]:
# Persist the propaganda NSFW scores without the row index.
prop_nsfw.to_csv(path_or_buf='data/propaganda/nsfw.csv', index=False)

In [19]:
# Persist the spam NSFW scores without the row index.
spam_nsfw.to_csv(path_or_buf='data/spam/nsfw.csv', index=False)

In [20]:
# Persist the fake-follower NSFW scores without the row index.
ff_nsfw.to_csv(path_or_buf='data/fake_followers/nsfw.csv', index=False)

In [21]:
# Persist the genuine NSFW scores without the row index.
gen_nsfw.to_csv(path_or_buf='data/genuine/nsfw.csv', index=False)

# Semantic

In [9]:
def clusterize_tweets(tweets_df, k=5):
    """Cluster the tweet texts with k-means on their TF-IDF vectors.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Must provide a ``full_text`` column.
    k : int, default 5
        Number of clusters.

    Returns
    -------
    The fitted ``KMeans`` estimator (``random_state=0``, verbose output on).
    """
    vectorizer = TfidfVectorizer(smooth_idf=True)
    features = vectorizer.fit_transform(tweets_df['full_text'])

    model = KMeans(n_clusters=k, random_state=0, verbose=True)
    # fit() returns the estimator itself
    return model.fit(features)

In [None]:
# Tab-separated tweet dumps, one per account category.
porn = pd.read_csv(
    'data/porn/tweets.csv', sep='\t', encoding='utf-8-sig')
news = pd.read_csv(
    'data/propaganda/tweets.csv', sep='\t', encoding='utf-8-sig')
ff = pd.read_csv(
    'data/fake_followers/tweets.csv', sep='\t', encoding='utf-8-sig')
spam = pd.read_csv(
    'data/spam/tweets.csv', sep='\t', encoding='utf-8-sig')
gen = pd.read_csv(
    'data/genuine/tweets.csv', sep='\t', encoding='utf-8-sig')

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
# Fit five clusters on tweets_df; labels holds the fitted estimator, the
# per-tweet cluster ids are in labels.labels_.
labels = clusterize_tweets(tweets_df=tweets_df, k=5)

Initialization complete
Iteration  0, inertia 80923.216
Iteration  1, inertia 40994.959
Iteration  2, inertia 40901.466
Iteration  3, inertia 40877.730
Iteration  4, inertia 40871.792
Iteration  5, inertia 40870.639
Iteration  6, inertia 40870.193
Iteration  7, inertia 40869.459
Iteration  8, inertia 40869.189
Iteration  9, inertia 40868.484
Iteration 10, inertia 40868.086
Iteration 11, inertia 40867.866
Iteration 12, inertia 40867.622
Iteration 13, inertia 40867.305
Iteration 14, inertia 40867.298
Iteration 15, inertia 40867.297
Converged at iteration 15: center shift 0.000000e+00 within tolerance 1.167001e-09
Initialization complete
Iteration  0, inertia 81451.867
Iteration  1, inertia 40961.765
Iteration  2, inertia 40858.838
Iteration  3, inertia 40837.124
Iteration  4, inertia 40824.035
Iteration  5, inertia 40819.692
Iteration  6, inertia 40815.136
Iteration  7, inertia 40812.155
Iteration  8, inertia 40810.004
Iteration  9, inertia 40808.296
Iteration 10, inertia 40807.175
Itera

In [65]:
# Tweet texts as a numpy array, so they can be indexed with boolean masks.
tweets = np.asarray(tweets_df.full_text.tolist())

In [77]:
# get all tweets assigned to cluster 0
target_tweets = tweets[labels.labels_ == 0]

In [87]:
# Concatenate the selected tweets into one space-separated string. This rebinds
# target_tweets from an array to a str, so the cell is not idempotent.
target_tweets = " ".join(target_tweets)

In [90]:
# Lazily keep the whitespace-separated tokens that do not start with '#'
# (i.e. drop hashtags); filter() returns an iterator, nothing is computed yet.
a = filter(lambda x:x[0]!='#', target_tweets.split())

In [None]:
# NOTE(review): unfinished cell - the selection below is evaluated and then
# discarded, so the loop has no effect.
for i in range(5):
    
    # get all the tweets in a cluster
    tweets[labels.labels_ == i]
    
    