In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.spatial.distance import euclidean
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [302]:
#targets:
#    0: porn
#    1: propaganda
#    2: spam
#    3: fake followers
#    4: genuine accounts

users = pd.read_csv('data/full/users.csv', encoding='utf-8-sig')
spam_users = pd.read_csv('data/spam/users.csv', encoding='utf-8-sig')

porn_ids = users[users.target==0]['id']
prop_ids = users[users.target==1]['id']
spam_ids = spam_users['id']
fake_ids = users[users.target==3]['id']
gen_ids = users[users.target==4]['id']

# TF-IDF

## Porn

In [47]:
tweets_df = pd.read_csv('data/porn/tweets.csv', encoding='utf-8-sig', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [156]:
def compute_centroid(tf_idf):

    center = tf_idf.sum(axis=1)/tf_idf.shape[0]
    return center

In [201]:
def dist_from_centroid(tf_idf, centroid):
    
    distances = []
    for elem in tf_idf:
        distances.append(np.linalg.norm(tf_idf - centroid))
    return distances

In [202]:
def wss(id):
    
    # get tweets per id
    tweets = tweets_df[tweets_df.user_id == id]['full_text']
    n_tweets = len(tweets)
    
    transformer = TfidfVectorizer(smooth_idf=True)
    tf_idf = transformer.fit_transform(tweets).todense()
    
    centroid = compute_centroid(tf_idf)
    distances = dist_from_centroid(tf_idf, centroid)
    avg_dist = np.asarray(distances).sum()/n_tweets
    
    return avg_dist

In [203]:
distances = []
for user in porn_ids:
    try:
        distances.append(wss(user))
    except Exception as e:
        print(e)
        distances.append(0)

# Context Score

In [323]:
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, normalize

porn_tweets_df = pd.read_csv('data/porn/tweets.csv', encoding='utf-8-sig', sep='\t')
prop_tweets_df = pd.read_csv('data/propaganda/tweets.csv', encoding='utf-8-sig', sep='\t')
spam_tweets_df = pd.read_csv('data/spam/tweets.csv', encoding='utf-8-sig', sep='\t')
fake_tweets_df = pd.read_csv('data/fake_followers/tweets.csv', encoding='utf-8-sig')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [347]:
from nltk.corpus import stopwords
import string
def get_main_words(tweets_df):
    
    n_words = 26
    tweets = tweets_df['full_text'].values.astype('str')
    
    #tokenize and remove stop words 
    punctuation = list(string.punctuation)
    stopWords = stopwords.words('english') + stopwords.words('italian') + stopwords.words('french') + stopwords.words('spanish') + punctuation + ['...', '"the', "i'm", 'go', 'time', 'get', 'rt', 'via', '&amp;'] + ["it's"]

    word_counter = Counter()
    for elem in tweets:
        word_counter.update(elem.lower().split())
    
    for word in stopWords:
        if word in word_counter:
            del word_counter[word]
            

    main_words = pd.DataFrame(data=word_counter.most_common(n_words), index=None, columns=['word', 'score'])
    
    # normalize scores
    scaler = MinMaxScaler()

    main_words['score'] = pd.DataFrame(scaler.fit_transform(main_words.score.values.reshape(-1, 1)))
    
    print(main_words[:25])
    
    return main_words[:25]

In [348]:
porn_vocabulary = get_main_words(porn_tweets_df)
porn_vocabulary.to_csv('data/porn/main_words.csv', encoding='utf-8-sig')
prop_vocabulary = get_main_words(prop_tweets_df)
prop_vocabulary.to_csv('data/propaganda/main_words.csv', encoding='utf-8-sig')
spam_vocabulary = get_main_words(spam_tweets_df)
spam_vocabulary.to_csv('data/spam/main_words.csv', encoding='utf-8-sig')
fake_vocabulary = get_main_words(fake_tweets_df)
fake_vocabulary.to_csv('data/fake_followers/main_words.csv', encoding='utf-8-sig')



        word     score
0       love  1.000000
1        new  0.482651
2       like  0.439875
3       look  0.427942
4        one  0.356343
5        bio  0.233156
6       good  0.206903
7      never  0.189279
8        see  0.151643
9     people  0.151092
10  @youtube  0.133284
11      want  0.131999
12       day  0.115660
13      need  0.098036
14     today  0.093997
15     check  0.091059
16     great  0.087571
17         2  0.057279
18        us  0.048834
19      know  0.047916
20     happy  0.036534
21      make  0.021663
22    follow  0.013402
23      best  0.013218
24     first  0.007160
                 word     score
0               trump  1.000000
1           president  0.436308
2   @realdonaldtrump:  0.269300
3              people  0.223744
4                like  0.201323
5    @realdonaldtrump  0.198466
6                 one  0.167816
7                  us  0.143656
8               obama  0.122663
9               would  0.093969
10                new  0.069809
11              tr

# Compute Context Score

In [333]:
from IPython.display import clear_output
porn_words = pd.read_csv('data/porn/main_words.csv', sep=',')
prop_words = pd.read_csv('data/propaganda/main_words.csv', sep=',')
spam_words = pd.read_csv('data/spam/main_words.csv', sep=',')
fake_words = pd.read_csv('data/fake_followers/main_words.csv', sep=',')

In [289]:
def compute_score(tweets):

    user_score = pd.DataFrame(columns=['porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score'])

    for tweet in tweets:
        # check for words in main_words and compute the scores for each tweet and for each category
        mask = np.in1d(porn_words.word, tweet.split())
        porn_score = porn_words.loc[mask]['score'].values.sum()
        mask = np.in1d(prop_words.word, tweet.split())
        prop_score = prop_words.loc[mask]['score'].values.sum()
        mask = np.in1d(spam_words.word, tweet.split())
        spam_score = spam_words.loc[mask]['score'].values.sum()
        mask = np.in1d(fake_words.word, tweet.split())
        fake_score = fake_words.loc[mask]['score'].values.sum()
        user_score = user_score.append(pd.DataFrame({'porn_words_score': porn_score, 'prop_words_score': prop_score, 'spam_words_score': spam_score,'fake_words_score': fake_score}, index=[0]), ignore_index=True)

    return user_score

In [290]:
def score(tweets_df, id):
    
    tweets = tweets_df[tweets_df.user_id == id]['full_text']
    if len(tweets) > 0:
        # sum all the scores of each category
        user_score = compute_score(tweets).sum()
        scores = np.divide(user_score,len(tweets))
    else:
        scores = np.array([0,0,0,0])
    
    # return the average scores of each user
    return scores

In [291]:
score_df = pd.DataFrame(columns=['porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score', 'user_id'])
i = 0
for user_id in porn_ids:
    i += 1
    scores = score(porn_tweets_df, user_id)
    temp_df = pd.DataFrame(data=[scores])
    temp_df['user_id'] = user_id
    score_df = score_df.append(temp_df)
    if (i%100 == 0):
        print(str(i) + " users processed")
score_df.to_csv('data/porn/context_score.csv')

100 users processed
200 users processed
300 users processed
400 users processed
500 users processed
600 users processed
700 users processed
800 users processed
900 users processed
1000 users processed
1100 users processed
1200 users processed
1300 users processed
1400 users processed
1500 users processed
1600 users processed
1700 users processed
1800 users processed
1900 users processed
2000 users processed
2100 users processed
2200 users processed
2300 users processed
2400 users processed
2500 users processed
2600 users processed
2700 users processed
2800 users processed
2900 users processed
3000 users processed
3100 users processed
3200 users processed
3300 users processed
3400 users processed
3500 users processed
3600 users processed
3700 users processed


In [293]:
score_df = pd.DataFrame(columns=['porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score', 'user_id'])
i = 0
for user_id in prop_ids:
    i += 1
    scores = score(prop_tweets_df, user_id)
    temp_df = pd.DataFrame(data=[scores])
    temp_df['user_id'] = user_id
    score_df = score_df.append(temp_df)
    if (i%100 == 0):
        clear_output()
        print(str(i) + " users processed")
score_df.to_csv('data/propaganda/context_score.csv')

100 users processed
200 users processed
300 users processed
400 users processed
500 users processed
600 users processed
700 users processed
800 users processed
900 users processed
1000 users processed
1100 users processed
1200 users processed
1300 users processed
1400 users processed
1500 users processed
1600 users processed
1700 users processed
1800 users processed
1900 users processed
2000 users processed
2100 users processed
2200 users processed
2300 users processed
2400 users processed
2500 users processed
2600 users processed
2700 users processed
2800 users processed
2900 users processed
3000 users processed
3100 users processed
3200 users processed
3300 users processed


In [314]:
spam_ids.drop(3399, inplace=True)

In [316]:
score_df = pd.DataFrame(columns=['porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score', 'user_id'])
i = 0
for user_id in spam_ids:
    i += 1
    scores = score(spam_tweets_df, user_id)
    temp_df = pd.DataFrame(data=[scores])
    temp_df['user_id'] = user_id
    score_df = score_df.append(temp_df)
    if (i%100 == 0):
        clear_output()
        print(str(i) + " users processed")
        
score_df.to_csv('data/spam/context_score.csv')

3700 users processed


In [334]:
fake_tweets_df = fake_tweets_df.rename(columns={'text': 'full_text'})

In [335]:
fake_tweets_df['full_text'] = fake_tweets_df['full_text'].astype(str)

In [296]:
score_df = pd.DataFrame(columns=['porn_words_score', 'prop_words_score', 'spam_words_score', 'fake_words_score', 'user_id'])
i = 0
for user_id in fake_ids:
    i += 1
    scores = score(fake_tweets_df, user_id)
    temp_df = pd.DataFrame(data=[scores])
    temp_df['user_id'] = user_id
    score_df = score_df.append(temp_df)
    if (i%100 == 0):
        clear_output()
        print(str(i) + " users processed")
        
score_df.to_csv('data/fake/context_score.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


100 users processed
200 users processed
300 users processed
400 users processed
500 users processed


AttributeError: 'float' object has no attribute 'split'