# Final Project for COGS118A, this is meant to find and identify russian 'troll' tweets.

Group members :
- Gael Van der Lee
- Alex Labranche

In [1]:
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
import re
import random

# Data gathering and cleaning

### Loading and cleaning

In [2]:
# Load the 3 datasets
troll_users = pd.read_csv('Tweets/Trolls/users.csv')
troll_tweets = pd.read_csv('Tweets/Trolls/tweets.csv')
normal_tweets = pd.read_csv('Tweets/Normal/dashboard_x_usa_x_filter_nativeretweets.csv')

In [3]:
print('Columns of each dataframe at import :\n\nTroll users : {}\n\nTroll tweets : {}\n\nNormal tweets : {}'
      .format(troll_users.columns.values, troll_tweets.columns.values, normal_tweets.columns.values))

Columns of each dataframe at import :

Troll users : ['id' 'location' 'name' 'followers_count' 'statuses_count' 'time_zone'
 'verified' 'lang' 'screen_name' 'description' 'created_at'
 'favourites_count' 'friends_count' 'listed_count']

Troll tweets : ['user_id' 'user_key' 'created_at' 'created_str' 'retweet_count'
 'retweeted' 'favorite_count' 'text' 'tweet_id' 'source' 'hashtags'
 'expanded_urls' 'posted' 'mentions' 'retweeted_status_id'
 'in_reply_to_status_id']

Normal tweets : ['Tweet Id' 'Date' 'Hour' 'User Name' 'Nickname' 'Bio' 'Tweet content'
 'Favs' 'RTs' 'Latitude' 'Longitude' 'Country' 'Place (as appears on Bio)'
 'Profile picture' 'Followers' 'Following' 'Listed'
 'Tweet language (ISO 639-1)' 'Tweet Url']


In [4]:
# Make Nicknames consistent
troll_users['screen_name'] = troll_users['screen_name'].str.lower()

# Drop the column we don't want to use
users_to_drop = ['statuses_count', 'time_zone', 'verified', 'favourites_count']
tweets_to_drop = ['created_at', 'tweet_id', 'source', 'posted', 'retweeted_status_id', 'retweeted', 'in_reply_to_status_id']
normal_to_drop = ['Tweet Id', 'Latitude', 'Longitude', 'Country', 'Profile picture', 'Tweet Url']
troll_users = troll_users.drop(users_to_drop, axis=1)
troll_tweets = troll_tweets.drop(tweets_to_drop, axis=1)
normal_tweets = normal_tweets.drop(normal_to_drop, axis=1)

In [5]:
users_new_cols = {'id': 'User Id', 'location': 'Location', 'name': 'User Name', 
                  'followers_count': 'Followers', 'lang': 'Language', 'screen_name': 'Nickname',
                  'description': 'Bio', 'created_at': 'Account creation date', 
                  'friends_count': 'Following', 'listed_count': 'Listed'}
tweets_new_cols = {'user_id': 'User Id', 'user_key': 'Nickname', 'created_str': 'Tweet date',
                   'retweet_count': 'Retweets', 'favorite_count': 'Favorites', 'text': 'Tweet',
                   'hashtags': 'Hashtags', 'expanded_urls': 'URLs', 'mentions': 'Mentions'}
normal_new_cols = {'Tweet content': 'Tweet', 'Favs': 'Favorites', 'RTs': 'Retweets', 
                   'Place (as appears on Bio)': 'Location', 'Tweet language (ISO 639-1)': 'Language'}

troll_users = troll_users.rename(columns=users_new_cols)
troll_tweets = troll_tweets.rename(columns=tweets_new_cols)
normal_tweets = normal_tweets.rename(columns=normal_new_cols)

# Make times formats the same
normal_tweets['Tweet date'] = normal_tweets['Date'] + ' ' + normal_tweets['Hour']
normal_tweets = normal_tweets.drop(['Date', 'Hour'], axis=1)

# Helps with merging
troll_users = troll_users.drop(['User Id', 'Account creation date'], axis=1)
troll_tweets = troll_tweets.drop('User Id', axis=1)

In [6]:
print('Columns of each dataframe after cleaning :\n\nTroll users : {}\n\nTroll tweets : {}\n\nNormal tweets : {}'
      .format(troll_users.columns.values, troll_tweets.columns.values, normal_tweets.columns.values))

Columns of each dataframe after cleaning :

Troll users : ['Location' 'User Name' 'Followers' 'Language' 'Nickname' 'Bio'
 'Following' 'Listed']

Troll tweets : ['Nickname' 'Tweet date' 'Retweets' 'Favorites' 'Tweet' 'Hashtags' 'URLs'
 'Mentions']

Normal tweets : ['User Name' 'Nickname' 'Bio' 'Tweet' 'Favorites' 'Retweets' 'Location'
 'Followers' 'Following' 'Listed' 'Language' 'Tweet date']


### Labeling and merging the datasets

In [7]:
# Merge the troll datasets
trolls = pd.merge(left=troll_tweets, right=troll_users)

In [8]:
trolls.iloc[:2, :]

Unnamed: 0,Nickname,Tweet date,Retweets,Favorites,Tweet,Hashtags,URLs,Mentions,Location,User Name,Followers,Language,Bio,Following,Listed
0,kathiemrr,2017-02-27 14:54:00,,,#ThingsDoneByMistake kissing auntie in the lips,"[""ThingsDoneByMistake""]",[],[],Atlanta,Kathie,2970.0,en,"Imperfection is beauty, madness is genius and ...",3157.0,22.0
1,kathiemrr,2016-11-28 16:17:36,,,RT @jadedsweetangel: #ToDoListBeforeChristmas ...,"[""ToDoListBeforeChristmas""]",[],[],Atlanta,Kathie,2970.0,en,"Imperfection is beauty, madness is genius and ...",3157.0,22.0


In [9]:
normal_tweets.iloc[:2, :]

Unnamed: 0,User Name,Nickname,Bio,Tweet,Favorites,Retweets,Location,Followers,Following,Listed,Language,Tweet date
0,Bill Schulhoff,BillSchulhoff,"Husband,Dad,GrandDad,Ordained Minister, Umpire...","Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...",,,"East Patchogue, NY",386.0,705.0,24.0,en,2016-04-16 12:44
1,Daniele Polis,danipolis,"Viagens, geek, moda, batons laranja, cabelos c...",Pausa pro café antes de embarcar no próximo vô...,,,"Grapevine, TX",812.0,647.0,16.0,pt,2016-04-16 12:44


In [10]:
# Put labels
trolls['y'] = pd.Series([1 for x in range(len(trolls))], index=trolls.index)
normal_tweets['y'] = pd.Series([0 for x in range(len(normal_tweets))], index=normal_tweets.index)

In [11]:
# Merge normal and troll data 
tweets = trolls.append(normal_tweets, ignore_index=True)

In [12]:
# Random rows we will use for visualization
rows = [random.randint(0, len(tweets)) for i in range(3)]

In [13]:
tweets.iloc[rows, :]

Unnamed: 0,Bio,Favorites,Followers,Following,Hashtags,Language,Listed,Location,Mentions,Nickname,Retweets,Tweet,Tweet date,URLs,User Name,y
293284,Follow this account for geo-targeted Healthcar...,,562.0,306.0,,en,87.0,"Fort Worth, TX",,tmj_dfw_health,,Primary Care physicians being... - Plaza Medic...,2016-04-15 18:48,,TMJ- DFW Health Jobs,0
52123,~Reagan Conservative ~Jesus Lover ~PRO-Israel ...,0.0,2225.0,2159.0,"[""TrumpPence16"",""AmericaFirst"",""Trump2016"",""ma...",en,72.0,USA,"[""fingersflying"",""realdonaldtrump""]",hyddrox,0.0,RT @Fingersflying: #CCOT But this doesn't mean...,2016-09-17 05:28:04,"[""https://twitter.com/JackPosobiec/status/7767...",Susan,1
305970,Follow this account for geo-targeted Healthcar...,,575.0,307.0,,en,46.0,"Statesville, NC",,tmj_clt_nursing,,Can you recommend anyone for this #Nursing #jo...,2016-04-15 16:53,,Charlotte Nursing,0


### Extracting hashtags and mentions

In [14]:
def extract_hashtag(string):
    htags = []
    string = str(string)
    for word in string.split():
        if word[0] == '#':
            htags.append(word[1:])
    splits= []
    # Puts space before capital letters and numbers
    for h in htags:
        splits.append(re.sub(r"([A-Z0-9])", r" \1", h)[1:])
    return splits

def extract_mentions(string):
    men = []
    string = str(string)
    for word in string.split():
        if word[0] == '@':
            if word[-1] == ':':
                men.append(word[1:-1])
            else:
                men.append(word[1:])
    return men

In [15]:
tweets['Hashtags'] = tweets['Tweet'].apply(extract_hashtag)
tweets['Mentions'] = tweets['Tweet'].apply(extract_mentions)

In [16]:
tweets.iloc[rows, :]

Unnamed: 0,Bio,Favorites,Followers,Following,Hashtags,Language,Listed,Location,Mentions,Nickname,Retweets,Tweet,Tweet date,URLs,User Name,y
293284,Follow this account for geo-targeted Healthcar...,,562.0,306.0,"[Emergency Medicine, Job]",en,87.0,"Fort Worth, TX",[],tmj_dfw_health,,Primary Care physicians being... - Plaza Medic...,2016-04-15 18:48,,TMJ- DFW Health Jobs,0
52123,~Reagan Conservative ~Jesus Lover ~PRO-Israel ...,0.0,2225.0,2159.0,"[C C O T, Trump 2 0 1 6, M A G A, Trump Pence ...",en,72.0,USA,[Fingersflying],hyddrox,0.0,RT @Fingersflying: #CCOT But this doesn't mean...,2016-09-17 05:28:04,"[""https://twitter.com/JackPosobiec/status/7767...",Susan,1
305970,Follow this account for geo-targeted Healthcar...,,575.0,307.0,"[Nursing, ob?, Statesville,, Hiring, Career Arc]",en,46.0,"Statesville, NC",[],tmj_clt_nursing,,Can you recommend anyone for this #Nursing #jo...,2016-04-15 16:53,,Charlotte Nursing,0


In [23]:
def delete(string):
    string = str(string)
    string = string.split()
    to_remove = []
    for word in string:
        if word[0] == '#' or word[0] == '@' or word == 'RT':
            to_remove.append(word)
    for word in to_remove:
        string.remove(word)
    return ' '.join(string)

In [24]:
tweets['Tweet'] = tweets['Tweet'].apply(delete)

In [26]:
tweets.iloc[rows, :]

Unnamed: 0,Bio,Favorites,Followers,Following,Hashtags,Language,Listed,Location,Mentions,Nickname,Retweets,Tweet,Tweet date,URLs,User Name,y
293284,Follow this account for geo-targeted Healthcar...,,562.0,306.0,"[Emergency Medicine, Job]",en,87.0,"Fort Worth, TX",[],tmj_dfw_health,,Primary Care physicians being... - Plaza Medic...,2016-04-15 18:48,,TMJ- DFW Health Jobs,0
52123,~Reagan Conservative ~Jesus Lover ~PRO-Israel ...,0.0,2225.0,2159.0,"[C C O T, Trump 2 0 1 6, M A G A, Trump Pence ...",en,72.0,USA,[Fingersflying],hyddrox,0.0,But this doesn't mean you don't go out and vot...,2016-09-17 05:28:04,"[""https://twitter.com/JackPosobiec/status/7767...",Susan,1
305970,Follow this account for geo-targeted Healthcar...,,575.0,307.0,"[Nursing, ob?, Statesville,, Hiring, Career Arc]",en,46.0,"Statesville, NC",[],tmj_clt_nursing,,Can you recommend anyone for this https://t.co...,2016-04-15 16:53,,Charlotte Nursing,0


# Different methods for NLP : One hot encoding and word2vec

In [None]:
create 1 hot dataframe :
https://stackoverflow.com/questions/18889588/create-dummies-from-column-with-multiple-values-in-pandas

In [None]:
Create word2vec dataframe

In [None]:
Create training and validation set

# Different methods for classification : SVM and K Nearsest Neighbors 

NOTE: The gridsearch objects may require parameters to reduce memory consumption

In [None]:
Create SVM

svc_1_hot = svm.SVC(C=1.0, kernel=’rbf’, degree=3, gamma=’auto’, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=’ovr’, random_state=None)

svc_word2vec = svm.SVC(C=1.0, kernel=’rbf’, degree=3, gamma=’auto’, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=’ovr’, random_state=None)

C_list =[]
kernel_list =[]


In [None]:
Train SVM using GridSearch on one-hot
save results in list

clf_svm_1 = GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch=‘2*n_jobs’, error_score=’raise’, return_train_score=’warn’)



In [None]:
Train SVM using Gridsearch on w2v
save results in list

clf_svm_2 = GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch=‘2*n_jobs’, error_score=’raise’, return_train_score=’warn’)


In [None]:
Create KNN

knn_one_hot = neighbors.KNeighborsClassifier(n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2, metric=’minkowski’, metric_params=None, n_jobs=1, **kwargs)\

knn_word2vec = neighbors.KNeighborsClassifier(n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2, metric=’minkowski’, metric_params=None, n_jobs=1, **kwargs)

neighbors_list = []
weighting_list = []


In [None]:
Train KNN using GridSearch on one-hot
save results in list

clf_knn_1 = GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch=‘2*n_jobs’, error_score=’raise’, return_train_score=’warn’)


In [None]:
Train KNN using Gridsearch on w2v
save results in list

clf_knn_2 = GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch=‘2*n_jobs’, error_score=’raise’, return_train_score=’warn’)


# Results

In [None]:
PLot and present results