# Final Project for COGS118A: finding and identifying Russian 'troll' tweets.

Group members :
- Gael Van der Lee
- Alex Labranche

In [None]:
import random
import re
import time
from http.client import HTTPException
from urllib.request import urlopen

import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
from sklearn import neighbors
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Data gathering and cleaning

### Loading and cleaning

In [None]:
# Read the three raw CSV exports: troll accounts, troll tweets, and ordinary tweets.
troll_users = pd.read_csv(filepath_or_buffer='Tweets/Trolls/users.csv')
troll_tweets = pd.read_csv(filepath_or_buffer='Tweets/Trolls/tweets.csv')
normal_tweets = pd.read_csv(
    filepath_or_buffer='Tweets/Normal/dashboard_x_usa_x_filter_nativeretweets.csv'
)

In [None]:
# Report the column names of every dataframe exactly as they were loaded.
column_report = 'Columns of each dataframe at import :\n\nTroll users : {}\n\nTroll tweets : {}\n\nNormal tweets : {}'
print(column_report.format(
    troll_users.columns.values,
    troll_tweets.columns.values,
    normal_tweets.columns.values,
))

In [None]:
# Normalise the troll account nicknames to lower case.
troll_users['screen_name'] = troll_users['screen_name'].str.lower()

# Columns considered irrelevant, listed per dataframe.
users_to_drop = ['statuses_count', 'time_zone', 'verified', 'favourites_count']
tweets_to_drop = [
    'created_at', 'tweet_id', 'source', 'posted',
    'retweeted_status_id', 'retweeted', 'in_reply_to_status_id',
]
normal_to_drop = ['Tweet Id', 'Latitude', 'Longitude', 'Country', 'Profile picture', 'Tweet Url']

# Remove those columns from each dataframe.
troll_users = troll_users.drop(columns=users_to_drop)
troll_tweets = troll_tweets.drop(columns=tweets_to_drop)
normal_tweets = normal_tweets.drop(columns=normal_to_drop)

In [None]:
# Old-name -> new-name mappings that give the three dataframes a shared vocabulary.
users_new_cols = {
    'id': 'User Id',
    'location': 'Location',
    'name': 'User Name',
    'followers_count': 'Followers',
    'lang': 'Language',
    'screen_name': 'Nickname',
    'description': 'Bio',
    'created_at': 'Account creation date',
    'friends_count': 'Following',
    'listed_count': 'Listed',
}
tweets_new_cols = {
    'user_id': 'User Id',
    'user_key': 'Nickname',
    'created_str': 'Tweet date',
    'retweet_count': 'Retweets',
    'favorite_count': 'Favorites',
    'text': 'Tweet',
    'hashtags': 'Hashtags',
    'expanded_urls': 'URLs',
    'mentions': 'Mentions',
}
normal_new_cols = {
    'Tweet content': 'Tweet',
    'Favs': 'Favorites',
    'RTs': 'Retweets',
    'Place (as appears on Bio)': 'Location',
    'Tweet language (ISO 639-1)': 'Language',
}

troll_users = troll_users.rename(users_new_cols, axis='columns')
troll_tweets = troll_tweets.rename(tweets_new_cols, axis='columns')
normal_tweets = normal_tweets.rename(normal_new_cols, axis='columns')

# Combine the separate date and hour columns of the normal tweets into one
# 'Tweet date' column, then discard the two source columns.
normal_tweets['Tweet date'] = normal_tweets['Date'] + ' ' + normal_tweets['Hour']
normal_tweets = normal_tweets.drop(columns=['Date', 'Hour'])

# Drop columns that would get in the way of the upcoming merge.
troll_users = troll_users.drop(columns=['User Id', 'Account creation date'])
troll_tweets = troll_tweets.drop(columns='User Id')

In [None]:
# Report the column names of every dataframe once cleaning is done.
cleaned_column_report = 'Columns of each dataframe after cleaning :\n\nTroll users : {}\n\nTroll tweets : {}\n\nNormal tweets : {}'
print(cleaned_column_report.format(
    troll_users.columns.values,
    troll_tweets.columns.values,
    normal_tweets.columns.values,
))

### Labeling and merging the datasets

In [None]:
# Join each troll tweet with its author's account data. No key is given, so
# pandas joins on the column names the two frames have in common.
trolls = troll_tweets.merge(right=troll_users)

In [None]:
# Preview the first two merged troll rows.
trolls.head(2)

In [None]:
# Preview the first two normal-tweet rows.
normal_tweets.head(2)

In [None]:
# Put labels: 1 marks a troll tweet, 0 a normal tweet.
# Assigning a scalar broadcasts it to every row, so there is no need to
# build a Python list with one element per row first.
trolls['y'] = 1
normal_tweets['y'] = 0

In [None]:
# Stack the troll and normal tweets into one dataframe with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
tweets = pd.concat([trolls, normal_tweets], ignore_index=True)
print(tweets.shape)

In [None]:
# Random row positions we will use for visualization.
# randrange excludes its upper bound, whereas randint(0, len(tweets)) could
# return len(tweets) itself, which is out of range for tweets.iloc.
rows = [random.randrange(len(tweets)) for _ in range(4)]

In [None]:
# Show the sampled rows (all columns) before feature extraction.
tweets.iloc[rows]

### Extracting hashtags and mentions

In [None]:
def extract_hashtag(string):
    """Return the hashtags of a tweet, each split into space-separated parts.

    The input is converted with ``str`` first, so non-string values (such as
    NaN) are accepted. Every whitespace-separated word starting with '#' is
    taken as a hashtag; the '#' is removed and a space is inserted before each
    capital letter and each digit (``#BlackLivesMatter`` -> ``Black Lives
    Matter``). A bare ``#`` with nothing after it is ignored.

    Returns:
        A list of strings, one per hashtag, in order of appearance.
    """
    hashtags = []
    for word in str(string).split():
        if not word.startswith('#') or len(word) == 1:
            continue
        spaced = re.sub(r"([A-Z0-9])", r" \1", word[1:])
        # Only strip the space the substitution may have put in front.
        # Slicing off the first character unconditionally would eat the first
        # letter of hashtags that start with a lowercase letter (e.g. "#maga").
        hashtags.append(spaced.lstrip())
    return hashtags

def extract_mentions(string):
    """Collect the @-mentions of a tweet without their leading '@'.

    The input is converted with ``str`` and split on whitespace. For every
    word that starts with '@', the '@' is removed; if the word ends with ':'
    (as in ``RT @user:``) that final colon is removed as well.

    Returns:
        A list of mention strings in order of appearance.
    """
    tokens = str(string).split()
    return [
        token[1:-1] if token.endswith(':') else token[1:]
        for token in tokens
        if token.startswith('@')
    ]

# Not used in the pipeline yet, takes way too long to run on ~400,000 tweets
def extract_links(string):
    """Resolve every link in a tweet to the URL it finally points to.

    The input is converted with ``str`` and split on whitespace. Each word
    beginning with 'http' is opened with ``urlopen`` and the URL reported by
    the response (``geturl()``) is collected. Links that cannot be opened are
    skipped, so the result may be shorter than the number of links in the
    tweet.

    Returns:
        A list of resolved URL strings, in order of appearance.
    """
    links = []
    for word in str(string).split():
        if not word.startswith('http'):
            continue
        try:
            # The context manager closes the connection once the URL is read.
            with urlopen(word) as response:
                links.append(response.geturl())
        except (OSError, ValueError, HTTPException):
            # Best effort: unreachable or malformed links are ignored.
            # Catching these specific errors (rather than everything) keeps
            # Ctrl-C / KeyboardInterrupt able to stop a long run.
            continue
    return links

def extract_website(string):
    """Return the host part of a URL, i.e. the text between '//' and the next '/'.

    The input is converted with ``str`` first, so non-string values (such as
    NaN) are accepted. If the text contains no '//', an empty string is
    returned. If nothing follows the host (no further '/'), everything after
    the '//' is returned.

    Returns:
        The host string, or '' when no '//' is present.
    """
    _, separator, remainder = str(string).partition('//')
    if not separator:
        return ''
    # partition yields the whole remainder when there is no '/', so URLs like
    # "http://example.com" give "example.com" instead of an empty string.
    return remainder.partition('/')[0]

In [None]:
# Derive the website of every tweet's URL entry and show the non-empty ones.
test = tweets['URLs'].map(extract_website)
test.loc[test != '']

In [None]:
# `trolls` has no 'test' column (the `test` Series is separate and built from
# `tweets`), so the website is computed here for the troll rows directly.
troll_sites = trolls['URLs'].apply(extract_website)
trolls.assign(test=troll_sites).loc[troll_sites.str.len() != 0, ['Tweet', 'URLs', 'test']]

In [None]:
# Recompute the hashtag and mention columns from the tweet text itself.
tweet_text = tweets['Tweet']
tweets['Hashtags'] = tweet_text.map(extract_hashtag)
tweets['Mentions'] = tweet_text.map(extract_mentions)

In [None]:
def clean(string):
    """Strip hashtags, mentions and the 'RT' marker from a tweet.

    The input is converted with ``str`` and split on whitespace. Words that
    start with '#' or '@', and words exactly equal to 'RT', are dropped; the
    remaining words are joined back together with single spaces.

    Returns:
        The cleaned tweet text.
    """
    # One filtering pass replaces the earlier collect-then-list.remove loop,
    # which rescanned the word list for every removed word.
    kept = [
        word for word in str(string).split()
        if word != 'RT' and not word.startswith(('#', '@'))
    ]
    return ' '.join(kept)

In [None]:
# Replace every tweet's text with its cleaned version.
tweets['Tweet'] = tweets['Tweet'].map(clean)

In [None]:
# Show the same sampled rows again, now after extraction and cleaning.
tweets.iloc[rows]

## Replace Missing Values

In [None]:
#replace NaNs with 0s where appropriate
#replace NaNs with empty list where appropriate

# Different methods for NLP : One hot encoding and word2vec

In [None]:
#preliminary data reformatting
#replace language with one-hot encoded form
#reformat date (?)
#probably won't use: bio, listed, location, nickname, username, URLs?

In [None]:
# Remove newlines for one-hot encoding.
# Work on a copy so the original `tweets` dataframe is left untouched.
tweets_1_hot = tweets.copy()
# str.replace returns a new Series rather than modifying in place, so the
# result has to be assigned back for the replacement to take effect.
tweets_1_hot['Tweet'] = tweets_1_hot['Tweet'].str.replace('\n', ' ', regex=False)


In [None]:
# TODO: create the 1-hot dataframe, see:
# https://stackoverflow.com/questions/18889588/create-dummies-from-column-with-multiple-values-in-pandas

In [None]:
# TODO: create the word2vec dataframe, see:
#
# https://datascience.stackexchange.com/questions/10695/how-to-initialize-a-new-word2vec-model-with-pre-trained-model-weights
    

In [None]:
# TODO: create the training and validation sets

# Different methods for classification : SVM and K Nearest Neighbors

NOTE: The gridsearch objects may require parameters to reduce memory consumption

In [None]:
# Create SVM
# (The heading above used to be bare text, which is a syntax error in a code cell.)

# Settings shared by both SVM classifiers; kept in one place so the two
# estimators cannot drift apart.
svc_base_params = dict(
    C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True,
    probability=False, tol=0.001, cache_size=200, class_weight=None,
    verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None,
)

# Initialize classifier objects for the 1-hot and word2vec encodings.
svc_1_hot = svm.SVC(**svc_base_params)
svc_word2vec = svm.SVC(**svc_base_params)

# Initialize the grid-search parameters.

# Penalty parameter: 10, 1, 0.1, 0.01, 0.001, 0.0001
C_list = [10**(1-i) for i in range(6)]
# Kernel function - poly: polynomial
kernel_list = ['linear', 'rbf', 'sigmoid', 'poly']
param_dic_SVM = {'C': C_list, 'kernel': kernel_list}

In [None]:
# Train SVM using GridSearch on one-hot
# TODO: fit on the one-hot training data and save results in a list

# Search over the SVM grid with the one-hot classifier. The earlier call
# referenced an undefined `estimator` and a misspelled `param_dict_SVM`.
clf_svm_1 = GridSearchCV(svc_1_hot, param_dic_SVM, cv=5, refit=True)


In [None]:
# Train SVM using GridSearch on w2v
# TODO: fit on the word2vec training data and save results in a list

# Search over the SVM grid with the word2vec classifier. The earlier call
# referenced an undefined `estimator` and a misspelled `param_dict_SVM`.
clf_svm_2 = GridSearchCV(svc_word2vec, param_dic_SVM, cv=5, refit=True)


In [None]:
# Create KNN
# (The heading above used to be bare text, which is a syntax error in a code cell.)

# One classifier per encoding; n_jobs=-1 is passed to both.
knn_one_hot = neighbors.KNeighborsClassifier(n_jobs=-1)

knn_word2vec = neighbors.KNeighborsClassifier(n_jobs=-1)

# Initialize the grid-search parameters.

# Number of nearest neighbors to consider: 3-10
n_neighbors_list = list(range(3, 11))
# Weighting of neighbors, whether they all get an equal vote or are weighted by distance
weights_list = ['uniform', 'distance']
param_dic_knn = {'n_neighbors': n_neighbors_list, 'weights': weights_list}


In [None]:
# Train KNN using GridSearch on one-hot
# TODO: fit on the one-hot training data and save results in a list

# The grid is defined under the name `param_dic_knn`; the earlier call used
# the misspelled `param_dict_knn`.
clf_knn_1 = GridSearchCV(knn_one_hot, param_dic_knn, cv=5, refit=True)

In [None]:
# Train KNN using GridSearch on w2v
# TODO: fit on the word2vec training data and save results in a list

# The grid is defined under the name `param_dic_knn`; the earlier call used
# the misspelled `param_dict_knn`.
clf_knn_2 = GridSearchCV(knn_word2vec, param_dic_knn, cv=5, refit=True)

# Results

In [None]:
# TODO: plot and present results