## Imports

In [0]:
! pip install pydrive



In [0]:
# Numpy
import numpy as np

# Pandas
import pandas as pd

# PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, Normalizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error, confusion_matrix, roc_auc_score, auc, average_precision_score, accuracy_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import export_graphviz, DecisionTreeClassifier

# # gensim
from gensim import models
import gensim.downloader as api
from gensim.utils import lemmatize

# scipy
from scipy.sparse import hstack

# Warnings
import warnings
warnings.filterwarnings('ignore')

## Helper Function

In [0]:
# Function to plot coefs
def plot_coefs(features, model, top_n=10):
    """Bar-plot the largest positive and negative coefficients of a model.

    Parameters
    ----------
    features : sequence of str
        Feature names, in the same order as the entries of ``model.coef_[0]``.
    model : object with a ``coef_`` attribute
        Only the first row, ``model.coef_[0]``, is plotted.
    top_n : int, default 10
        Maximum number of coefficients taken from each end of the ranking.

    Notes
    -----
    The names come from the ``features`` argument; nothing is read from
    notebook globals. When fewer than ``2 * top_n`` coefficients exist, the
    ranking is split in two so that no feature is plotted twice and the
    colour list always matches the number of bars.
    """
    top_n = max(int(top_n), 0)

    # list() prevents pandas from aligning on an index if a Series is passed
    coefs = (
        pd.DataFrame({'feature': list(features), 'coef': model.coef_[0]})
        .sort_values(by='coef', ascending=False)
    )

    n_total = len(coefs)
    n_neg = min(top_n, n_total // 2)
    n_pos = min(top_n, n_total - n_neg)

    top_pos = coefs.iloc[:n_pos]
    top_neg = coefs.iloc[n_total - n_neg:]
    top_coefs = pd.concat([top_neg, top_pos]).sort_values(by='coef')

    # After the ascending sort the lowest-ranked group comes first (red),
    # followed by the highest-ranked group (blue)
    colors = ['r'] * n_neg + ['b'] * n_pos

    #Plot like in slides
    top_coefs.plot.bar(x='feature', y='coef', color=colors)

## Download Data

In [0]:
# Authenticate and create the PyDrive client.
# Order matters: the Colab user must be authenticated before the
# application-default credentials are requested.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default Google credentials to PyDrive
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used by the download cell below
drive = GoogleDrive(gauth)

In [0]:
# Get file links
link_charlie = "https://drive.google.com/open?id=1O29GXq_GE10w_dByM83tPFV5Ysz_aw-u"
link_ferguson = "https://drive.google.com/open?id=1x--tPxfdWHE3K9Du4vCMTyVdqLXgc6eq"
link_germanwings = "https://drive.google.com/open?id=1rTekmKthibm0KKEnXtx44IJ9vr2Oofzp"
link_ottawashooting = "https://drive.google.com/open?id=1QIvIVM_pYSGw8nOXjXjmdmvFtetEKFN4"
link_sydneysiege = "https://drive.google.com/open?id=1KH50vUC3Qb3mXbwk6qTJ77ZTsEdfwJ3h"


def drive_file_id(link):
    """Return the value of the ``id`` query parameter of a Drive link.

    Unpacking ``link.split("=")`` into two names fails as soon as the link
    carries a second ``=`` (for example a trailing ``&usp=sharing``); this
    helper inspects the query parameters one by one instead.

    Raises
    ------
    ValueError
        If the link has no non-empty ``id`` parameter.
    """
    query = link.partition("?")[2]
    for param in query.split("&"):
        key, _, value = param.partition("=")
        if key == "id" and value:
            return value
    raise ValueError("No Drive file id found in link: %r" % (link,))


# Local CSV name -> Drive link it is fetched from
drive_links = {
    'charliehebdo_tweets.csv': link_charlie,
    'ferguson_tweets.csv': link_ferguson,
    'germanwings_tweets.csv': link_germanwings,
    'ottawashooting_tweets.csv': link_ottawashooting,
    'sydneysiege_tweets.csv': link_sydneysiege,
}

# Download Files
for filename, link in drive_links.items():
    downloaded = drive.CreateFile({'id': drive_file_id(link)})
    downloaded.GetContentFile(filename)

## Import Data

In [0]:
# Read the five per-event CSV files downloaded above
csv_files = [
    'charliehebdo_tweets.csv',
    'ferguson_tweets.csv',
    'germanwings_tweets.csv',
    'ottawashooting_tweets.csv',
    'sydneysiege_tweets.csv',
]
df1, df2, df3, df4, df5 = (pd.read_csv(name) for name in csv_files)

df = [df1, df2, df3, df4, df5]

# Stack the events into one frame, cast the three flag columns to int
# and discard the hashtag column
data = (
    pd.concat(df)
    .astype({'is_fake': int, 'retweet': int, 'user_verified': int})
    .drop(columns=['tweet_hashtag'])
)

In [0]:
# Summary statistics per is_fake group, transposed so the statistics run down the rows
data.groupby('is_fake').describe().T

In [0]:
# Show the column names of the combined frame
data.columns

Index(['tweet_id', 'tweet_text', 'retweet', 'retweet_source_id',
       'retweet_count', 'is_fake', 'user_verified', 'user_followers_count',
       'user_statuses_count', 'user_friends_count', 'user_favourites_count',
       'tweet_relative_age'],
      dtype='object')

## Word Embedding Model (GloVe)

In [0]:
# Load the pretrained "glove-twitter-200" vectors through gensim's downloader API
# (presumably fetched on first use and cached afterwards — TODO confirm).
model = api.load("glove-twitter-200")



In [0]:
# Input is the raw tweet text; target is the fake/real flag
X = data['tweet_text']
y = data['is_fake']

# Stratified train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, random_state=7
)

# Plain array of tweet strings for the vectorizer
X_train_vector = X_train.values.squeeze()

# Bag-of-words counts with English stop words removed
vect = CountVectorizer(stop_words='english')
X_train_vec = vect.fit_transform(X_train_vector)


In [0]:
# For every training tweet, recover the vocabulary terms it contains
token_lists = vect.inverse_transform(X_train_vec)

# Tweets contain different numbers of terms, so the rows are ragged.
# np.array() on ragged input raises ValueError on NumPy >= 1.24, hence the
# object array is filled one element at a time.
tokenized_X_train = np.empty(len(token_lists), dtype=object)
for row, tokens in enumerate(token_lists):
    tokenized_X_train[row] = tokens

# np.asarray keeps the cell re-runnable: on a second run y_train is already
# an ndarray and has no `.values` attribute
y_train = np.asarray(y_train)

In [0]:
# Every word the embedding model has a vector for
vocab = [word for word in model.vocab.keys()]

# Set version of the vocabulary, used later to intersect
# with the tokens of each tweet
vocab_set = set(vocab)

In [0]:
subset = tokenized_X_train

# Keep only tweets with at least one token known to the embedding model,
# carrying the matching label along
new_subset, y_sub = [], []
for tokens, label in zip(subset, y_train):
    known = vocab_set.intersection(set(tokens))
    if known:
        new_subset.append(list(known))
        y_sub.append(label)

In [0]:
# Represent each retained tweet by the mean of its word vectors
train_set = np.array(
    [np.mean(model[words], axis=0) for words in new_subset]
)

### Grid Search Logistic Regression

In [0]:
# Values of C to search over
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# 5-fold cross-validated grid search on all available cores
grid_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    cv=5,
    n_jobs=-1,
)

# fit grid
grid_lr.fit(train_set, y_sub)


grid_lr.best_score_

0.7211180042412507

### Grid Search Random Forest

In [0]:
# Forest size and tree depth combinations to search over
params = {'n_estimators': [50, 100],
          'max_depth': [10, 20, 50, 100]}

# random_state makes the forest, and therefore best_score_, reproducible
# across kernel restarts; 7 matches the seed used for the train/validation split
grid_rf = GridSearchCV(RandomForestClassifier(random_state=7),
                       param_grid=params,
                       n_jobs=-1, cv=5)

# fit grid
grid_rf.fit(train_set, y_sub)


grid_rf.best_score_

0.7400324021021851

### Grid Search Gradient Boosting

In [0]:
# 'auto' is left out of max_features: for GradientBoostingClassifier it was
# an alias of 'sqrt' (so it only duplicated work) and scikit-learn >= 1.3
# rejects it
params = {'n_estimators': [50, 100],
          'learning_rate': [0.1, 0.3, 0.5],
          'max_features': ['sqrt', 'log2']}

# random_state makes the feature sub-sampling, and therefore best_score_,
# reproducible; 7 matches the seed used for the train/validation split
grid_gb = GridSearchCV(GradientBoostingClassifier(random_state=7),
                       param_grid=params,
                       n_jobs=-1, cv=5)

# fit grid
grid_gb.fit(train_set, y_sub)


grid_gb.best_score_

0.7297980795827241