## Import relevant libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import unicodedata
import json
import random
import csv
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
f = open('tweets-2016-10000-textonly.txt', 'r',encoding="utf8")
lines = f.readlines()



In [3]:
tweets = []
labels = []

len_train = 1000

## Get the dataset

In [4]:
with open('IRAhandle_tweets_1.csv', newline='', encoding="utf8") as csvfile:
    categories = csvfile.readline().split(",")
    tweetreader = csv.reader(csvfile, delimiter=',')
    counter = 0
    for row in tweetreader:
        tweet = dict(zip(categories, row))
        if tweet['language'] == 'English':
            tweets.append(tweet['content'])
            labels.append(1)
            counter += 1
        if counter > len_train:
            break
csvfile.close()



for line in lines:
    tweets.append(line)
    labels.append(0)

f.close()
            
tweets_to_labels = dict(zip(tweets, labels))
random.shuffle(tweets)

actual = []

for tweet in tweets:
    actual.append(tweets_to_labels[tweet])
data=pd.DataFrame()
data['Text']=tweets
data['labels']=actual
data

Unnamed: 0,Text,labels
0,@GStephanopoulos Bernie Sanders on your show ...,0
1,It's 1:00am PST/4:00am EST. Do you know where ...,0
2,#DumpTrump\n,0
3,Look at this bullshit!��� SO topical! everybod...,1
4,I was a #BasementDwellers. Lived out of my car...,0
...,...,...
8968,#Debates2016\n,0
8969,"Live from New York, it's a Trump-Clinton remat...",0
8970,Same shit But now in color today #BlackLivesMa...,1
8971,#TrumpTaxes\n,0


## Exploring the data

In [5]:
data['labels'].value_counts()

0    7972
1    1001
Name: labels, dtype: int64

In [6]:
data.describe(include='all')

Unnamed: 0,Text,labels
count,8973,8973.0
unique,8622,
top,\n,
freq,255,
mean,,0.111557
std,,0.314838
min,,0.0
25%,,0.0
50%,,0.0
75%,,0.0


## Preprocessing the data
- Convsersion to lower case
- Removal Punctuation
- Tokenization of text using word_tokenize
- Removing stop words
- Stemming using Porter Stemmer

In [7]:
df=data.copy()

In [8]:

# train_majority = train_clean[train_clean.label==0]
# train_minority = train_clean[train_clean.label==1]
# train_minority_upsampled = resample(train_minority, 
#                                  replace=True,    
#                                  n_samples=len(train_majority),   
#                                  random_state=123)
# train_upsampled = pd.concat([train_minority_upsampled, train_majority])
# train_upsampled['label'].value_counts()

In [9]:
df['Text'].head()

0    @GStephanopoulos  Bernie Sanders on your show ...
1    It's 1:00am PST/4:00am EST. Do you know where ...
2                                         #DumpTrump\n
3    Look at this bullshit!��� SO topical! everybod...
4    I was a #BasementDwellers. Lived out of my car...
Name: Text, dtype: object

In [10]:
df['Text']=df['Text'].astype('string')

In [11]:
def preprocess(text):
    text = text.lower()
    
    text_p = "".join([char for char in text if char not in string.punctuation])
    
    words = word_tokenize(text_p)
    
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
#     porter=PorterStemmer()
#     stem=porter.stem(filtered_words)
        
    return filtered_words

In [12]:
df['Text'] = df['Text'].apply(lambda x:preprocess(x))

In [13]:
df['Text'].head(8)

0    [gstephanopoulos, bernie, sanders, show, talki...
1    [100am, pst400am, est, know, child, aka, reald...
2                                          [dumptrump]
3    [look, bullshit���, topical, everybody, talkin...
4    [basementdwellers, lived, car, home, thanks, w...
5                                      [foxnewssunday]
6    [10, emotional, abuse, tactics, trump, blatant...
7    [nytimes, illegally, revealing, trump, may, ta...
Name: Text, dtype: object

In [14]:
porter=PorterStemmer()

def stem(words):
    stem_sentence=[]
    for word in words:
#         stem_words.append(porter.stem(word))
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)
        

In [15]:
df['Text'] = df['Text'].apply(lambda x:stem(x))

In [16]:
df['Text'].head(10)

0    gstephanopoulo berni sander show talk trump di...
1    100am pst400am est know child aka realdonaldtr...
2                                           dumptrump 
3    look bullshit��� topic everybodi talk policebr...
4    basementdwel live car home thank work amp supp...
5                                       foxnewssunday 
6    10 emot abus tactic trump blatantli use first ...
7    nytim illeg reveal trump may taken 950m loss a...
8    must denounc trumpism polit mr ryan disturb se...
9    usa today exclus hundr alleg donald trump ’ pa...
Name: Text, dtype: object

## Pipelining

In [17]:

pipeline_lr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('lr', LogisticRegression()),
])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df['Text'],df['labels'],test_size=0.2,random_state = 0)

In [19]:
model = pipeline_lr.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.4033613445378152

In [20]:
pipeline_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('lr', RandomForestClassifier()),
])

In [21]:
model = pipeline_rf.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.6501766784452297

## Hyper parameter tuning using GridSearch CV


In [22]:
# Create a pipeline
pipeline_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('classifier', RandomForestClassifier()),
])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
#                 {"classifier": [LogisticRegression()],
#                  "classifier__penalty": ['l2','l1'],
#                  "classifier__C": np.logspace(0, 4, 10)
#                  },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipeline_rf, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)



In [23]:
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier',
                 LogisticRegression(C=3593.813663804626, solver='saga'))])
The mean accuracy of the model is: 0.950974930362117
