## Import relevant libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import unicodedata
import json
import random
import csv
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
f = open('tweets-2016-10000-textonly.txt', 'r',encoding="utf8")
lines = f.readlines()



In [3]:
tweets = []
labels = []

len_train = 1000

## Get the dataset

In [4]:
with open('IRAhandle_tweets_1.csv', newline='', encoding="utf8") as csvfile:
    categories = csvfile.readline().split(",")
    tweetreader = csv.reader(csvfile, delimiter=',')
    counter = 0
    for row in tweetreader:
        tweet = dict(zip(categories, row))
        if tweet['language'] == 'English':
            tweets.append(tweet['content'])
            labels.append(1)
            counter += 1
        if counter > len_train:
            break
csvfile.close()



for line in lines:
    tweets.append(line)
    labels.append(0)

f.close()
            
tweets_to_labels = dict(zip(tweets, labels))
random.shuffle(tweets)

actual = []

for tweet in tweets:
    actual.append(tweets_to_labels[tweet])
data=pd.DataFrame()
data['Text']=tweets
data['labels']=actual
data

Unnamed: 0,Text,labels
0,Hillary Clinton Offers A Chance To Meet Pusha ...,0
1,"Sur le papier, Hillary Clinton remporte la bat...",0
2,– PoC 72-5\n,0
3,Join Us!!! We need All Of #NewYorkCity ! #Blac...,1
4,Clinton aboga por mantener “presión” para solu...,0
...,...,...
8968,Watch #AlecBaldwin's Perfect #DonaldTrump Impr...,0
8969,Clinton Pretty Much Said That Sanders Supporte...,0
8970,"Twitchy: Inquiring minds want to know, did Hil...",0
8971,@realDonaldTrump Menopause is a bitch isnt it ...,0


## Exploring the data

In [5]:
data['labels'].value_counts()

0    7972
1    1001
Name: labels, dtype: int64

In [6]:
data.describe(include='all')

Unnamed: 0,Text,labels
count,8973,8973.0
unique,8622,
top,\n,
freq,255,
mean,,0.111557
std,,0.314838
min,,0.0
25%,,0.0
50%,,0.0
75%,,0.0


## Preprocessing the data
- Convsersion to lower case
- Removal Punctuation
- Tokenization of text using word_tokenize
- Removing stop words

In [7]:
df=data.copy()

In [8]:
df['Text'].head()

0    Hillary Clinton Offers A Chance To Meet Pusha ...
1    Sur le papier, Hillary Clinton remporte la bat...
2                                         – PoC 72-5\n
3    Join Us!!! We need All Of #NewYorkCity ! #Blac...
4    Clinton aboga por mantener “presión” para solu...
Name: Text, dtype: object

In [9]:
df['Text']=df['Text'].astype('string')

In [10]:
def preprocess(text):
    text = text.lower()
    
    text_p = "".join([char for char in text if char not in string.punctuation])
    
    words = word_tokenize(text_p)
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
        
    return filtered_words

In [11]:
df['Text'] = df['Text'].apply(lambda x:preprocess(x))

In [12]:
df['Text'].head(10)

0    [hillary, clinton, offers, chance, meet, pusha...
1    [sur, le, papier, hillary, clinton, remporte, ...
2                                        [–, poc, 725]
3    [join, us, need, newyorkcity, blacklivesmatter...
4    [clinton, aboga, por, mantener, “, presión, ”,...
5    [nyff, ava, duvernay, talks, racism, rage, tru...
6                 [hillaryclinton, httpstcojuoxoyvajp]
7    [theleadcnn, baskindr, trumps, mic, debacle, 1...
8    [one, million, wanted, justice, would, pay, bl...
9                                   [maga, trumptrain]
Name: Text, dtype: object

In [13]:
porter=PorterStemmer()

def stem(words):
    stem_sentence=[]
    for word in words:
#         stem_words.append(porter.stem(word))
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)
        
#     return stem_words

In [14]:
df['Text'] = df['Text'].apply(lambda x:stem(x))

In [15]:
df['Text'].head(10)

0    hillari clinton offer chanc meet pusha regist ...
1    sur le papier hillari clinton remport la batai...
2                                           – poc 725 
3    join us need newyorkc blacklivesmatt ramarleyg...
4    clinton aboga por manten “ presión ” para solu...
5    nyff ava duvernay talk racism rage trump netfl...
6                   hillaryclinton httpstcojuoxoyvajp 
7    theleadcnn baskindr trump mic debacl 1st presi...
8    one million want justic would pay blacklivesma...
9                                     maga trumptrain 
Name: Text, dtype: object

## Pipelining
- Converting Text to Features using count vectorizer and tf-idf
- Train Test Split
- Using Logistic Regression and Random Classifier

In [18]:
pipeline_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('lr', RandomForestClassifier()),
])

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df['Text'],df['labels'],test_size=0.2,random_state = 0)

In [20]:
model = pipeline_lr.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.6845637583892616