## Import relevant libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import unicodedata
import json
import random
import csv
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
f = open('tweets-2016-10000-textonly.txt', 'r',encoding="utf8")
lines = f.readlines()



In [3]:
tweets = []
labels = []

len_train = 1000

## Get the dataset

In [4]:
with open('IRAhandle_tweets_1.csv', newline='', encoding="utf8") as csvfile:
    categories = csvfile.readline().split(",")
    tweetreader = csv.reader(csvfile, delimiter=',')
    counter = 0
    for row in tweetreader:
        tweet = dict(zip(categories, row))
        if tweet['language'] == 'English':
            tweets.append(tweet['content'])
            labels.append(1)
            counter += 1
        if counter > len_train:
            break
csvfile.close()



for line in lines:
    tweets.append(line)
    labels.append(0)

f.close()
            
tweets_to_labels = dict(zip(tweets, labels))
random.shuffle(tweets)

actual = []

for tweet in tweets:
    actual.append(tweets_to_labels[tweet])
data=pd.DataFrame()
data['Text']=tweets
data['labels']=actual
data

Unnamed: 0,Text,labels
0,TheEconomist: Election brief: Assessing the ca...,0
1,BREAKING: Trump Caught LYING On Medical Record...,0
2,"If Trump thinks debate prep is for chumps, his...",0
3,Oh my god @realDonaldTrump Hillary Clinton's s...,0
4,#OdavelyMusic WATCH: Trump Launches Personal ...,0
...,...,...
8968,Donald Trump on tax 'bombshell': 'I know compl...,0
8969,"When #Trump ""compliments"" @IvankaTrump about h...",0
8970,Trump surrogate blames Trump being able to tak...,0
8971,Incredible moment dog is saved from rubble aft...,1


## Exploring the data

In [5]:
data['labels'].value_counts()

0    7972
1    1001
Name: labels, dtype: int64

In [6]:
data.describe(include='all')

Unnamed: 0,Text,labels
count,8973,8973.0
unique,8622,
top,\n,
freq,255,
mean,,0.111557
std,,0.314838
min,,0.0
25%,,0.0
50%,,0.0
75%,,0.0


## Preprocessing the data
- Convsersion to lower case
- Removal Punctuation
- Tokenization of text using word_tokenize
- Removing stop words

In [7]:
df=data.copy()

In [8]:
df['Text'].head()

0    TheEconomist: Election brief: Assessing the ca...
1    BREAKING: Trump Caught LYING On Medical Record...
2    If Trump thinks debate prep is for chumps, his...
3    Oh my god @realDonaldTrump Hillary Clinton's s...
4    #OdavelyMusic WATCH:  Trump Launches Personal ...
Name: Text, dtype: object

In [9]:
df['Text']=df['Text'].astype('string')

In [10]:
def preprocess(text):
    text = text.lower()
    
    text_p = "".join([char for char in text if char not in string.punctuation])
    
    words = word_tokenize(text_p)
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
        
    return filtered_words

In [11]:
df['Text'] = df['Text'].apply(lambda x:preprocess(x))

In [12]:
df['Text'].head(10)

0    [theeconomist, election, brief, assessing, can...
1    [breaking, trump, caught, lying, medical, reco...
2    [trump, thinks, debate, prep, chumps, advisers...
3    [oh, god, realdonaldtrump, hillary, clintons, ...
4    [odavelymusic, watch, trump, launches, persona...
5            [trump, supporter, tries, reason, w, blm]
6                 [wise, thoughts, httpstcojwsoea6ad9]
7    [report, fbi, turns, director, comey, comeys, ...
8    [republicans, enemy, trump, supporters, deplor...
9                             [lasttimetrumppaidtaxes]
Name: Text, dtype: object

In [13]:
porter=PorterStemmer()

def stem(words):
    stem_sentence=[]
    for word in words:
#         stem_words.append(porter.stem(word))
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)
        
#     return stem_words

In [14]:
df['Text'] = df['Text'].apply(lambda x:stem(x))

In [15]:
df['Text'].head(10)

0    theeconomist elect brief assess candid complet...
1    break trump caught lie medic record hide big h...
2    trump think debat prep chump advis ’ save danb...
3    oh god realdonaldtrump hillari clinton state d...
4    odavelymus watch trump launch person attack hi...
5                      trump support tri reason w blm 
6                     wise thought httpstcojwsoea6ad9 
7    report fbi turn director comey comey tie clint...
8    republican enemi trump support deplor berni su...
9                                lasttimetrumppaidtax 
Name: Text, dtype: object

## Pipelining
- Converting Text to Features using count vectorizer and tf-idf
- Train Test Split
- Using Logistic Regression and Random Forest Classifier

In [19]:
pipeline_lr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('lr', LogisticRegression()),
])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df['Text'],df['labels'],test_size=0.2,random_state = 0)

In [21]:
model = pipeline_lr.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.5121951219512195