## Import relevant libraries

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import unicodedata
import json
import random
import csv
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
f = open('tweets-2016-10000-textonly.txt', 'r',encoding="utf8")
lines = f.readlines()



In [3]:
tweets = []
labels = []

len_train = 1000

## Get the dataset

In [4]:
with open('IRAhandle_tweets_1.csv', newline='', encoding="utf8") as csvfile:
    categories = csvfile.readline().split(",")
    tweetreader = csv.reader(csvfile, delimiter=',')
    counter = 0
    for row in tweetreader:
        tweet = dict(zip(categories, row))
        if tweet['language'] == 'English':
            tweets.append(tweet['content'])
            labels.append(1)
            counter += 1
        if counter > len_train:
            break
csvfile.close()



for line in lines:
    tweets.append(line)
    labels.append(0)

f.close()
            
tweets_to_labels = dict(zip(tweets, labels))
random.shuffle(tweets)

actual = []

for tweet in tweets:
    actual.append(tweets_to_labels[tweet])
data=pd.DataFrame()
data['Text']=tweets
data['labels']=actual
data

Unnamed: 0,Text,labels
0,This Trump impression could go down as the mos...,0
1,Hillary Clinton acusa a Trump de violar leyes ...,0
2,#IslamKills Why do we destroy ourselves I don'...,1
3,I think @JoeTrippi will agree. Trump behavior ...,0
4,Retweeted Bill Kristol (@BillKristol):\n,0
...,...,...
8968,Donald Trump Is Seen as Helping Push Asian-Ame...,0
8969,like i work with someone who likes trump enoug...,0
8970,Trump Was Apparently Right About the Debate Mi...,0
8971,#TrumpStrong #AmericaFirst #MAGA #PresidentTru...,0


## Exploring the data

In [5]:
data['labels'].value_counts()

0    7972
1    1001
Name: labels, dtype: int64

In [6]:
data.describe(include='all')

Unnamed: 0,Text,labels
count,8973,8973.0
unique,8622,
top,\n,
freq,255,
mean,,0.111557
std,,0.314838
min,,0.0
25%,,0.0
50%,,0.0
75%,,0.0


## Preprocessing the data
- Convsersion to lower case
- Removal Punctuation
- Tokenization of text using word_tokenize
- Removing stop words

In [7]:
df=data.copy()

In [8]:
df['Text'].head()

0    This Trump impression could go down as the mos...
1    Hillary Clinton acusa a Trump de violar leyes ...
2    #IslamKills Why do we destroy ourselves I don'...
3    I think @JoeTrippi will agree. Trump behavior ...
4             Retweeted Bill Kristol (@BillKristol):\n
Name: Text, dtype: object

In [9]:
df['Text']=df['Text'].astype('string')

In [10]:
def preprocess(text):
    text = text.lower()
    
    text_p = "".join([char for char in text if char not in string.punctuation])
    
    words = word_tokenize(text_p)
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
        
    return filtered_words

In [11]:
df['Text'] = df['Text'].apply(lambda x:preprocess(x))

In [12]:
df['Text'].head(10)

0    [trump, impression, could, go, important, thin...
1    [hillary, clinton, acusa, trump, de, violar, l...
2    [islamkills, destroy, dont, get, destroying, p...
3    [think, joetrippi, agree, trump, behavior, man...
4              [retweeted, bill, kristol, billkristol]
5    [joyannreid, nwoga5, special, thanks, yourache...
6    [6, de, reagan, donald, trump, santa, anna, co...
7    [foxnewssunday, chris, shd, trump, apologize, ...
8    [proof, trump, miss, universe, “, fatshaming, ...
9    [bernie, sanders, reacts, leaked, clinton, aud...
Name: Text, dtype: object

In [13]:
porter=PorterStemmer()

def stem(words):
    stem_sentence=[]
    for word in words:
#         stem_words.append(porter.stem(word))
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)
        
#     return stem_words

In [14]:
df['Text'] = df['Text'].apply(lambda x:stem(x))

In [15]:
df['Text'].head(10)

0    trump impress could go import thing alec baldw...
1    hillari clinton acusa trump de violar ley de e...
2    islamkil destroy dont get destroy planet thing...
3    think joetrippi agre trump behavior man consci...
4                    retweet bill kristol billkristol 
5    joyannreid nwoga5 special thank yourachellawr ...
6    6 de reagan donald trump santa anna como metáf...
7    foxnewssunday chri shd trump apolog 4 follow l...
8    proof trump miss univers “ fatsham ” controver...
9    berni sander react leak clinton audio sen bern...
Name: Text, dtype: object

## Pipelining
- Converting Text to Features using count vectorizer and tf-idf
- Train Test Split
- Using Logistic Regression and Random Classifier

In [29]:

pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('sgd', SGDClassifier()),
])

In [30]:
X_train, X_test, y_train, y_test = train_test_split(df['Text'],df['labels'],test_size=0.2,random_state = 0)


In [31]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.7023411371237458