In [14]:
import pandas as pd
import glob
import os.path
import time
import nltk
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, average_precision_score
from nltk.corpus import stopwords
nltk.download('stopwords') 
nltk.download('wordnet') 

[nltk_data] Downloading package stopwords to C:\Users\Chithra
[nltk_data]     Menon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Chithra
[nltk_data]     Menon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
##dir_path = os.path.abspath(os.path.dirname(__file__)) 
##input_path = os.path.join(dir_path,'Dataset')

#Get Russian Troll Data from CSV file
#combine all the csv files into one large csv file of approx 3M rows
path1 = r'C:\Users\Chithra Menon\Downloads\RussianTrolls'
all_trollfiles = glob.glob(os.path.join(path1, "*.csv"))
df_from_each_file = (pd.read_csv(f, dtype='unicode') for f in all_trollfiles)
df = pd.concat(df_from_each_file, ignore_index=True)


#Get Non Russian Troll Data from excel
random_df1 = pd.read_csv(r'C:\Users\Chithra Menon\Downloads\RussianTrolls\RandomTweets\RandomTweets1.csv', usecols=['content', 'Country', 'Tweet language (ISO 639-1)'])
random_df2 = pd.read_csv(r'C:\Users\Chithra Menon\Downloads\RussianTrolls\RandomTweets\RandomTweets2.csv', names = ['Sentiment', 'ID', 'publish_date', 'Flag', 'User', 'content'])
random_df3 = pd.read_csv(r'C:\Users\Chithra Menon\Downloads\RussianTrolls\RandomTweets\RandomTweets3.csv', usecols=['content'])
random_df4 = pd.read_csv(r'C:\Users\Chithra Menon\Downloads\RussianTrolls\RandomTweets\RandomTweets4.csv', usecols=['content'])
random_df5 = pd.read_csv(r'C:\Users\Chithra Menon\Downloads\RussianTrolls\RandomTweets\RandomTweets5.csv', usecols=['content'])


In [16]:
#random_df2 = random_df2[['Sentiment', 'ID', 'publish_date', 'Flag', 'User', 'content']]

len(df.index)

2946207

In [17]:
length = len(random_df1.index) + len(random_df2.index) + len(random_df3.index) + len(random_df4.index) + len(random_df5.index)
print(length)

2336661


In [18]:
#Clean Up the seperate files

#select necessary rows from the dataframe from troll data
df = df[df['language'] == 'English']
df['russian_troll'] = 1
df_svm = df[['content', 'russian_troll']]

#select tweets from US from random file1 and drop other columns
random_df1 = random_df1[random_df1['Tweet language (ISO 639-1)'] == 'en']
random_df1['russian_troll'] = 0
random_df1_svm = random_df1[['content', 'russian_troll']]

#select tweets from random file2 and drop other columns
random_df2['russian_troll'] = 0
random_df2_svm = random_df2[['content', 'russian_troll']]

#select tweets from random file3
random_df3['russian_troll'] = 0
random_df3_svm = random_df3[['content', 'russian_troll']]

#select tweets from random file4
random_df4.columns = ['content']
random_df4['russian_troll'] = 0
random_df4_svm = random_df4[['content', 'russian_troll']]

#Combine the three datasets
svm_df1 = df_svm.append(random_df1_svm, ignore_index = True)
svm_df2 = svm_df1.append(random_df2_svm, ignore_index = True)
svm_df3 = svm_df2.append(random_df3_svm, ignore_index = True)
svm_df = svm_df3.append(random_df4_svm, ignore_index = True)

svm_df.head()
len(svm_df.index)

4400864

In [19]:
#Preprocessing the tweet content in svm_df

#remove URLs from the tweet content
svm_df['content'] = svm_df['content'].str.replace('https?:\/\/.*[\r\n]*', '')

#remove non ASCII characters from tweets
svm_df['content'] = svm_df['content'].str.encode('ascii', 'ignore').str.decode('ascii')

#remove words starting with @username as its not relevant to our classification
svm_df['content'] = svm_df['content'].str.replace('@(.+?)[\s,.;]', '')

#remove words starting with & as they represent HTML character reference 
svm_df['content'] = svm_df['content'].str.replace('&(.+?)[\s,.;]', '')

#remove numerics and special characters except # from the string
svm_df['content'] = svm_df['content'].str.replace('[^a-zA-Z#\s]', '')

In [20]:
#Remove stopwords from svm_df
stop = stopwords.words('english')
pat = r'\b(?:{})\b'.format('|'.join(stop))

svm_df['content_without_stopwords'] = svm_df['content'].str.replace(pat, '')
svm_df['content_without_stopwords'] = svm_df['content_without_stopwords'].str.replace(r'\s+', ' ')

In [21]:
#Tokenizing and Lemmatization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    lemmatized_output = ' '.join(lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text))
    return lemmatized_output


svm_df.fillna('', inplace=True) 

svm_df['text_lemmatized'] = svm_df.content_without_stopwords.apply(lemmatize_text)

#convert tweet into lower case
svm_df['text_lemmatized'] = [entry.lower() for entry in svm_df['text_lemmatized']]

In [22]:
#Drop unnecessary columns

svm_df = svm_df[['text_lemmatized', 'russian_troll']]

svm_df.head()

len(svm_df.index)

4400864

In [74]:
#Training without CV

print('Start time:: '+ time.asctime( time.localtime(time.time())))

#Split data into training and test set

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(svm_df['text_lemmatized'],svm_df['russian_troll'],test_size=0.3)

#Word Vectorization with TF-IDF
Tfidf_vect = TfidfVectorizer(min_df=1, sublinear_tf = True, use_idf = True, ngram_range=(1, 2))
Tfidf_vect.fit(svm_df['text_lemmatized'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


#Use ML Algorithms to predict outcome

#Naive Bayes
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
average_precision_NB = average_precision_score(Test_Y, predictions_NB)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)
print("Naive Bayes Precision-Recall Score -> ", average_precision_NB)

# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.LinearSVC(C=1.0)
SVM.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
average_precision_SVM = average_precision_score(Test_Y, predictions_SVM)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("SVM Precision Score -> ", average_precision_SVM)

print('End time::'+time.asctime( time.localtime(time.time())))

Start time:: Sat Dec  7 23:12:11 2019
Naive Bayes Accuracy Score ->  92.06822898519988
Naive Bayes Precision-Recall Score ->  0.8940514426174226




SVM Accuracy Score ->  94.39322557678032
SVM Precision Score ->  0.9106089871085874
End time::Sat Dec  7 23:30:01 2019


In [12]:
#K-Fold CV with SVM

print('Start time:: '+ time.asctime( time.localtime(time.time())))

#initialize Vectorizer
Tfidf_vect = TfidfVectorizer(sublinear_tf = True, use_idf = True, ngram_range=(1, 2))
Tfidf_vect.fit(svm_df['text_lemmatized'])

#Initialize models
model_naive = naive_bayes.MultinomialNB()
model_svm = svm.LinearSVC(C=1.0)

#Rename for simplicity
X = svm_df['text_lemmatized']
y = svm_df['russian_troll']

#initialize K-Fold to 10
kf = model_selection.KFold(n_splits=10)
kf.get_n_splits(X)

#initialize temporary variables
sum_acc_svm = 0
sum_acc_naive = 0
sum_prec_svm = 0
sum_prec_naive = 0
i = 0

#split the data and CV
for train_index, test_index in kf.split(X):
    
    #split train-test model
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    #Vectorize and transform the data
    Train_X_Tfidf = Tfidf_vect.transform(X_train)
    Test_X_Tfidf = Tfidf_vect.transform(X_test)
    
    Train_X_Tfidf.shape
    Test_X_Tfidf.shape
    
    #Fit the models
    model_svm.fit(Train_X_Tfidf, y_train)
    model_naive.fit(Train_X_Tfidf, y_train)
    
    #Predict on test data
    y_predict_svm = model_svm.predict(Test_X_Tfidf)
    y_predict_naive = model_naive.predict(Test_X_Tfidf)
      
    #Calculate accuracy of models    
    accuracy_svm = accuracy_score(y_test, y_predict_svm) 
    accuracy_naive = accuracy_score(y_test, y_predict_naive) 
    
    #Calculate Precision-Recall score of models
    precision_naive = average_precision_score(y_test, y_predict_naive)
    precision_svm = average_precision_score(y_test, y_predict_svm)
    
    #Add the values into temporary variable to calculate average
    sum_acc_svm += accuracy_svm
    sum_acc_naive += accuracy_naive
    sum_prec_svm += precision_svm
    sum_prec_naive += precision_naive
    i += 1
    
    #Print calculated values for each fold
    print("Accuracy of Naive fold ",i," =", accuracy_naive)
    print("Precision of Naive fold ",i, " = ", precision_naive)
    print("Accuracy of SVM fold ",i," =", accuracy_svm)
    print("Precision of SVM fold ",i, " = ", precision_svm)
    

#calculate mean of all evaluation parameters
mean_accuracy_svm = sum_acc_svm/10   
mean_accuracy_naive = sum_acc_naive/10  
mean_precision_svm = sum_prec_svm/10
mean_precision_naive = sum_prec_naive/10

#Print avergae values across 10 Folds
print("Average Naive accuracy across 10 folds= ", mean_accuracy_naive)    
print("Average SVM accuracy across 10 folds= ", mean_accuracy_svm)   
print("Average Naive precision across 10 folds = ", mean_precision_naive)
print("Average SVM precision across 10 folds = ", mean_precision_svm)

print('End time::'+time.asctime( time.localtime(time.time())))

Start time:: Thu Dec  5 22:39:36 2019




Accuracy of Naive fold  1  = 0.897736129447132
Precision of Naive fold  1  =  1.0
Accuracy of SVM fold  1  = 0.9300479223426277
Precision of SVM fold  1  =  1.0




Accuracy of Naive fold  2  = 0.8750110773551594
Precision of Naive fold  2  =  1.0
Accuracy of SVM fold  2  = 0.9183116065687011
Precision of SVM fold  2  =  1.0




Accuracy of Naive fold  3  = 0.8990472338423994
Precision of Naive fold  3  =  1.0
Accuracy of SVM fold  3  = 0.9375737070170216
Precision of SVM fold  3  =  1.0




Accuracy of Naive fold  4  = 0.6701811232778974
Precision of Naive fold  4  =  1.0
Accuracy of SVM fold  4  = 0.697739310636306
Precision of SVM fold  4  =  1.0




Accuracy of Naive fold  5  = 0.8715569229650568
Precision of Naive fold  5  =  0.9297868664018728
Accuracy of SVM fold  5  = 0.9030644010488859
Precision of SVM fold  5  =  0.9458767319454435


  recall = tps / tps[-1]


Accuracy of Naive fold  6  = 0.8680439732234154
Precision of Naive fold  6  =  nan
Accuracy of SVM fold  6  = 0.9158096372072732
Precision of SVM fold  6  =  nan


  recall = tps / tps[-1]


Accuracy of Naive fold  7  = 0.8878128365819408
Precision of Naive fold  7  =  nan
Accuracy of SVM fold  7  = 0.9257940493449007
Precision of SVM fold  7  =  nan


  recall = tps / tps[-1]


Accuracy of Naive fold  8  = 0.8907599878205623
Precision of Naive fold  8  =  nan
Accuracy of SVM fold  8  = 0.8966792854123967
Precision of SVM fold  8  =  nan


  recall = tps / tps[-1]


Accuracy of Naive fold  9  = 0.9582513417831969
Precision of Naive fold  9  =  nan
Accuracy of SVM fold  9  = 0.9619801584235809
Precision of SVM fold  9  =  nan




Accuracy of Naive fold  10  = 0.9562221929350172
Precision of Naive fold  10  =  nan
Accuracy of SVM fold  10  = 0.9608031157546479
Precision of SVM fold  10  =  nan
Average Naive accuracy across 10 folds=  0.8774622819231779
Average SVM accuracy across 10 folds=  0.9047803193756341
Average Naive precision across 10 folds =  nan
Average SVM precision across 10 folds =  nan
End time::Fri Dec  6 01:30:53 2019


  recall = tps / tps[-1]


In [75]:
#Real time prediction
real_tweets = pd.read_csv(r'C:\Users\Chithra Menon\Downloads\RussianTrolls\RealTweets.csv')

#Preprocessing the tweet content in real_tweets

#remove URLs from the tweet content
real_tweets['content'] = real_tweets['content'].str.replace('https?:\/\/.*[\r\n]*', '')

#remove non ASCII characters from tweets
real_tweets['content'] = real_tweets['content'].str.encode('ascii', 'ignore').str.decode('ascii')

#remove words starting with @username as its not relevant to our classification
real_tweets['content'] = real_tweets['content'].str.replace('@(.+?)[\s,.;]', '')

#remove words starting with & as they represent HTML character reference 
real_tweets['content'] = real_tweets['content'].str.replace('&(.+?)[\s,.;]', '')

#remove numerics and special characters except # from the string
real_tweets['content'] = real_tweets['content'].str.replace('[^a-zA-Z#\s]', '')


#Remove stopwords from svm_df
real_tweets['content_without_stopwords'] = real_tweets['content'].str.replace(pat, '')
real_tweets['content_without_stopwords'] = real_tweets['content_without_stopwords'].str.replace(r'\s+', ' ')

#Tokenizing and Lemmatization
real_tweets.fillna('', inplace=True) 

real_tweets['text_lemmatized'] = real_tweets.content_without_stopwords.apply(lemmatize_text)

#convert tweet into lower case
real_tweets['text_lemmatized'] = [entry.lower() for entry in real_tweets['text_lemmatized']]

#Drop unnecessary columns
real_tweets = real_tweets[['text_lemmatized']]

#Word Vectorization with TF-IDF
Real_X_Tfidf = Tfidf_vect.transform(real_tweets['text_lemmatized'])

#Predict outcome
predictions_SVM = SVM.predict(Real_X_Tfidf)

#print outcomes
print(predictions_SVM)


[0 1 0 1]
   0
0  0
1  1
2  0
3  1
