In [1]:
#Libries to process tweets 
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('stopwords')  
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from string import punctuation 

#Using sk.learn Libraries to test models and then
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split  
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression





[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dhanyamaheswaran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
training_reviews=pd.read_csv("../data/IMDB_Dataset.csv",  encoding='latin-1')


In [3]:
training_reviews.head(5)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
#editing the training review column

text = training_reviews['review'].to_list()
edited_text=[]

#Lowercase tweets
for review in text:
    #convert all letters to a lower case
    edited_review=review.lower()
   
    #removing punctuation 
    
    edited_review=re.sub("([.;:!\'?,\"()\[\]])","", edited_review)

    
    
    #removing the <br / br>
    
    edited_review=re.sub("(<br\s*/><br\s*/>)|(\-)|(\/)", " ", edited_review)
    
    edited_text.append(edited_review)
    

In [5]:
tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
X = tfidfconverter.fit_transform(edited_text).toarray()


In [6]:
y=training_reviews['sentiment'].to_list()

In [7]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
 

In [8]:

sentiment_random_classifier = RandomForestClassifier(n_estimators=90, random_state=0)  
sentiment_random_classifier.fit(X_train, y_train)
 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=90,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [9]:
sentiment_predictions_forest = sentiment_random_classifier.predict(X_test)


In [10]:

 
print(confusion_matrix(y_test,sentiment_predictions_forest))  
print(classification_report(y_test,sentiment_predictions_forest))  
print(accuracy_score(y_test, sentiment_predictions_forest))

[[4292  743]
 [ 838 4127]]
              precision    recall  f1-score   support

    negative       0.84      0.85      0.84      5035
    positive       0.85      0.83      0.84      4965

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

0.8419


In [11]:
sentiment_bayes_classifier = GaussianNB() 
sentiment_bayes_classifier.fit(X_train, y_train)

sentiment_predictions_bayes = sentiment_bayes_classifier.predict(X_test)


In [12]:
 
print(confusion_matrix(y_test,sentiment_predictions_bayes))  
print(classification_report(y_test,sentiment_predictions_bayes))  
print(accuracy_score(y_test, sentiment_predictions_bayes))


[[4136  899]
 [ 917 4048]]
              precision    recall  f1-score   support

    negative       0.82      0.82      0.82      5035
    positive       0.82      0.82      0.82      4965

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

0.8184


In [13]:
#Naive Bayes performed much worse than the Random Forest Algorithm surpirisngly

In [14]:
#Logistic regression, how did that go

sentiment_regression_classifier=LogisticRegression()
sentiment_regression_classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
linear_predictions = sentiment_regression_classifier.predict(X_test)

In [16]:
print(confusion_matrix(y_test,linear_predictions))  
print(classification_report(y_test,linear_predictions))  
print(accuracy_score(y_test, linear_predictions))


[[4371  664]
 [ 564 4401]]
              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      5035
    positive       0.87      0.89      0.88      4965

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

0.8772


In [None]:
#close to 80%, the most accurate. Thus, the decision to use the Logistic regression 

In [17]:
#loading in the Trump_dataset

tweets_df=pd.read_csv("../data/tweets_scraped.csv")
tweets_df

Unnamed: 0.1,Unnamed: 0,id,Created at,Screen Name,Tweet Text,Subject
0,0,1321905314665410560,Thu Oct 29 20:02:41 +0000 2020,RBReich,"When you vote, remember that Trump's golf trip...",Trump
1,1,1321860390100201474,Thu Oct 29 17:04:10 +0000 2020,RealJamesWoods,"This smug little number saying her company, #G...",Trump
2,2,1322424354139828224,Sat Oct 31 06:25:10 +0000 2020,Richielynn8,@RealKiraDavis @realDonaldTrump Lots of Trump ...,Trump
3,3,1322424352923668480,Sat Oct 31 06:25:09 +0000 2020,EChesspiece,@Kusandra Trump will still be your President i...,Trump
4,4,1322424352520900609,Sat Oct 31 06:25:09 +0000 2020,DavidOry,RT @cmclymer: This document is fascinating. Th...,Trump
...,...,...,...,...,...,...
140,181,1322407643437166593,Sat Oct 31 05:18:45 +0000 2020,TheLamarckian,RT @BobeMannBball: Never taking crap from any ...,Andrews
141,182,1322407577351680129,Sat Oct 31 05:18:30 +0000 2020,NanaJMcK,RT @deayton_kerry: @Frank__Davies Fauci congra...,Andrews
142,185,1322406885497040897,Sat Oct 31 05:15:45 +0000 2020,southyarradan,@theheraldsun What is at the essence of good g...,Andrews
143,188,1322404742971105281,Sat Oct 31 05:07:14 +0000 2020,kathryntherese2,@DaveMilbo @19maddie08 @ScottMorrisonMP Did I ...,Andrews


In [18]:
#cleaning up the trump tweets:
tweet_text = tweets_df['Tweet Text'].to_list()
edited_text=[]
sentiment_list=[]

#Lowercase tweets
for tweet in range (0, len(tweet_text)):
    #lower case
    #removing speocial cahracters
    edited_tweet = re.sub(r'\W', ' ', str(tweet_text[tweet]))
 
    # remove sinflue
    edited_tweet = re.sub(r'\s+[a-zA-Z]\s+', ' ', edited_tweet)
 
    # Remove single characters from the start
    edited_tweet = re.sub(r'\^[a-zA-Z]\s+', ' ', edited_tweet) 
 
    # Substituting multiple spaces with single space
    edited_tweet= re.sub(r'\s+', ' ', edited_tweet, flags=re.I)
 
    # Removing prefixed 'b'
    edited_tweet = re.sub(r'^b\s+', '', edited_tweet)
    
    #removing stopwords tweets
    
    
    edited_tweet=edited_tweet.lower()
    
    edited_text.append(edited_tweet) 
    
    sentiment_trump = sentiment_regression_classifier.predict(tfidfconverter.transform([edited_tweet]).toarray())
    sentiment_list.append(sentiment_trump)

In [19]:

sentiment_list_edited=[]

for item in sentiment_list:
    if "positive" in item:
        sentiment_list_edited.append("positive")
    else:
        sentiment_list_edited.append("negative")

In [20]:
#Adding the list to the dataframe 

tweets_df["sentiment"]=sentiment_list_edited


In [21]:
tweets_df.to_csv("../data/tweets_classified.csv")

In [22]:
tweets_df

Unnamed: 0.1,Unnamed: 0,id,Created at,Screen Name,Tweet Text,Subject,sentiment
0,0,1321905314665410560,Thu Oct 29 20:02:41 +0000 2020,RBReich,"When you vote, remember that Trump's golf trip...",Trump,positive
1,1,1321860390100201474,Thu Oct 29 17:04:10 +0000 2020,RealJamesWoods,"This smug little number saying her company, #G...",Trump,negative
2,2,1322424354139828224,Sat Oct 31 06:25:10 +0000 2020,Richielynn8,@RealKiraDavis @realDonaldTrump Lots of Trump ...,Trump,negative
3,3,1322424352923668480,Sat Oct 31 06:25:09 +0000 2020,EChesspiece,@Kusandra Trump will still be your President i...,Trump,positive
4,4,1322424352520900609,Sat Oct 31 06:25:09 +0000 2020,DavidOry,RT @cmclymer: This document is fascinating. Th...,Trump,positive
...,...,...,...,...,...,...,...
140,181,1322407643437166593,Sat Oct 31 05:18:45 +0000 2020,TheLamarckian,RT @BobeMannBball: Never taking crap from any ...,Andrews,negative
141,182,1322407577351680129,Sat Oct 31 05:18:30 +0000 2020,NanaJMcK,RT @deayton_kerry: @Frank__Davies Fauci congra...,Andrews,negative
142,185,1322406885497040897,Sat Oct 31 05:15:45 +0000 2020,southyarradan,@theheraldsun What is at the essence of good g...,Andrews,positive
143,188,1322404742971105281,Sat Oct 31 05:07:14 +0000 2020,kathryntherese2,@DaveMilbo @19maddie08 @ScottMorrisonMP Did I ...,Andrews,positive
