In [1]:
import string
import re
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_csv('data/finalSentimentdata2.csv')
df = df.rename({'Unnamed: 0': 'id'}, axis=1).set_index('id')
df

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3204,sad,agree the poor in india are treated badly thei...
1431,joy,if only i could have spent the with this cutie...
654,joy,will nature conservation remain a priority in ...
2530,sad,coronavirus disappearing in italy show this to...
2296,sad,uk records lowest daily virus death toll since...
...,...,...
2579,sad,today at 02 30pm a 54 year old bangladeshi mal...
3579,anger,corona virus i implore that you cease activity...
221,joy,issa date once lockdown ends inshaallah (and c...
2705,sad,the death toll due to covid 19 rose to 31 in j...


In [3]:
sadjoy = df[df['sentiment'].isin(['sad', 'joy'])]
sadjoy

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3204,sad,agree the poor in india are treated badly thei...
1431,joy,if only i could have spent the with this cutie...
654,joy,will nature conservation remain a priority in ...
2530,sad,coronavirus disappearing in italy show this to...
2296,sad,uk records lowest daily virus death toll since...
...,...,...
2194,joy,it was tough to see you go brother excellent 6...
2579,sad,today at 02 30pm a 54 year old bangladeshi mal...
221,joy,issa date once lockdown ends inshaallah (and c...
2705,sad,the death toll due to covid 19 rose to 31 in j...


In [4]:
lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

In [5]:
def preprocessor(row: str):
    row = re.sub("[@#][A-Za-z0-9]+", "", row)
    row = re.sub(f"[{string.punctuation}]", "", row)
    row = row.lower()
    row = [lemmatizer.lemmatize(word) for word in word_tokenize(row) if lemmatizer.lemmatize(word) not in stopwords]
    return " ".join(row)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(sadjoy['text'], sadjoy['sentiment'])

X_train = X_train.apply(preprocessor)
X_test = X_test.apply(preprocessor)

In [7]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 3), (2, 2), (3, 3)),
}

grid_search = GridSearchCV(pipeline, parameters)
grid_search.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search.best_score_)
bs = grid_search.best_estimator_.get_params()['vect__ngram_range']
bs

Best score: 0.824


(1, 1)

In [8]:
vectorizer = CountVectorizer(ngram_range=bs)
v_X_train = vectorizer.fit_transform(X_train)

clf = MultinomialNB()
clf.fit(v_X_train, y_train)

pred = clf.predict(vectorizer.transform(X_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

         joy       0.78      0.82      0.80       167
         sad       0.85      0.82      0.84       214

    accuracy                           0.82       381
   macro avg       0.82      0.82      0.82       381
weighted avg       0.82      0.82      0.82       381



In [9]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 3)),
    'vect__max_df': (0.25, 0.5, 0.75, 1.0),
    'vect__min_df': (0.01, 0.02, 0.03, 0.04),
    'vect__max_features': (400, 500, 600, 700, 800, 900, 1000, 1141),
}

grid_search = GridSearchCV(pipeline, parameters)
grid_search.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search.best_score_)

Best score: 0.757


In [10]:
grid_search.best_estimator_.get_params()['vect']

In [11]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.25, max_features=400, min_df=0.01)
v_X_train = vectorizer.fit_transform(X_train)

clf = MultinomialNB()
clf.fit(v_X_train, y_train)

pred = clf.predict(vectorizer.transform(X_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

         joy       0.78      0.74      0.76       186
         sad       0.77      0.81      0.79       195

    accuracy                           0.77       381
   macro avg       0.77      0.77      0.77       381
weighted avg       0.77      0.77      0.77       381



In [12]:
newdf = pd.read_csv('data/covid19_tweets.csv')
newdf['text'] = newdf['text'].apply(preprocessor)
newdf

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,smelled scent hand sanitizers today someone pa...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,hey wouldnt made sense player pay respect a… h...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,trump never claimed wa hoax claim effort to… h...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,one gift ha give appreciation simple thing alw...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 july medium bulletin novel … httpstcomn0eec...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
179103,AJIMATI AbdulRahman O.,"Ilorin, Nigeria",Animal Scientist|| Muslim|| Real Madrid/Chelsea,2013-12-30 18:59:19,412,1609,1062,False,2020-08-29 19:44:21,thanks nominating challenge nominate … httpstc...,['WearAMask'],Twitter for Android,False
179104,Jason,Ontario,When your cat has more baking soda than Ninja ...,2011-12-21 04:41:30,150,182,7295,False,2020-08-29 19:44:16,2020 year insanity lol httpstcoy48np0yzgn,['COVID19'],Twitter for Android,False
179105,BEEHEMOTH ⏳,🇨🇦 Canada,⚒️ The Architects of Free Trade ⚒️ Really Did ...,2016-07-13 17:21:59,1623,2160,98000,False,2020-08-29 19:44:15,powerful painting juan lucena tribute grandpar...,,Twitter Web App,False
179106,Gary DelPonte,New York City,"Global UX UI Visual Designer. StoryTeller, Mus...",2009-10-27 17:43:13,1338,1111,0,False,2020-08-29 19:44:14,1200 student test positive major university ab...,['COVID19'],Twitter for iPhone,False


In [13]:
pred = clf.predict(vectorizer.transform(newdf))

In [14]:
sad, joy = 0, 0
for elem in pred:
    if elem == 'sad':
        sad += 1
    else:
        joy += 1

print('joy:', joy, f'\t{joy / (joy + sad) * 100} %')
print('sad:', sad, f'\t{sad / (joy + sad) * 100} %')

joy: 0 	0.0 %
sad: 13 	100.0 %


In [15]:
from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer

nltk.download('movie_reviews', quiet=True)

True

In [16]:
tb = Blobber(analyzer=NaiveBayesAnalyzer())

res = newdf['text'].apply(lambda x: tb(x).sentiment.classification)
res.value_counts(normalize=True)

text
pos    0.702682
neg    0.297318
Name: proportion, dtype: float64