In [1]:
import string
import re
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\darin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_csv('data/finalSentimentdata2.csv')
df = df.rename({'Unnamed: 0': 'id'}, axis=1).set_index('id')
df

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3204,sad,agree the poor in india are treated badly thei...
1431,joy,if only i could have spent the with this cutie...
654,joy,will nature conservation remain a priority in ...
2530,sad,coronavirus disappearing in italy show this to...
2296,sad,uk records lowest daily virus death toll since...
...,...,...
2579,sad,today at 02 30pm a 54 year old bangladeshi mal...
3579,anger,corona virus i implore that you cease activity...
221,joy,issa date once lockdown ends inshaallah (and c...
2705,sad,the death toll due to covid 19 rose to 31 in j...


In [3]:
sadjoy = df[df['sentiment'].isin(['sad', 'joy'])]
sadjoy

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3204,sad,agree the poor in india are treated badly thei...
1431,joy,if only i could have spent the with this cutie...
654,joy,will nature conservation remain a priority in ...
2530,sad,coronavirus disappearing in italy show this to...
2296,sad,uk records lowest daily virus death toll since...
...,...,...
2194,joy,it was tough to see you go brother excellent 6...
2579,sad,today at 02 30pm a 54 year old bangladeshi mal...
221,joy,issa date once lockdown ends inshaallah (and c...
2705,sad,the death toll due to covid 19 rose to 31 in j...


In [4]:
lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))

In [5]:
def preprocessor(row: str):
    row = re.sub("[@#][A-Za-z0-9]+", "", row)
    row = re.sub(f"[{string.punctuation}]", "", row)
    row = row.lower()
    row = [lemmatizer.lemmatize(word) for word in word_tokenize(row) if lemmatizer.lemmatize(word) not in stopwords]
    return " ".join(row)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(sadjoy['text'], sadjoy['sentiment'])

X_train = X_train.apply(preprocessor)
X_test = X_test.apply(preprocessor)

In [7]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 3), (2, 2), (3, 3)),
}

grid_search = GridSearchCV(pipeline, parameters)
grid_search.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search.best_score_)
bs = grid_search.best_estimator_.get_params()['vect__ngram_range']
bs

Best score: 0.824


(1, 1)

In [8]:
vectorizer = CountVectorizer(ngram_range=bs)
v_X_train = vectorizer.fit_transform(X_train)

clf = MultinomialNB()
clf.fit(v_X_train, y_train)

pred = clf.predict(vectorizer.transform(X_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

         joy       0.78      0.82      0.80       167
         sad       0.85      0.82      0.84       214

    accuracy                           0.82       381
   macro avg       0.82      0.82      0.82       381
weighted avg       0.82      0.82      0.82       381



In [None]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 3)),
    'vect__max_df': (0.25, 0.5, 0.75, 1.0),
    'vect__min_df': (0.01, 0.02, 0.03, 0.04),
    'vect__max_features': (400, 500, 600, 700, 800, 900, 1000, 1141),
}

grid_search = GridSearchCV(pipeline, parameters)
grid_search.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search.best_score_)

In [None]:
grid_search.best_estimator_.get_params()['vect']

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.25, max_features=400, min_df=0.01)
v_X_train = vectorizer.fit_transform(X_train)

clf = MultinomialNB()
clf.fit(v_X_train, y_train)

pred = clf.predict(vectorizer.transform(X_test))
print(classification_report(pred, y_test))

In [None]:
newdf = pd.read_csv('data/covid19_tweets.csv')
newdf['text'] = newdf['text'].apply(preprocessor)
newdf

In [None]:
pred = clf.predict(vectorizer.transform(newdf))

In [None]:
sad, joy = 0, 0
for elem in pred:
    if elem == 'sad':
        sad += 1
    else:
        joy += 1

print('joy:', joy, f'\t{joy / (joy + sad) * 100} %')
print('sad:', sad, f'\t{sad / (joy + sad) * 100} %')

In [None]:
from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer

nltk.download('movie_reviews', quiet=True)

In [None]:
tb = Blobber(analyzer=NaiveBayesAnalyzer())

res = newdf['text'].apply(lambda x: tb(x).sentiment.classification)
res.value_counts(normalize=True)