## ================== Naive Bayes for Text Classification =============

##### Initial vocab consists of both the training and validation words
##### Unline our previous implementation, where we only trained using training words & use laplace smoothining for remaining unknown words

In [1]:
use_bigrams_stopwords  = 0    # could not verify this # however, just removing stopwords had no affect on the accuracy
use_word2vec           = 0
use_lemmatizer_stemmer = 0    # did not improve the accuracy
balance_dataset        = 1    # if some class has less proportion in training dataset, duplicate it sufficient times

# regular scikit-learn accuracy = 0.70
# + balance_dataset             = 0.71

assert (use_bigrams_stopwords   == 1 or use_bigrams_stopwords   == 0)
assert (use_lemmatizer_stemmer  == 1 or use_lemmatizer_stemmer  == 0)
assert (use_word2vec            == 1 or use_word2vec            == 0)
assert (balance_dataset == 1 or balance_dataset == 0)

### Reading training data from .csv file

In [2]:
import os
import pandas as pd

__location__ = os.path.realpath(os.path.join(os.getcwd(), "dataset_corona_sentiment/Corona_train.csv"))
df_train = pd.read_csv(__location__)

In [3]:
df_train.head()

Unnamed: 0,ID,Sentiment,CoronaTweet
0,22979,Positive,I see all kinds of academics already whipping ...
1,9880,Negative,@HenrySmithUK can you raise with Boris please ...
2,35761,Negative,It s a confusing odd time for the shopping pub...
3,37968,Positive,Blog Summary: The Impact of COVID-19 on the Ca...
4,19709,Neutral,??????? ??????? ???\r\r\nWaiting in a long Que...


In [4]:
print ("number of rows    :", df_train.shape[0])
print ("number of columns :", df_train.shape[1])
print ("column values     :", list(df_train.columns.values))

print ("\ndistribution of class lebels :", dict(df_train['Sentiment'].value_counts()))
print ("\nfirst row item  :", dict(df_train.iloc[0]))

number of rows    : 37864
number of columns : 3
column values     : ['ID', 'Sentiment', 'CoronaTweet']

distribution of class lebels : {'Positive': 16602, 'Negative': 14166, 'Neutral': 7096}

first row item  : {'ID': 22979, 'Sentiment': 'Positive', 'CoronaTweet': 'I see all kinds of academics already whipping up some #Covid_19 related projects, cfp, syllabi, articles, and blog posts.\r\r\n\r\r\nIÂ\x92m sittin over here browsing all the food left &amp; tryin to figure out when to go back out to the grocery store. Apparently I donÂ\x92t do well in pandemic'}


### Balance Dataset

In [5]:
if balance_dataset == 1:
    df_train_net = df_train[df_train.Sentiment == 'Neutral']
    df_train = pd.concat([df_train, df_train_net])
print ("\ndistribution of class lebels :", dict(df_train['Sentiment'].value_counts()))


distribution of class lebels : {'Positive': 16602, 'Neutral': 14192, 'Negative': 14166}


### Converting to Lists

In [6]:
y_train = list(df_train['Sentiment'])
x_train = list(df_train['CoronaTweet'])

In [7]:
#print ("number of data points     :", len(x_train))
#print ("number of class labels    :", len(y_train))
assert(len(x_train) == len(y_train))

### Reading validation data from .csv file

In [8]:
import os
import pandas as pd

__location__ = os.path.realpath(os.path.join(os.getcwd(), "dataset_corona_sentiment/Corona_validation.csv"))
df_valid = pd.read_csv(__location__)

In [9]:
print ("number of rows    :", df_valid.shape[0])
print ("number of columns :", df_valid.shape[1])
print ("column values     :", list(df_valid.columns.values))

print ("\ndistribution of class lebels :", dict(df_valid['Sentiment'].value_counts()))
print ("\nfirst row item  :", dict(df_valid.iloc[0]))

number of rows    : 3293
number of columns : 3
column values     : ['ID', 'Sentiment', 'CoronaTweet']

distribution of class lebels : {'Positive': 1444, 'Negative': 1232, 'Neutral': 617}

first row item  : {'ID': 7184, 'Sentiment': 'Negative', 'CoronaTweet': 'I reflected on my own consumer behaviour last week and made this list\r\r\nI confess - as much as I feel bad for people who may lose jobs due to the COVID-19, part of me also wish that unethical businesses will no longer be able to operate "as usual" unless making changes #time4change https://t.co/63lXRFi82N'}


In [10]:
df_valid.head()

Unnamed: 0,ID,Sentiment,CoronaTweet
0,7184,Negative,I reflected on my own consumer behaviour last ...
1,36363,Negative,I know everyone is getting stir crazy but befo...
2,10423,Negative,I haven t seen gas prices this low since I fir...
3,6409,Neutral,Only batmeat left on the supermarket shelves\r...
4,7015,Neutral,"Along with health workers, we need to apprecia..."


### Perform Lematization & Stemming

##### taken from here: https://medium.com/analytics-vidhya/nlp-tutorial-for-text-classification-in-python-8f19cd17b49e

In [11]:
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dishantgoyal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dishantgoyal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [12]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text
 
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [13]:
def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
if use_lemmatizer_stemmer == 1:
    df_valid['CleanTweet'] = df_valid['CoronaTweet'].apply(lambda x: finalpreprocess(x))
    df_train['CleanTweet'] = df_train['CoronaTweet'].apply(lambda x: finalpreprocess(x))
    df_valid.head()
    df_train.head()

### Converting to Lists

In [14]:
y_valid = list(df_valid['Sentiment'])
x_valid = list(df_valid['CoronaTweet'])

In [15]:
#print ("number of data points     :", len(x_valid))
#print ("number of class labels    :", len(y_valid))
assert(len(x_valid) == len(y_valid))

### Convert Data to Frequency Vectors (Bag of Words or Word2Vec)

##### fed into scikit learn in this form; numpy array

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS
stopwords = list(STOPWORDS)

if use_bigrams_stopwords == 1:   # consider only words with frequency > 3; else RAM will be done for
    vectorizer  = CountVectorizer(analyzer='word', ngram_range=(1, 2), stop_words=stopwords, min_df=3)  
elif use_word2vec == 1:
    vectorizer  = CountVectorizer()
else:
    vectorizer  = CountVectorizer()
    
x_vec       = (vectorizer.fit_transform(x_train + x_valid)).toarray()

x_train_vec = x_vec[:len(x_train)]
x_valid_vec = x_vec[len(x_train): len(x_train) + len(x_valid)]

In [17]:
print (vectorizer.get_feature_names_out()[12345:12355])
print (x_train_vec)
print (stopwords[:10])

['bojggl5fi1' 'bojo' 'bojos' 'bokadia_vinita' 'bokakhat' 'bokamotoespn'
 'boko' 'bokoharam' 'boksburg' 'bokuto']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['few', 'and', 'her', "i'd", 'between', 'off', 'therefore', 'has', 'k', 'nor']


### Train Multinomial Naive Bayes

In [18]:
import numpy as np

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()                             #clf means classification
clf.fit(x_train_vec, y_train)

print(clf.predict(x_train_vec[2:5]))

['Positive' 'Neutral' 'Neutral']


### Predict 

In [19]:
x_valid_vec   = x_vec[len(x_train): len(x_train) + len(x_valid)]
y_valid_pred  = clf.predict(x_valid_vec)

In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
classes = ['Positive', 'Negative', 'Neutral']
print("PR Report         : \n", classification_report(y_valid, y_valid_pred, labels=classes, zero_division=0))
print("Confusion Matrix  : \n", confusion_matrix(y_valid, y_valid_pred))
print("\nAccuracy        : ", accuracy_score(y_valid, y_valid_pred))

PR Report         : 
               precision    recall  f1-score   support

    Positive       0.74      0.76      0.75      1444
    Negative       0.73      0.74      0.74      1232
     Neutral       0.61      0.54      0.57       617

    accuracy                           0.71      3293
   macro avg       0.69      0.68      0.69      3293
weighted avg       0.71      0.71      0.71      3293

Confusion Matrix  : 
 [[ 913   90  229]
 [ 119  332  166]
 [ 216  126 1102]]

Accuracy        :  0.7127239599149712
