## ========= Text Classification using Word2Vec/BERT & Neural Net ===========

#### Link: https://www.youtube.com/watch?v=hQwFeIupNP0

In [1]:
use_lemmatizer_stemmer = 1    # includes stopword removal as well; did not improve the accuracy
balance_dataset        = 1    # if some class has less proportion in training dataset, duplicate it sufficient times
use_word2vec           = 1    # if 0, then use the bag of words (but kernel died) # if 2, use BERT embeddings

# regular scikit-learn naive-bayes accuracy                       = 0.70
# regular BOW and MLP (Multi Layer Perceptron) scikit-learn       = kernel died
# regular word2vec and MLP (Multi Layer Perceptron) scikit-learn  = 0.69
# regular word2vec and MLP (Multi Layer Perceptron) scikit-learn  = 0.63  (smaller embedding used for fast convergence)

'''
to use BERT, first please create conda env and install all packages
Link: https://stackoverflow.com/questions/54843067/no-module-named-torch
'''

assert (use_lemmatizer_stemmer  == 1 or use_lemmatizer_stemmer  == 0)
assert (balance_dataset         == 1 or balance_dataset         == 0)
assert (use_word2vec            == 2 or use_word2vec            == 1 or use_word2vec            == 0)

### Reading training data from .csv file

In [2]:
import os
import pandas as pd

__location__ = os.path.realpath(os.path.join(os.getcwd(), "dataset_corona_sentiment/Corona_train.csv"))
df_train = pd.read_csv(__location__, dtype='str')

In [3]:
df_train.head()

Unnamed: 0,ID,Sentiment,CoronaTweet
0,22979,Positive,I see all kinds of academics already whipping ...
1,9880,Negative,@HenrySmithUK can you raise with Boris please ...
2,35761,Negative,It s a confusing odd time for the shopping pub...
3,37968,Positive,Blog Summary: The Impact of COVID-19 on the Ca...
4,19709,Neutral,??????? ??????? ???\r\r\nWaiting in a long Que...


In [4]:
print ("number of rows    :", df_train.shape[0])
print ("number of columns :", df_train.shape[1])
print ("column values     :", list(df_train.columns.values))

print ("\ndistribution of class lebels :", dict(df_train['Sentiment'].value_counts()))
print ("\nfirst row item  :", dict(df_train.iloc[0]))

number of rows    : 37864
number of columns : 3
column values     : ['ID', 'Sentiment', 'CoronaTweet']

distribution of class lebels : {'Positive': 16602, 'Negative': 14166, 'Neutral': 7096}

first row item  : {'ID': '22979', 'Sentiment': 'Positive', 'CoronaTweet': 'I see all kinds of academics already whipping up some #Covid_19 related projects, cfp, syllabi, articles, and blog posts.\r\r\n\r\r\nIÂ\x92m sittin over here browsing all the food left &amp; tryin to figure out when to go back out to the grocery store. Apparently I donÂ\x92t do well in pandemic'}


### Balance Dataset

In [5]:
if balance_dataset == 1:
    df_train_net = df_train[df_train.Sentiment == 'Neutral']
    df_train = pd.concat([df_train, df_train_net])
print ("\ndistribution of class lebels :", dict(df_train['Sentiment'].value_counts()))


distribution of class lebels : {'Positive': 16602, 'Neutral': 14192, 'Negative': 14166}


### Reading validation data from .csv file

In [6]:
import os
import pandas as pd

__location__ = os.path.realpath(os.path.join(os.getcwd(), "dataset_corona_sentiment/Corona_validation.csv"))
df_valid = pd.read_csv(__location__)

In [7]:
print ("number of rows    :", df_valid.shape[0])
print ("number of columns :", df_valid.shape[1])
print ("column values     :", list(df_valid.columns.values))

print ("\ndistribution of class lebels :", dict(df_valid['Sentiment'].value_counts()))
print ("\nfirst row item  :", dict(df_valid.iloc[0]))

number of rows    : 3293
number of columns : 3
column values     : ['ID', 'Sentiment', 'CoronaTweet']

distribution of class lebels : {'Positive': 1444, 'Negative': 1232, 'Neutral': 617}

first row item  : {'ID': 7184, 'Sentiment': 'Negative', 'CoronaTweet': 'I reflected on my own consumer behaviour last week and made this list\r\r\nI confess - as much as I feel bad for people who may lose jobs due to the COVID-19, part of me also wish that unethical businesses will no longer be able to operate "as usual" unless making changes #time4change https://t.co/63lXRFi82N'}


In [8]:
df_valid.head()

Unnamed: 0,ID,Sentiment,CoronaTweet
0,7184,Negative,I reflected on my own consumer behaviour last ...
1,36363,Negative,I know everyone is getting stir crazy but befo...
2,10423,Negative,I haven t seen gas prices this low since I fir...
3,6409,Neutral,Only batmeat left on the supermarket shelves\r...
4,7015,Neutral,"Along with health workers, we need to apprecia..."


### Perform Lematization & Stemming

##### taken from here: https://medium.com/analytics-vidhya/nlp-tutorial-for-text-classification-in-python-8f19cd17b49e

In [9]:
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dishantgoyal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dishantgoyal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/dishantgoyal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/dishantgoyal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [10]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text
 
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    if use_word2vec == 1:
        return a
    else:
        return " ".join(a)

In [11]:
def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
if use_lemmatizer_stemmer == 1:
    df_valid['CleanTweet'] = df_valid['CoronaTweet'].apply(lambda x: finalpreprocess(x))
    df_train['CleanTweet'] = df_train['CoronaTweet'].apply(lambda x: finalpreprocess(x))
    df_valid.head()
    df_train.head()

In [12]:
df_train.head()

Unnamed: 0,ID,Sentiment,CoronaTweet,CleanTweet
0,22979,Positive,I see all kinds of academics already whipping ...,"[see, kind, academic, already, whip, covid, re..."
1,9880,Negative,@HenrySmithUK can you raise with Boris please ...,"[henrysmithuk, raise, boris, please, supermark..."
2,35761,Negative,It s a confusing odd time for the shopping pub...,"[confuse, odd, time, shop, public, store, clos..."
3,37968,Positive,Blog Summary: The Impact of COVID-19 on the Ca...,"[blog, summary, impact, covid, canadian, resid..."
4,19709,Neutral,??????? ??????? ???\r\r\nWaiting in a long Que...,"[wait, long, queue, enter, supermarket, finall..."


### Convert Data to Sentence Embeddings (Word2Vec)

In [13]:
if use_word2vec == 1:
    from gensim.models import KeyedVectors  # Load pre-trained Word2Vec model (e.g., Google News vectors)
    word2vecModel = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

    '''
    wget https://figshare.com/ndownloader/files/10798046 -O GoogleNews-vectors-negative300.bin
    '''

In [14]:
def get_sentence_embedding(tokens):
    vectors = [word2vecModel[word] for word in tokens if word in word2vecModel]
    if vectors:
        return sum(vectors) / len(vectors)  # Average vector
    else:
        return [0] * word2vecModel.vector_size  # Zero vector if no words are in the vocabulary

if use_word2vec == 1:
    df_train['TweetEmbeddings'] = df_train['CleanTweet'].apply(lambda x: get_sentence_embedding(x))
    df_valid['TweetEmbeddings'] = df_valid['CleanTweet'].apply(lambda x: get_sentence_embedding(x))

In [15]:
df_train.head()

Unnamed: 0,ID,Sentiment,CoronaTweet,CleanTweet,TweetEmbeddings
0,22979,Positive,I see all kinds of academics already whipping ...,"[see, kind, academic, already, whip, covid, re...","[0.05482131, 0.030006409, -0.031684287, 0.1578..."
1,9880,Negative,@HenrySmithUK can you raise with Boris please ...,"[henrysmithuk, raise, boris, please, supermark...","[0.040020753, 0.0099823, -0.024865722, 0.10838..."
2,35761,Negative,It s a confusing odd time for the shopping pub...,"[confuse, odd, time, shop, public, store, clos...","[0.0486472, 0.03554862, 0.011557443, 0.1152670..."
3,37968,Positive,Blog Summary: The Impact of COVID-19 on the Ca...,"[blog, summary, impact, covid, canadian, resid...","[-0.08162649, -0.04352078, -0.053622596, 0.042..."
4,19709,Neutral,??????? ??????? ???\r\r\nWaiting in a long Que...,"[wait, long, queue, enter, supermarket, finall...","[-0.100792825, 0.05029656, -0.027210908, 0.156..."


### Convert Data to Sentence Embeddings (BERT)

In [16]:
if use_word2vec == 2:   # using DistilBert for faster embedding generation
    import torch
    from transformers import DistilBertTokenizer, DistilBertModel
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    bertModel = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [17]:
# Function to Generate Embeddings
def get_bert_embeddings(input_string):
    tokens_dict = tokenizer(input_string, return_tensors="pt", padding=True, truncation=True, max_length=64)
    with torch.no_grad():
        outputs = bertModel(**tokens_dict)
    # Extract the last hidden state (CLS token) for sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze()
    return cls_embedding

if use_word2vec == 2:
    df_train['TweetEmbeddings'] = df_train['CleanTweet'].apply(lambda x: get_bert_embeddings(x))
    df_valid['TweetEmbeddings'] = df_valid['CleanTweet'].apply(lambda x: get_bert_embeddings(x))

In [18]:
df_train.head()

Unnamed: 0,ID,Sentiment,CoronaTweet,CleanTweet,TweetEmbeddings
0,22979,Positive,I see all kinds of academics already whipping ...,"[see, kind, academic, already, whip, covid, re...","[0.05482131, 0.030006409, -0.031684287, 0.1578..."
1,9880,Negative,@HenrySmithUK can you raise with Boris please ...,"[henrysmithuk, raise, boris, please, supermark...","[0.040020753, 0.0099823, -0.024865722, 0.10838..."
2,35761,Negative,It s a confusing odd time for the shopping pub...,"[confuse, odd, time, shop, public, store, clos...","[0.0486472, 0.03554862, 0.011557443, 0.1152670..."
3,37968,Positive,Blog Summary: The Impact of COVID-19 on the Ca...,"[blog, summary, impact, covid, canadian, resid...","[-0.08162649, -0.04352078, -0.053622596, 0.042..."
4,19709,Neutral,??????? ??????? ???\r\r\nWaiting in a long Que...,"[wait, long, queue, enter, supermarket, finall...","[-0.100792825, 0.05029656, -0.027210908, 0.156..."


### Converting to Numpy Arrays

In [19]:
import numpy as np

y_train = np.array(list(df_train['Sentiment']))
y_valid = np.array(list(df_valid['Sentiment']))
    
if use_word2vec >= 1:
    x_train_vec = np.array(list(df_train['TweetEmbeddings']))
    x_valid_vec = np.array(list(df_valid['TweetEmbeddings']))

In [20]:
if use_word2vec >= 1:
    #print ("number of data points     :", len(x_valid))
    #print ("number of class labels    :", len(y_valid))
    assert(len(x_train_vec) == len(y_train))
    assert(len(x_valid_vec) == len(y_valid))

### Convert Data to Frequency Vectors (Bag of Words)

##### fed into scikit learn in this form; numpy array

In [21]:
if use_word2vec == 0:
    from sklearn.feature_extraction.text import CountVectorizer
    from wordcloud import WordCloud, STOPWORDS
    stopwords = list(STOPWORDS)

    vectorizer  = CountVectorizer()

    x_train = list(df_train['CleanTweet'])
    x_valid = list(df_valid['CleanTweet'])
    
    x_train_vec = (vectorizer.fit_transform(x_train)).toarray()
    x_valid_vec = vectorizer.transform(x_valid)                   # uses vocab of training data set # for unseen words freq = 0

# x_vec       = (vectorizer.fit_transform(x_train + x_valid)).toarray()
# x_train_vec = x_vec[:len(x_train)]
# x_valid_vec = x_vec[len(x_train): len(x_train) + len(x_valid)]  
# you should not be validation set for training or pre-processing for training

In [22]:
if use_word2vec == 0:
    print (vectorizer.get_feature_names_out()[12345:12355])
    print (x_train_vec)
    print (stopwords[:10])

### MLP Classifier Train

In [23]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(random_state=1, max_iter=300).fit(x_train_vec, y_train)

# number of iter = 300 (epochs), relu, adam, learning rate = 0.001, batch size = 'auto', last activation: softmax, 
# hidden layer = [100], alpha = 0.0001
# setting random state ensures reproducibility



### Predict 

In [24]:
y_valid_pred  = clf.predict(x_valid_vec)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
classes = ['Positive', 'Negative', 'Neutral']
print("PR Report         : \n", classification_report(y_valid, y_valid_pred, labels=classes, zero_division=0))
print("Confusion Matrix  : \n", confusion_matrix(y_valid, y_valid_pred))
print("\nAccuracy        : ", accuracy_score(y_valid, y_valid_pred))

PR Report         : 
               precision    recall  f1-score   support

    Positive       0.72      0.76      0.74      1444
    Negative       0.71      0.68      0.69      1232
     Neutral       0.58      0.55      0.56       617

    accuracy                           0.69      3293
   macro avg       0.67      0.66      0.67      3293
weighted avg       0.69      0.69      0.69      3293

Confusion Matrix  : 
 [[ 838  120  274]
 [ 125  340  152]
 [ 218  130 1096]]

Accuracy        :  0.6905557242635895
