# Assignment Two:  Sentiment Classification

For this exercise you will be using the "SemEval 2017 task 4" corpus provided on the module website, available through the following link: https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs918/semeval-tweets.tar.bz2 You will focus particularly on Subtask A, i.e. classifying the overall sentiment of a tweet as positive, negative or neutral.

You are requested to produce a standalone Python program or Jupyter notebook for coursework submission. The input to your program is the SemEval data downloaded. Note that TAs need to run your program on their own machine by using the original SemEval data. As such, don’t submit a Python program that takes as input some preprocessed files.

# Important library import
You may import more packages here.

In [18]:
# importing all the required packages
import re
import nltk
from io import open
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
lm = nltk.stem.WordNetLemmatizer()
tokenizr = TweetTokenizer()
from sklearn import metrics
from sklearn.naive_bayes import  MultinomialNB,GaussianNB,ComplementNB
from sklearn.linear_model import SGDClassifier
import string, random, time, math
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
#from IPython.display import clear_output
from collections import Counter
from os.path import join

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
# Instantiates the device to be used as GPU/CPU based on availability
device_gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package wordnet to /Users/Gaurav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Gaurav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Gaurav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import all the machine-learning helper functions
from the ml_preprocessing file

In [19]:
# import all the functions/modules from ml_preprocessing file
from ml_preprocessing import *

## Load all the data sets and GloVe embeddings

In [20]:
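# Prefix that is prepended to the GloVe file name below; set it (with a trailing
# separator, e.g. "data/") if the GloVe file lives in another directory.
# The SemEval files themselves are read from the 'semeval-tweets' directory.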
path = ""

In [21]:
# Load the training set.
# The file is read as tab-separated text without a header row; the three columns
# are named 'id', 'sentiment' and 'text'.
# NOTE(review): read_csv applies its default quote handling here, so a tweet that
# contains a double quote may be parsed differently from the plain
# line.split('\t') used by read_test - confirm the row count matches the file.
train_df = pd.read_csv(join('semeval-tweets','twitter-training-data.txt'),\
                       names = ['id','sentiment','text'], sep='\t', header=None,encoding='utf8')


In [22]:
# Define test sets
testsets = ['twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt']

In [23]:
# Load Glove embeddings
# Maps each GloVe token to its embedding vector (a float32 numpy array).
embeddings_index = {}
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so the file loads the same way on every machine.
with open(path+'glove.twitter.27B.100d.txt', 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embeddings_index[word] = vectors
# No explicit f.close(): the with-statement already closed the file.

In [24]:
# Skeleton: Evaluation code for the test sets
def read_test(testset):
    '''
    Read a test set file and return its ground-truth labels keyed by tweet id.

    Every non-blank line is split on tabs; the first column is taken as the
    tweet id and the second as the ground-truth sentiment. Any further
    columns (e.g. the tweet text) are ignored.

    :param testset: str, the file name of the testset to read
    :return: dict mapping tweet id (str) to ground-truth sentiment (str)
    :raises IndexError: if a non-blank line has fewer than two columns
    '''
    id_gts = {}
    with open(testset, 'r', encoding='utf8') as fh:
        for line in fh:
            # Strip the line terminator first so a two-column line does not
            # leave a trailing newline glued to the label.
            line = line.rstrip('\n')
            if not line.strip():
                # Blank lines (for instance a trailing one) carry no label.
                continue
            fields = line.split('\t')
            id_gts[fields[0]] = fields[1]

    return id_gts


def confusion(id_preds, testset, classifier):
    '''
    Print the confusion matrix over {'positive', 'negative', 'neutral'}
    between the predictions and the testset.

    Rows are predicted labels and columns are ground-truth labels. Each cell
    is the count for that (prediction, ground truth) pair divided by the
    total of its row; a row without any prediction is printed as zeros.
    Tweets that have no entry in id_preds are counted as 'neutral'.

    :param id_preds: a dictionary of predictions formatted as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :param classifier: str, the name of the classifier (currently not used)
    :raises KeyError: if a prediction or ground-truth label is not one of the
        three sentiments
    '''
    id_gts = read_test(testset)

    # Fixed label order for both rows and columns. (A former loop derived the
    # labels from the data but its result was immediately overwritten.)
    gts = ['positive', 'negative', 'neutral']

    conf = {pred_label: {gt_label: 0 for gt_label in gts} for pred_label in gts}

    for tweetid, gt in id_gts.items():
        pred = id_preds.get(tweetid, 'neutral')
        conf[pred][gt] += 1

    print(''.ljust(12) + '  '.join(gts))

    for c1 in gts:
        row_total = sum(conf[c1].values())
        print(c1.ljust(12), end='')
        for c2 in gts:
            if row_total > 0:
                print('%.3f     ' % (conf[c1][c2] / float(row_total)), end='')
            else:
                print('0.000     ', end='')
        print('')

    print('')


def evaluate(id_preds, testset, classifier):
    '''
    Print the macro-averaged F1 score over {'positive', 'negative'} between
    the predictions and the testset.

    The 'neutral' class still takes part in the counting (a wrong 'neutral'
    prediction is a false negative for the true class), but only the F1
    scores of 'positive' and 'negative' are averaged. Tweets that have no
    entry in id_preds are counted as 'neutral'.

    :param id_preds: a dictionary of predictions formatted as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :param classifier: str, the name of the classifier (used in the printed line)
    :raises KeyError: if a prediction or ground-truth label is not one of the
        three sentiments
    '''
    id_gts = read_test(testset)

    acc_by_class = {
        label: {'tp': 0, 'fp': 0, 'fn': 0}
        for label in ['positive', 'negative', 'neutral']
    }

    for tweetid, gt in id_gts.items():
        pred = id_preds.get(tweetid, 'neutral')

        if gt == pred:
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    # Only the per-class F1 of 'positive' and 'negative' feeds the printed
    # score. The former micro/macro bookkeeping was never used and divided by
    # zero when nothing was predicted correctly, so it is not computed.
    f1_sum = 0
    for cat in ['positive', 'negative']:
        acc = acc_by_class[cat]

        p = 0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        f1_sum += f1

    semevalmacrof1 = f1_sum / 2

    print(testset + ' (' + classifier + '): %.3f' % semevalmacrof1)

# LSTM functions

In [25]:
# Tokenize a tweet, drop stop words (negations are kept) and lemmatize the rest.
_NEGATION_WORDS = frozenset(['not', 'no', 'nor'])
_stop_word_cache = {}


def _stop_words_without_negations():
    """Return the English stop words minus the negation words.

    The set is built on first use and then reused, so the stop-word list is
    not reloaded for every tweet.
    """
    if 'english' not in _stop_word_cache:
        _stop_word_cache['english'] = frozenset(stopwords.words('english')) - _NEGATION_WORDS
    return _stop_word_cache['english']


def toknz_lematz(twts):
    """Tokenize ``twts`` and return the lemmatized tokens that are not stop words.

    :param twts: str, the (already cleaned and lower-cased) tweet text
    :return: list of lemmatized tokens; 'not', 'no' and 'nor' are never
        filtered out as stop words
    """
    stop = _stop_words_without_negations()
    return [lm.lemmatize(token) for token in tokenizr.tokenize(twts) if token not in stop]

def txt_preprocessor(twts):
    """Clean a raw tweet and return its lemmatized tokens.

    A fixed sequence of regex removals is applied to the text, the result is
    lower-cased and then passed to ``toknz_lematz``.

    :param twts: str, the raw tweet text
    :return: list of tokens as produced by ``toknz_lematz``
    """
    # The removals run strictly in this order; later patterns see the output
    # of the earlier ones.
    removal_patterns = (
        r"[@0-9](\w+)",               # '@' or a digit followed by word characters (e.g. @user)
        r"\b(https?)[\w/.:?$@!]+\b",  # links
        r"[^A-Za-z0-9 ]",             # everything except ASCII letters, digits and spaces
        r'\w*\d\w*',                  # words that contain a digit
        r"\b\d+\b",                   # stand-alone numbers
        r"\b\w\b",                    # single-character words
    )

    cleaned = twts
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)

    return toknz_lematz(cleaned.lower())

def clean_df(df):
    """Tokenize every tweet and return the non-empty ones ordered by length.

    Two columns are written onto ``df`` itself: 'cln_twts' (token list from
    ``txt_preprocessor``) and 'flag' (number of tokens). The returned frame
    holds only rows with at least one token, sorted by 'flag' ascending, with
    a fresh index; the previous index is kept as an 'index' column.

    :param df: DataFrame with a 'text' column
    :return: the filtered, sorted and re-indexed DataFrame
    """
    df['cln_twts'] = df['text'].apply(txt_preprocessor)
    df['flag'] = df['cln_twts'].apply(len)

    non_empty = df[df.flag > 0]
    by_length = non_empty.sort_values(by=['flag'])
    return by_length.reset_index()

In [26]:
def word_idx(word, vocab_dict):
    """Return the vocabulary index of ``word``.

    :param word: the token to look up
    :param vocab_dict: word-to-index mapping; must contain '<unknown>'
    :return: the index of ``word``, or the index of '<unknown>' when the word
        is not in the vocabulary
    """
    try:
        return vocab_dict[word]
    except KeyError:
        # Only a missing key means "out of vocabulary"; any other error
        # (or an interrupt) is no longer swallowed.
        return vocab_dict['<unknown>']
    
# Turn every cleaned tweet of a dataframe into a LongTensor of vocabulary indices.
def data_preprocess_embed(df, vocab_dict):
    """Index the tokens of each tweet and return one tensor per tweet.

    The index lists are also stored on ``df`` in a new 'twt_idx' column.

    :param df: DataFrame with a 'cln_twts' column of token lists
    :param vocab_dict: word-to-index mapping passed on to ``word_idx``
    :return: list of torch.LongTensor, one per row of ``df``
    """
    def to_indices(tokens):
        return [word_idx(token, vocab_dict) for token in tokens]

    df['twt_idx'] = df.cln_twts.apply(to_indices)
    return [torch.LongTensor(indices) for indices in df['twt_idx'].values]

In [27]:
def y_encode(sentiment):
    """Encode a sentiment label as its class id (negative=0, neutral=1, positive=2).

    Raises KeyError for any other label.
    """
    return dict(negative=0, neutral=1, positive=2)[sentiment]
def y_decode(value):
    """Decode a class id back to its sentiment label (0=negative, 1=neutral, 2=positive).

    Raises KeyError for any other value.
    """
    labels = ('negative', 'neutral', 'positive')
    return dict(enumerate(labels))[value]
    
def batch_sentiments_vec(sentiments):
    """Return the class ids of a batch of sentiment labels as a 1-D long tensor.

    :param sentiments: sequence of sentiment labels understood by ``y_encode``
    :return: torch tensor of dtype long with one entry per label
    """
    class_ids = [y_encode(label) for label in sentiments]
    return torch.tensor(class_ids, dtype=torch.long)
# Cut one batch (inputs, encoded labels, sequence lengths) out of the data.
def batched_dataloader_n(i,n_points,df,matrix_idx, device = 'cpu'):
    """Return the batch that starts at row ``i`` and holds up to ``n_points`` rows.

    :param i: first row of the batch
    :param n_points: number of rows in the batch
    :param df: DataFrame with 'sentiment' and 'flag' columns
        (assumes its index labels are 0..len-1 so that they line up with the
        positions in ``matrix_idx`` - TODO confirm for every caller)
    :param matrix_idx: list of index tensors, one per row of ``df``
    :param device: device the label tensor is moved to
    :return: (list of input tensors, label tensor, list of 'flag' values)
    """
    stop = i + n_points
    # .loc slices by label and includes the end label, hence stop - 1.
    rows = df.loc[i:stop - 1]

    x = matrix_idx[i:stop]
    y = batch_sentiments_vec(rows['sentiment'].values).to(device)
    leng = list(rows['flag'].values)
    return x, y, leng

In [28]:
# Prepare the training data for the LSTM: clean the tweets, sub-sample the
# neutral class, pick a vocabulary of roughly 5000 words and build the matching
# embedding matrix and index tensors.
def lstm_preprocsr(df,dataset_type):
    """Build everything the LSTM needs from the raw training dataframe.

    Steps:
      1. clean the tweets with ``clean_df``;
      2. keep all positive/negative tweets and a random 70% of the neutral ones;
      3. per sentiment, rank the tokens that have a GloVe vector (and more
         than one character) by frequency, skip the 20 most frequent ones and
         take a share of the rest; the overall budget is nudged for up to
         1000 rounds to get as close as possible to 5000 distinct words;
      4. build the embedding matrix (row 0 = '<pad>', all zeros;
         row 1 = '<unknown>', random) and the word-to-index dictionary;
      5. convert every tweet to a tensor of word indices.

    :param df: raw training DataFrame with 'text' and 'sentiment' columns
    :param dataset_type: must be 'train'
    :return: (cleaned DataFrame, list of index tensors, embedding matrix,
        word-to-index dictionary)
    :raises ValueError: if ``dataset_type`` is not 'train' (previously this
        surfaced as an UnboundLocalError at the return statement)
    """
    if dataset_type != 'train':
        raise ValueError("lstm_preprocsr only supports dataset_type == 'train'")

    train_df = clean_df(df) # cleans train_data

    # sub-sampling of the neutral data based on random index
    ax1 = train_df[train_df.sentiment == 'neutral'].index
    ax2 = random.sample(range(0, len(ax1)), int(0.7*len(ax1)))
    pos_idx = train_df[train_df.sentiment == 'positive'].index
    neg_idx = train_df[train_df.sentiment == 'negative'].index

    req_idx = list(pos_idx) + list(neg_idx) + [ax1[i] for i in ax2]
    train_df = train_df[train_df.index.isin(req_idx)].reset_index(drop=True)

    def ranked_words(sentiment):
        # Tokens of one sentiment class, most frequent first. Ranking once and
        # slicing later gives the same words as calling most_common(k) anew in
        # every search round, without rebuilding the Counter each time.
        tokens = [tok for tweet in train_df[train_df.sentiment == sentiment].cln_twts.values
                  for tok in tweet if tok in embeddings_index and len(tok) > 1]
        return [word for word, _ in Counter(tokens).most_common()]

    # (ranked words, share of the budget) per sentiment
    class_shares = [
        (ranked_words('negative'), 0.40),
        (ranked_words('neutral'), 0.30),
        (ranked_words('positive'), 0.31),
    ]

    # Search for the budget that yields exactly 5000 distinct words; if that
    # is never hit, the words of the last round are used.
    random_n = 10000
    for _ in range(1000):
        required_words = []
        for ranked, share in class_shares:
            top_k = max(int(random_n*share), 0)
            # skip the 20 most frequent words of the class
            required_words += ranked[20:top_k]

        final_count = len(set(required_words))
        if final_count < 5000:
            random_n += 0.5
        elif final_count > 5000:
            random_n -= 0.5
        else:
            break

    # based on the final vocab list create the embedding matrix
    vocab_ls = set(required_words)
    embedding_matrix = np.zeros((len(vocab_ls)+2,100))
    vocab_dict = {'<pad>': 0, '<unknown>': 1}

    # row 0 ('<pad>') stays all zeros, row 1 ('<unknown>') gets a random vector
    embedding_matrix[1] = np.random.rand(100)

    for idx, word in enumerate(vocab_ls, start=2):
        vocab_dict[word] = idx
        embedding_matrix[idx] = embeddings_index[word]

    train_matrix_df = data_preprocess_embed(train_df,vocab_dict)

    return train_df,train_matrix_df,embedding_matrix,vocab_dict


In [29]:
class LSTM_net(nn.Module):
    """Embedding -> single-layer LSTM -> linear layer -> log-softmax classifier.

    :param vocab_size: number of rows of the embedding table
    :param input_size: embedding dimension (input size of the LSTM)
    :param hidden_size: size of the LSTM hidden state
    :param output_size: number of classes
    :param device: device the padded batch is moved to in ``forward``
    :param pretrained_embeddings: optional array-like used as the embedding
        weights. When omitted, the module-level ``embedding_matrix`` is used,
        which is what this class always did before the parameter existed.
    """

    def __init__(self,vocab_size, input_size, hidden_size, output_size,device,
                 pretrained_embeddings=None):
        super(LSTM_net, self).__init__()
        self.device=device
        self.hidden_size = hidden_size

        if pretrained_embeddings is None:
            # Backward-compatible fallback to the notebook-level variable.
            pretrained_embeddings = embedding_matrix

        self.embedding = nn.Embedding(vocab_size, input_size)
        # Replace the randomly initialised table with the pretrained vectors.
        self.embedding.weight = nn.Parameter(torch.tensor(pretrained_embeddings,dtype=torch.float32))

        self.lstm_cell = nn.LSTM(input_size, hidden_size,batch_first=False)

        self.h2o = nn.Linear(hidden_size, output_size)

        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input,lengths, hidden = None):
        """Classify a batch of variable-length index sequences.

        :param input: list of 1-D LongTensors, one per sequence
        :param lengths: the length of each sequence, in the same order
        :param hidden: optional initial LSTM state
        :return: (log-probabilities with one row per sequence, final LSTM state)
        """
        # pad_sequence stacks the sequences time-major: (max_len, batch).
        # The earlier permute(1,0) followed by transpose(0,1) cancelled out,
        # so the padded batch is embedded directly.
        input_pad = pad_sequence(input).to(self.device)
        embedded = self.embedding(input_pad)
        # Pack so the LSTM stops at the true end of each sequence; the batch
        # does not have to be sorted by length.
        packed = pack_padded_sequence(embedded, lengths,enforce_sorted = False)
        out, hidden = self.lstm_cell(packed, hidden)

        # hidden[0] is the final hidden state; flatten it to (batch, hidden_size).
        output = self.h2o(hidden[0].view(-1, self.hidden_size))

        output = self.softmax(output)

        return output, hidden

In [30]:
# Train the network for one pass over the training data and return the
# training loss for visualization.
def train_batch(net,lstm_train_df,lstm_train_matrix_idx, opt, criterion, n_points, device = 'cpu'):
    """Run one pass over the training data in batches of ``n_points`` rows.

    :param net: the model to train
    :param lstm_train_df: training DataFrame (labels and lengths)
    :param lstm_train_matrix_idx: list of index tensors, one per row
    :param opt: optimizer stepping the parameters of ``net``
    :param criterion: loss function applied to (output, labels)
    :param n_points: batch size
    :param device: device used for the model and the labels
    :return: sum of the per-batch losses divided by the number of rows
        (0.0 when there are no rows)
    """
    net.train().to(device)
    loss_training = 0

    n_rows = len(lstm_train_df)
    if n_rows == 0:
        return 0.0

    # Visit every batch start. The former bound of n_rows - n_points skipped
    # the final batch: a full one when n_rows is a multiple of n_points, a
    # shorter one otherwise.
    for i in range(0, n_rows, n_points):
        batch_x, batch_y,batch_lengths = batched_dataloader_n(i, n_points ,lstm_train_df,lstm_train_matrix_idx ,device)
        if len(batch_x) == 0:
            continue
        opt.zero_grad()
        output, hidden = net(batch_x,batch_lengths)
        loss = criterion(output, batch_y)
        loss_training += loss.item()
        loss.backward()
        opt.step()
    return  loss_training/n_rows

In [31]:
# Predict the sentiment classes of a whole (cleaned) dataframe in one batch.
def lstm_predictor(net, df,df_matrix, device = 'cpu'):
    """Return the predicted and the actual class ids for every row of ``df``.

    :param net: trained model
    :param df: cleaned DataFrame (needs 'sentiment' and 'flag' columns)
    :param df_matrix: list of index tensors, one per row of ``df``
    :param device: device used for the model and the labels
    :return: (predicted class ids, actual class ids) as numpy arrays
    """
    net = net.eval().to(device)
    batch_test_x,batch_test_y,batch_lengths = batched_dataloader_n(0,len(df), df,df_matrix, device)
    # Inference only: skip building the autograd graph for the whole test set.
    with torch.no_grad():
        output,hidden = net(batch_test_x,batch_lengths)
    actuals = batch_test_y.cpu().detach().numpy()
    pred = torch.argmax(output,dim=1).cpu().detach().numpy()
    return pred,actuals

In [32]:
# Train the entire network for n_batches passes over the training data.
def train_LSTM(net,lstm_train_df,lstm_train_matrix_idx, lr = 0.01, n_batches = 100, batch_size = 10, display_freq=5, device = 'cpu'):
    """Train ``net`` with Adam and NLLLoss while keeping the embeddings frozen.

    :param net: model to train; its ``embedding`` weights are frozen here
    :param lstm_train_df: training DataFrame
    :param lstm_train_matrix_idx: list of index tensors, one per row
    :param lr: learning rate
    :param n_batches: number of passes over the training data (each pass is
        one call to ``train_batch``)
    :param batch_size: rows per batch
    :param display_freq: currently unused (the plotting code below is disabled)
    :param device: device to train on
    :return: numpy array of length n_batches + 1; entry k is the running mean
        of the losses of the first k passes (entry 0 is 0). Earlier versions
        returned nothing, so existing callers are unaffected.
    """
    net = net.to(device)
    # Keep the pretrained embeddings fixed during training.
    net.embedding.weight.requires_grad = False
    criterion = nn.NLLLoss()
    opt = optim.Adam(net.parameters(), lr=lr,betas = (0.99,0.99))

    loss_arr = np.zeros(n_batches + 1)
    for i in range(n_batches):
        pass_loss = train_batch(net,lstm_train_df,lstm_train_matrix_idx, opt, criterion, batch_size, device)
        loss_arr[i+1] = (loss_arr[i]*i + pass_loss)/(i + 1)

#         if i%display_freq == display_freq-1:
#             clear_output(wait=True)
#             print('Iteration', i, 'Loss', loss_arr[i])
#             plt.figure()
#             plt.plot(loss_arr[1:i], '-*')
#             plt.xlabel('Iteration')
#             plt.ylabel('Loss')
#             plt.show()

    return loss_arr

# Building All Classifiers
You need to create your own classifiers (at least 3 classifiers). For each classifier, you can choose between bag-of-words features and word-embedding-based features. Each classifier has to be evaluated over the 3 provided test sets. Make sure your classifiers produce consistent performance across the test sets. Marking will be based on the performance over all 5 test sets (2 of them are not provided to you).

### Evaluate classifiers

In [33]:
# given the predictions and the actuals it prints confusion matrix and macroaveraged f1-score
# def evaluate(prediction,ground_truth,features,classifier,i):
    
#     confusion_matrix = metrics.confusion_matrix(ground_truth,prediction)
#     f1_score = metrics.f1_score(ground_truth,prediction, average=None)
    
#     f1_neg_pos = (f1_score[0]+f1_score[2])/2
#     print(f"Evaluation {classifier} and {features} for Test{i+1} dataset")
#     print("-consusion_matrix: \n",confusion_matrix)
#     print(f"-f1score macroavg(pos&neg): {f1_neg_pos}\n")

In [34]:
# Note: all the classical-ML helpers used below (preprocessor, model_trainer,
# predictor) are pulled in from the ml_preprocessing file via its star import;
# they are not defined in this notebook, to keep it a bit cleaner.
#
# The loop trains every supported (classifier, features) combination and then
# scores it on each test set. Unsupported combinations are skipped with
# `continue`, which also skips the evaluation part at the bottom.
for classifier in ['SVM', 'Naive Bayes','Compliment NB','Gaussian NB','LSTM']:
    for features in ['CountVectorizer', 'TFIDF','Glove']:
        dataset_type = 'train'
        
        if classifier == 'SVM':
            X,Y,feat_vect = preprocessor(train_df,classifier,features,dataset_type)

            print(f"Training {classifier} with {features}")
            clf = model_trainer(X,Y,classifier)


        elif classifier == 'Naive Bayes':
            X,Y,feat_vect = preprocessor(train_df,classifier,features,dataset_type)

            # Glove features are skipped for this classifier (see the message below).
            if features != 'Glove':
                print(f"Training {classifier} with {features}")
                clf = model_trainer(X,Y,classifier)
            else :
                print("Multinomial NB doesn't work with negative feature vec ")
                continue



        elif classifier == 'Compliment NB':
            X,Y,feat_vect = preprocessor(train_df,classifier,features,dataset_type)
            # Glove features are skipped for this classifier (see the message below).
            if features != 'Glove':
                print(f"Training {classifier} with {features}")
                clf = model_trainer(X,Y,classifier)
            else :
                print("Compliment NB doesn't work with negative feature vec ")
                continue


        elif classifier == 'Gaussian NB':
            X,Y,feat_vect = preprocessor(train_df,classifier,features,dataset_type)
            print(f"Training {classifier} with {features}")
            clf = model_trainer(X,Y,classifier)



        elif classifier == 'LSTM':
            # The LSTM is only trained on Glove features; the other two
            # feature types are skipped silently.
            if features == 'Glove':
                print(f"Training {classifier} with {features}")
                # preprocessing
                lstm_train_df,lstm_train_matrix_idx,embedding_matrix,vocab_dict = lstm_preprocsr(train_df,dataset_type)
                hidden_layer_size = 64
                word_vector_size = 100
                distnct_sntmtn = 3
                
                # embedding_matrix is assigned at notebook level here, which is
                # where LSTM_net.__init__ reads it from.
                net = LSTM_net(embedding_matrix.shape[0],embedding_matrix.shape[1], hidden_layer_size, distnct_sntmtn,device_gpu)
                train_LSTM(net,lstm_train_df,lstm_train_matrix_idx, lr=0.0001, n_batches=30, batch_size = 64, display_freq=1, device = device_gpu)
            else: continue
        else:
            print('Unknown classifier name' + classifier)
            continue

        # Prediction performance of the classifier on every test set
        for testset in testsets:
            dataset_type = 'test'
            
            testset_name = testset
            testset_path = join('semeval-tweets', testset_name)
            test_df = pd.read_csv(testset_path, names = ['id','sentiment','text'], 
                                  sep='\t', header=None,encoding='utf-8')
           

            if classifier != "LSTM":
                prediction,ground_truth = predictor(test_df,clf,feat_vect,classifier,features,dataset_type)

            else:
                
                # clean_df drops tweets without any token left and re-orders
                # the rows; test_df is re-bound so that the ids used below
                # stay aligned with the predictions. Tweets dropped here have
                # no prediction and are counted as 'neutral' by evaluate.
                test_df = clean_df(test_df)
                test_matrix_idx = data_preprocess_embed(test_df,vocab_dict)
                prediction,ground_truth = lstm_predictor(net,test_df,test_matrix_idx,device_gpu)
            
            # Class ids -> sentiment labels
            # (presumably predictor also returns the 0/1/2 ids that y_decode
            # expects - confirm in ml_preprocessing).
            prediction_in_sentiment = [y_decode(i) for i in prediction]
            # Ids are left-padded with zeros to 18 characters
            # (presumably to match the id strings as they appear in the test
            # files, which read_test uses as keys - TODO confirm).
            tweed_id = ['{0:0>18}'.format(i) for i in test_df.id.values]
            id_preds = dict(zip(tweed_id,prediction_in_sentiment))

            #evaluate(prediction,ground_truth,features,classifier,i)
            evaluate(id_preds, testset_path, features + '-' + classifier)


Training SVM with CountVectorizer
semeval-tweets/twitter-test1.txt (CountVectorizer-SVM): 0.438
semeval-tweets/twitter-test2.txt (CountVectorizer-SVM): 0.441
semeval-tweets/twitter-test3.txt (CountVectorizer-SVM): 0.421
Training SVM with TFIDF
semeval-tweets/twitter-test1.txt (TFIDF-SVM): 0.316
semeval-tweets/twitter-test2.txt (TFIDF-SVM): 0.330
semeval-tweets/twitter-test3.txt (TFIDF-SVM): 0.290
Training SVM with Glove
semeval-tweets/twitter-test1.txt (Glove-SVM): 0.438
semeval-tweets/twitter-test2.txt (Glove-SVM): 0.466
semeval-tweets/twitter-test3.txt (Glove-SVM): 0.476
Training Naive Bayes with CountVectorizer
semeval-tweets/twitter-test1.txt (CountVectorizer-Naive Bayes): 0.500
semeval-tweets/twitter-test2.txt (CountVectorizer-Naive Bayes): 0.531
semeval-tweets/twitter-test3.txt (CountVectorizer-Naive Bayes): 0.492
Training Naive Bayes with TFIDF
semeval-tweets/twitter-test1.txt (TFIDF-Naive Bayes): 0.360
semeval-tweets/twitter-test2.txt (TFIDF-Naive Bayes): 0.405
semeval-tweets/t