In [None]:
# Sentiment Analysis
# Binary Classification
# From Scratch

In [14]:
# Load libraries

import csv
from collections import Counter

import numpy as np
import pandas as pd

In [15]:
# Read the training and test splits into dataframes.
#
# Both files are parsed the same way: tab-separated, no header row (so the
# columns get integer names), and quoting disabled so that quote characters
# inside a document are kept as ordinary text.
# NOTE(review): the paths are absolute and machine-specific; the notebook
# will not run elsewhere without editing them.

tsv_options = dict(sep="\t", header=None, quoting=csv.QUOTE_NONE)

train_data = pd.read_csv(r"D:\SLU\AI MSc\Fall 22\NLP\train-v2.tsv", **tsv_options)
test_data = pd.read_csv(r"D:\SLU\AI MSc\Fall 22\NLP\test.tsv", **tsv_options)

In [18]:
# Give the two integer-named columns descriptive names.
# Column 0 holds the class label and column 1 holds the document text.

column_names = {0: "Label", 1: "Document"}

# Both frames are renamed in place so every later cell sees the new names.
for frame in (train_data, test_data):
    frame.rename(columns=column_names, inplace=True)


In [19]:
# Building the bags of words (token lists) for the whole training set and for
# each class separately, plus the corresponding sets of unique words.


def _collect_tokens(documents):
    """Return every whitespace-separated token of every document, in order.

    The documents are iterated by value, so the result does not depend on the
    index labels of the frame they came from (a positional counter used as a
    `.loc` label only works on a default 0..n-1 index).

    Parameters
    ----------
    documents : iterable of str
        The document texts to tokenize.

    Returns
    -------
    list of str
        All tokens, duplicates included, in document order.
    """
    return [token for doc in documents for token in doc.split()]


# Dividing the dataframe into positive and negative classes

df_p = train_data[train_data['Label'] == 1]
df_n = train_data[train_data['Label'] == 0]

# Bag of words; all words
bag_of_words = _collect_tokens(train_data['Document'])

# Bag of words; in POSITIVE class
positive_bag_of_words = _collect_tokens(df_p['Document'])

# Bag of words; in NEGATIVE class
negative_bag_of_words = _collect_tokens(df_n['Document'])

# Finding unique words in each bag of words

unique_words = list(set(bag_of_words))
positive_unique_words = list(set(positive_bag_of_words))
negative_unique_words = list(set(negative_bag_of_words))


In [20]:
# Dropping placeholder tokens from the vocabularies.
#
# Only the three unique-word lists are filtered here; the bags of words
# themselves are left untouched, so the total word count printed below still
# includes the placeholder tokens.

unn_words = ['{URL}', '@USER']

unique_words, positive_unique_words, negative_unique_words = (
    [token for token in vocabulary if token not in unn_words]
    for vocabulary in (unique_words, positive_unique_words, negative_unique_words)
)

# Corpus size summary
summary_lines = (
    ('Number of all words:', len(bag_of_words)),
    ('Number of unique words:', len(unique_words)),
    ('Number of unique words in positive class:', len(positive_unique_words)),
    ('Number of unique words in negative class:', len(negative_unique_words)),
)
for caption, count in summary_lines:
    print(caption, count)

Number of all words: 1175307
Number of unique words: 131360
Number of unique words in positive class: 81138
Number of unique words in negative class: 78202


In [22]:
# Estimating the prior probability of each class as the fraction of training
# documents that carry that label (0 = negative, 1 = positive).

train_size = len(train_data)
labels = train_data['Label']

train_num_negative_label = len(train_data[labels == 0])
train_num_positive_label = len(train_data[labels == 1])

negative_prior, positive_prior = (
    class_size / train_size
    for class_size in (train_num_negative_label, train_num_positive_label)
)

for caption, prior in (('P(-):', negative_prior), ('P(+):', positive_prior)):
    print(caption, prior)

P(-): 0.4997625
P(+): 0.5002375


In [23]:
# Adding a new column named "Class" to hold the predicted label.
# Every row starts out as None; the classification cell further down writes
# the prediction (0 or 1) into this column.

test_data['Class'] = None
# Bare last expression: displays the frame as the cell output.
test_data

Unnamed: 0,Label,Document,Class
0,0,"@USER @USER nos sadwrn dwi fyd, rudimental a e...",
1,1,@USER @USER haia! Pob hwyl fory hogs. Cofiwch ...,
2,0,Loner trwy'r dydd heddiw yn ddiolchgar am cwmm...,
3,1,"@USER good news, ma'r braid yn cal aros miwn! ...",
4,1,@USER Wi'n gobeithio ti'n cael nadolig gwych g...,
...,...,...,...
9995,1,@USER Ahh scary D: Swni'n ofn mynd mwy na 10mp...,
9996,0,Ma'n neud i fi deimlo bach yn sâl mod i rwan w...,
9997,0,"So ma Bac fi di gal ei ddewis i yrru ffwrdd, gret",
9998,1,Bore da a diolch am ein dilyn @USER a @USER! B...,


In [24]:
# Classifying every test document with add-one-smoothed Naive Bayes.
#
# For each class c the score of a document is
#     log P(c) + sum over its distinct words w of
#                log((count(w, c) + 1) / (N_c + |V|))
# where N_c is the number of tokens in the class's bag of words and |V| is
# the number of unique words. Scores are accumulated in log space: the raw
# product of many tiny probabilities can underflow to 0.0 for both classes,
# which would make the comparison meaningless.
# NOTE(review): placeholder tokens such as '@USER' are removed from
# unique_words only, so they are still scored here like any other word.

# Count every training token once up front. Calling list.count() per test
# word rescans the whole bag each time, which is needlessly quadratic.
positive_word_counts = Counter(positive_bag_of_words)
negative_word_counts = Counter(negative_bag_of_words)

# The smoothing denominators do not depend on the word, so compute them once.
vocabulary_size = len(unique_words)
positive_denominator = len(positive_bag_of_words) + vocabulary_size
negative_denominator = len(negative_bag_of_words) + vocabulary_size

log_positive_prior = np.log(positive_prior)
log_negative_prior = np.log(negative_prior)

predicted_classes = []

for document in test_data['Document']:

    # Setting the initial value to the (log) prior of each class
    log_p_doc_positive = log_positive_prior
    log_p_doc_negative = log_negative_prior

    # Each distinct word of the sentence contributes once, however often it
    # is repeated in the sentence.
    for sample in set(document.split()):

        # Frequency of the word in positive and negative bags of words
        # (a Counter returns 0 for words never seen in training)
        w_p = positive_word_counts[sample]
        w_n = negative_word_counts[sample]

        # Smoothed log-likelihood of the word under each class
        log_p_doc_positive += np.log((w_p + 1) / positive_denominator)
        log_p_doc_negative += np.log((w_n + 1) / negative_denominator)

    # Every document gets a prediction; an exact tie is resolved in favour of
    # the positive class instead of leaving the prediction empty.
    predicted_classes.append(1 if log_p_doc_positive >= log_p_doc_negative else 0)

# Write all predicted classes to the "Class" column in one assignment
# (the list is in row order, so it lines up with the frame positionally).
test_data['Class'] = predicted_classes


In [25]:
# Test dataframe, displayed as the cell output so the predicted "Class"
# column can be inspected next to the true "Label".

test_data

Unnamed: 0,Label,Document,Class
0,0,"@USER @USER nos sadwrn dwi fyd, rudimental a e...",0
1,1,@USER @USER haia! Pob hwyl fory hogs. Cofiwch ...,1
2,0,Loner trwy'r dydd heddiw yn ddiolchgar am cwmm...,0
3,1,"@USER good news, ma'r braid yn cal aros miwn! ...",1
4,1,@USER Wi'n gobeithio ti'n cael nadolig gwych g...,1
...,...,...,...
9995,1,@USER Ahh scary D: Swni'n ofn mynd mwy na 10mp...,0
9996,0,Ma'n neud i fi deimlo bach yn sâl mod i rwan w...,0
9997,0,"So ma Bac fi di gal ei ddewis i yrru ffwrdd, gret",0
9998,1,Bore da a diolch am ein dilyn @USER a @USER! B...,1


In [26]:
# Accuracy = share of test rows whose predicted "Class" equals the true "Label".
# Rows without a prediction never compare equal, so they count as misses.

is_correct = test_data['Label'] == test_data['Class']
correct_rows = test_data[is_correct]

accuracy = len(correct_rows) / len(test_data)
print('accuracy is:', accuracy)

accuracy is: 0.8071
