In [None]:
import codecs

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class SentimentCorpus:
    '''
    Sentiment dataset held in memory: the feature-count matrix, the label
    column, the feature dictionaries and the train/dev/test partitions.
    '''

    def __init__(self, train_per=0.8, dev_per=0, test_per=0.2):
        '''
        Load the corpus through build_dicts() and partition it.

        train_per, dev_per, test_per -- split fractions, passed unchanged
        to split_train_dev_test together with the full X and y.
        '''
        # full data set plus the feature -> column and feature -> count maps
        self.X, self.y, self.feat_dict, self.feat_counts = build_dicts()
        self.nr_instances = self.y.shape[0]
        self.nr_features = self.X.shape[1]

        # the helper returns the label parts first, then the feature parts
        (self.train_y, self.dev_y, self.test_y,
         self.train_X, self.dev_X, self.test_X) = split_train_dev_test(
            self.X, self.y, train_per, dev_per, test_per)

def split_train_dev_test(X, y, train_per, dev_per, test_per):
    '''
    Split a feature matrix and its labels into train/dev/test parts.

    Rows are taken in their existing order (nothing is shuffled here):
    the first int(n * train_per) rows form the train part, the rows up to
    int(n * (train_per + dev_per)) form the dev part, and whatever is left
    forms the test part.

    X         -- array with one row per instance
    y         -- labels, indexed by the same rows as X; y.shape[0] gives n
    train_per -- fraction of rows used for training
    dev_per   -- fraction of rows used for dev; 0 means "no dev set"
    test_per  -- only used to validate the fractions; the test part is
                 always the remainder after train and dev

    Returns (train_y, dev_y, test_y, train_X, dev_X, test_X).
    When dev_per == 0, dev_y and dev_X are empty 1-D arrays.

    Raises ValueError if a fraction is negative or if the fractions sum to
    more than one (a small tolerance absorbs floating-point rounding).
    '''
    fractions = (train_per, dev_per, test_per)
    if min(fractions) < 0:
        raise ValueError(
            "train/dev/test splits must be non-negative, got %r" % (fractions,))
    total = train_per + dev_per + test_per
    # Fail loudly: returning None here would only surface later as an
    # unrelated tuple-unpacking error in the caller.
    if total > 1 + 1e-9:
        raise ValueError(
            "train/dev/test splits should sum to one, got %r" % (total,))

    dim = y.shape[0]
    split1 = int(dim * train_per)

    if dev_per == 0:
        # no dev set requested: everything after the train rows is test
        train_y, test_y = y[:split1], y[split1:]
        train_X, test_X = X[:split1], X[split1:]
        dev_y = np.array([])
        dev_X = np.array([])
    else:
        split2 = int(dim * (train_per + dev_per))
        train_y, dev_y, test_y = y[:split1], y[split1:split2], y[split2:]
        train_X, dev_X, test_X = X[:split1], X[split1:split2], X[split2:]

    return train_y, dev_y, test_y, train_X, dev_X, test_X

def _read_feature_lines(path):
    '''
    Parse one review file into a list of instances.

    Every line is one instance made of space-separated "name:count"
    tokens. The last token of a line is skipped (it is never treated as a
    feature -- presumably it is the label field; it also carries the
    newline). Returns a list with one list of (name, count) pairs per line.
    '''
    instances = []
    with codecs.open(path, 'r', 'utf8') as review_file:
        for line in review_file:
            pairs = []
            for feat in line.split(" ")[:-1]:
                # split on the LAST colon so a feature name may contain ':'
                name, counts = feat.rsplit(":", 1)
                pairs.append((name, int(counts)))
            instances.append(pairs)
    return instances


def build_dicts(pos_path="/content/positive.review",
                neg_path="/content/negative.review",
                min_count=5):
    '''
    builds feature dictionaries and the feature-count matrix

    pos_path  -- file with the positive reviews, one instance per line
    neg_path  -- file with the negative reviews, one instance per line
    min_count -- features whose total count over both files is below this
                 threshold are dropped

    Returns (X, y, feat_dict, feat_counts):
      X           -- float array (instances x kept features) of counts
      y           -- int column vector; rows from pos_path are labelled 0,
                     rows from neg_path are labelled 1
      feat_dict   -- feature name -> column index in X
      feat_counts -- feature name -> total count (kept features only)

    Rows are shuffled with a fixed seed. Note that this reseeds numpy's
    global random state (np.random.seed(0)).
    '''
    # each file is read and parsed exactly once
    pos_instances = _read_feature_lines(pos_path)
    neg_instances = _read_feature_lines(neg_path)
    instances = pos_instances + neg_instances

    # total count per feature, in order of first appearance (positive file first)
    totals = {}
    for pairs in instances:
        for name, count in pairs:
            totals[name] = totals.get(name, 0) + count

    # remove all features that occur less than min_count times
    feat_counts = {name: total for name, total in totals.items()
                   if total >= min_count}

    # map feature to index
    feat_dict = {name: index for index, name in enumerate(feat_counts)}

    nr_pos = len(pos_instances)
    nr_neg = len(neg_instances)
    nr_instances = nr_pos + nr_neg
    X = np.zeros((nr_instances, len(feat_dict)), dtype=float)
    y = np.vstack((np.zeros([nr_pos, 1], dtype=int),
                   np.ones([nr_neg, 1], dtype=int)))

    # positive instances occupy the first nr_pos rows, negative ones follow
    for row, pairs in enumerate(instances):
        for name, count in pairs:
            col = feat_dict.get(name)
            if col is not None:
                X[row, col] = count

    # shuffle the order, mix positive and negative examples
    new_order = np.arange(nr_instances)
    np.random.seed(0) # set seed
    np.random.shuffle(new_order)
    X = X[new_order, :]
    y = y[new_order, :]

    return X, y, feat_dict, feat_counts


In [None]:
# Build the corpus with the constructor defaults: train_per=0.8, dev_per=0
# (no dev set) and test_per=0.2. The object is used as a global by later cells.
a = SentimentCorpus()

In [None]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import matplotlib.pyplot as plt

In [None]:

# Static pretrained GloVe vectors, kept as a token -> vector mapping.
glove_vecs = {}

with open("/content/glove.6B.300d.txt", encoding="utf8") as glove_file:
    # Each line is cut once at the first run of whitespace: the left part is
    # the token, the right part is parsed as whitespace-separated floats.
    glove_vecs.update(
        (token, np.fromstring(values, sep=" "))
        for token, values in (row.split(maxsplit=1) for row in glove_file)
    )

# Vocabulary of the corpus features and of the GloVe table.
corpus_words = set(a.feat_dict)
glove_words = set(glove_vecs)

# Corpus features that have no GloVe entry of their own.
words_not_present = list(corpus_words.difference(glove_words))

# For each feature missing from GloVe: break it on "_" into its parts,
# strip apostrophes from every part, and print the parts GloVe still lacks.
for word in a.feat_dict:
    if word in glove_vecs:
        continue
    sub_words = word.split("_")
    for sub_word in sub_words:
        sub_word = sub_word.replace("'", "")
        if sub_word in glove_vecs:
            continue
        print(sub_word)

# One embedding per corpus feature:
#   1. the feature's own GloVe vector when it has one;
#   2. otherwise the mean of the GloVe vectors of its "_"-separated parts
#      (apostrophes removed), using only the parts GloVe knows;
#   3. otherwise a 300-dimensional zero vector.
embedding_dict = {}

for word in a.feat_dict:
    if word in glove_vecs:
        embedding_dict[word] = glove_vecs[word]
        continue

    cleaned_parts = (piece.replace("'", "") for piece in word.split("_"))
    part_vecs = [glove_vecs[part] for part in cleaned_parts if part in glove_vecs]

    if part_vecs:
        embedding_dict[word] = np.mean(np.array(part_vecs), axis=0)
    else:
        embedding_dict[word] = np.zeros(300)

#mean pooling
def convert_data_embedding(data):
    '''
    Mean-pool word embeddings into one dense vector per document.

    Each row of `data` holds per-feature counts whose columns are indexed
    by the global a.feat_dict. A document's vector is the count-weighted
    average of the embeddings (global embedding_dict) of its features.
    Rows whose total count is not positive become all-zero vectors of the
    same width as the embeddings.

    data -- 2-D array-like (documents x features); an empty array is
            accepted and yields zero rows

    Returns a float array of shape (n_documents, embedding_dim).
    Raises ValueError if no feature of a.feat_dict has an embedding, since
    the embedding width cannot be determined then.
    '''
    # (column index, vector) for every feature that has an embedding
    known = [(col, embedding_dict[word])
             for word, col in a.feat_dict.items()
             if word in embedding_dict]
    if not known:
        raise ValueError("no feature in a.feat_dict has an entry in embedding_dict")

    cols = np.array([col for col, _ in known], dtype=int)
    emb_matrix = np.vstack([vec for _, vec in known])
    # width comes from the embeddings themselves, never from a constant
    emb_dim = emb_matrix.shape[1]

    data = np.asarray(data, dtype=float)
    if data.size == 0:
        return np.zeros((0, emb_dim))

    weights = data[:, cols]          # counts of the embeddable features
    totals = weights.sum(axis=1)     # per-document denominator

    pooled = np.zeros((data.shape[0], emb_dim))
    has_words = totals > 0
    # weighted sum of embeddings as one matrix product, then the average
    pooled[has_words] = (weights[has_words] @ emb_matrix) / totals[has_words, np.newaxis]
    return pooled

# Mean-pooled embedding features for the train and the test partition.
train_x_emb, test_x_emb = (
    convert_data_embedding(part) for part in (a.train_X, a.test_X)
)

# MLP with three hidden layers of 100 ReLU units each, optimised with Adam;
# random_state is fixed so the fit is repeatable.
nn_model = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    activation='relu',
    solver='adam',
    random_state=0,
)
nn_model.fit(train_x_emb, a.train_y.ravel())
predictions = nn_model.predict(test_x_emb)

# Score the predictions against the flattened gold labels. precision, recall
# and F1 are called with scikit-learn's defaults; build_dicts labels the rows
# of negative.review as 1, so check which class is being scored as "positive".
gold_labels = a.test_y.ravel()
acc, p, r, f1 = (
    metric(gold_labels, predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)




<num>
<num>
<num>
<num>
(e.g.
<year>
<year>
<num>
<num>
&quot;the
<num>
<year>
koontzs
koontzs

(p.
(p.
<num>
<num>
<num>
<num>
<num>
<num>
<num>
<num>
(im
<num>
<num>
<num>
<num>
<num>
<num>
<num>

<dash-num>
youve
youve

<num>
<year>
<year>
"passing"
calvinos
<num>
<num>
<num>
<num>
<num>
<year>
<year>

&quot;
<num>
<num>
<num>
<num>
<fraction>
<num>
<num>
<num>
<num>
<num>
<num>
<num>

<num>
<num>
<num>
<num>
<num>
grishams
<num>
<num>
<year>
<num>
<num>
<num>
whove
<num>

<num>
mcculloughs
<num>
"one"
<year>
<num>
<year>

book.this
book.this
story.the
<num>
<num>
<year>
&quot;must
&quot;must
have&quot;
have&quot;
salems
<year>
salems
hadnt
hadnt
hiassins
<num>
hiassin
<num>
<num>
(i.e.

<num>
werent

<num>
<fraction>
youve

<num>
youve

<num>
everyones
familys
hiaasens
<year>
<num>
it.i
shouldnt
theyve
twains
heinleins
thorby
youve
<year>
<num>
(pp.
(pp.
<num>
<num>
<num>
rosss
explaination
<num>
book.i
<num>
<year>
youve
<num>
cashs

<num>
marxs
<num>
<num>
falwells
lyddie
<num>



In [None]:
# Report the four evaluation metrics, one "label value" line each.
for label, value in (
    ("accuracy score: ", acc),
    ("precision score:", p),
    ("recall score:", r),
    ("f1 score:", f1),
):
    print(label, value)

accuracy score:  0.77
precision score: 0.7526315789473684
recall score: 0.7606382978723404
f1 score: 0.7566137566137566
