In [8]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

## 1. Dataset Preparation

In [9]:
# Read data
#df = pd.read_csv('C:/Users/guanz/OneDrive/Desktop/CSCI544/homework/HW1/data/amazon_reviews_us_Kitchen_v1_00.tsv', sep='\t', on_bad_lines='skip')
df = pd.read_csv('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz', sep='\t', on_bad_lines='skip')

In [10]:
df = df[['star_rating', 'review_body']].rename(columns={'star_rating':'ratings', 'review_body':'reviews'})
df.head()

Unnamed: 0,ratings,reviews
0,5.0,Beautiful. Looks great on counter.
1,5.0,I personally have 5 days sets and have also bo...
2,5.0,Fabulous and worth every penny. Used for clean...
3,5.0,A must if you love garlic on tomato marinara s...
4,5.0,Worth every penny! Buy one now and be a pizza ...


In [93]:
df.groupby('ratings').describe()

Unnamed: 0_level_0,reviews,reviews,reviews,reviews
Unnamed: 0_level_1,count,unique,top,freq
ratings,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1.0,426870,419692,Too small,110
2.0,241939,239368,Too small,105
3.0,349539,342257,ok,550
4.0,731701,703142,good,1163
5.0,3124595,2876141,Great,5604


## 2. Word Embedding

### Load the pretrained Word2Vec model

In [5]:
# load the pretrained model as model1
pretrained = 'GoogleNews-vectors-negative300.bin.gz'
model1 = KeyedVectors.load_word2vec_format(pretrained, binary=True)

In [6]:
# check the similarity between two similar words
print(model1.similarity('excellent', 'outstanding'))

# find out the corresponding word given that A - B = C - D (King - Man = Queen - Woman)
print(model1.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))

0.5567486
[('queen', 0.7118193507194519)]


### Train a Word2Vec model using my own dataset

In [7]:
# X = df['reviews'].fillna('').tolist()

In [8]:
# convert reviews to lower case
# X = [str(x).lower() for x in X]
# remove HTML and URLs from reviews
# X = [re.sub('<.*>', '', x) for x in X]
# X = [re.sub(r'https?://\S+', '', x) for x in X]
# remove non-alphabetical characters
# X = [re.sub('[^a-z ]', '', x) for x in X]
# remove extra spaces
# X = [re.sub(' +', ' ', x) for x in X]

In [9]:
# expand contractions
'''contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}
def decontraction(s):
    for word in s.split(' '):
        if word in contractions.keys():
            s = re.sub(word, contractions[word], s)
    return s
X = [decontraction(x) for x in X]'''

'contractions = { \n"ain\'t": "am not",\n"aren\'t": "are not",\n"can\'t": "cannot",\n"can\'t\'ve": "cannot have",\n"\'cause": "because",\n"could\'ve": "could have",\n"couldn\'t": "could not",\n"couldn\'t\'ve": "could not have",\n"didn\'t": "did not",\n"doesn\'t": "does not",\n"don\'t": "do not",\n"hadn\'t": "had not",\n"hadn\'t\'ve": "had not have",\n"hasn\'t": "has not",\n"haven\'t": "have not",\n"he\'d": "he would",\n"he\'d\'ve": "he would have",\n"he\'ll": "he will",\n"he\'ll\'ve": "he will have",\n"he\'s": "he is",\n"how\'d": "how did",\n"how\'d\'y": "how do you",\n"how\'ll": "how will",\n"how\'s": "how is",\n"I\'d": "I would",\n"I\'d\'ve": "I would have",\n"I\'ll": "I will",\n"I\'ll\'ve": "I will have",\n"I\'m": "I am",\n"I\'ve": "I have",\n"isn\'t": "is not",\n"it\'d": "it would",\n"it\'d\'ve": "it would have",\n"it\'ll": "it will",\n"it\'ll\'ve": "it will have",\n"it\'s": "it is",\n"let\'s": "let us",\n"ma\'am": "madam",\n"mayn\'t": "may not",\n"might\'ve": "might have",\n"might

In [10]:
# perform lemmatization
# wnl = WordNetLemmatizer()
# X = [' '.join([wnl.lemmatize(word) for word in x.split(' ')]) for x in X]

### Here we didn't remove stop words from the dataset and this is because we are modeling the similarity between context and center words during modeling. If we removed stop words, we would not be able to model the true context.

In [11]:
# train a word2vec model using my own dataset
# convert X_train to a list of lists of words
# sentences = [x.split(' ') for x in X]

# use X_train to train a word2vec model2
# model2 = Word2Vec(vector_size=300, window=11, min_count=10)
# model2.build_vocab(sentences)
# model2.train(sentences, total_examples=model2.corpus_count, epochs=model2.epochs)

In [12]:
# save the trained model
# model2.save('my-own-word2vec.model')
# store just the words + their trained embeddings
# word_vectors = model2.wv
# word_vectors.save('my-own-word2vec.wordvectors')

In [11]:
model2 = KeyedVectors.load('my-own-word2vec.wordvectors', mmap='r')

In [12]:
# check the similarity between two similar words using model 2
print(model2.similarity('excellent', 'outstanding'))

# find out the corresponding word given that A - B = C - D (King - Man = Queen - Woman) using model 2
print(model2.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))

0.8761742
[('queen', 0.5082318186759949)]


## 3. Build training and testing datasets

In [13]:
# Build a balanced dataset of 250,000 reviews (50K instances per each rating score).
df_1 = df[df['ratings']==1].sample(n=50000, random_state=1)
df_2 = df[df['ratings']==2].sample(n=50000, random_state=1)
df_3 = df[df['ratings']==3].sample(n=50000, random_state=1)
df_4 = df[df['ratings']==4].sample(n=50000, random_state=1)
df_5 = df[df['ratings']==5].sample(n=50000, random_state=1)
df = pd.concat([df_1, df_2, df_3, df_4, df_5]).sample(frac=1).reset_index(drop=True)

# Create ternary labels: ratings > 3 --> 1, ratings < 3 --> 2, ratings = 3 --> 3
# Map ratings > 3 to 1, ratings < 3 to 2, and ratings = 3 to 3.
df['label'] = df['ratings'].map(lambda x: 1 if x > 3 else (2 if x < 3 else 3))
df.drop('ratings', axis=1, inplace=True)

df.groupby('label').describe()

Unnamed: 0_level_0,reviews,reviews,reviews,reviews
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,99995,96477,good,149
2,99995,99276,too small,28
3,49997,49404,ok,74


In [14]:
X, y = df['reviews'].fillna('').tolist(), df['label'].tolist()

In [15]:
# convert reviews to lower case
X = [str(x).lower() for x in X]
# remove HTML and URLs from reviews
X = [re.sub('<.*>', '', x) for x in X]
X = [re.sub(r'https?://\S+', '', x) for x in X]
# remove non-alphabetical characters
X = [re.sub('[^a-z ]', '', x) for x in X]
# remove extra spaces
X = [re.sub(' +', ' ', x) for x in X]

In [16]:
# expand contractions
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}
def decontraction(s):
    for word in s.split(' '):
        if word in contractions.keys():
            s = re.sub(word, contractions[word], s)
    return s
X = [decontraction(x) for x in X]

In [17]:
# perform lemmatization
wnl = WordNetLemmatizer()
X = [' '.join([wnl.lemmatize(word) for word in x.split(' ')]) for x in X]

In [18]:
# remove stop words

stopWords =set(stopwords.words('english'))
def rmstopWords(s):
    wordlist = s.split(' ')
    newlist = []
    for word in wordlist:
        if word not in stopWords:
            newlist.append(word)
    s = ' '.join(newlist)
    return s

X = [rmstopWords(x) for x in X]

In [19]:
# Split the downsized dataset into 80% training dataset and 20% testing dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [20]:
# turn the original ternary training and testing datasets to binary
# return a list of indices of y = 1 or y =2
y_train_bi = []
idx_train = []
for i, y in enumerate(y_train):
    if y == 1 or y == 2:
        y_train_bi.append(y)
        idx_train.append(i)
        
y_test_bi = []
idx_test = []
for i, y in enumerate(y_test):
    if y == 1 or y == 2:
        y_test_bi.append(y)
        idx_test.append(i)
# use the list of indices to select a sub-list of X_train
X_train_bi = [X_train[i] for i in idx_train]
X_test_bi = [X_test[i] for i in idx_test]

In [15]:
len(X_train)

200000

## 4. Simple Models

### Use the pretrained word2vec model to train models (perceptron and SVM)

In [23]:
# take the average word vectors of important words (i.e. non-stop words) in
# a review as the feature of a training sample
X_train_bi1 = []
for x in X_train_bi:
    wordveclist = []
    for word in x.split(' '):
        try:
            wordvec = model1[word]
            wordveclist.append(wordvec)
        except:
            pass
    X_train_bi1.append(np.mean(wordveclist, axis=0)) 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [24]:
X_test_bi1 = []
for x in X_test_bi:
    wordveclist = []
    for word in x.split(' '):
        try:
            wordvec = model1[word]
            wordveclist.append(wordvec)
        except:
            pass
    X_test_bi1.append(np.mean(wordveclist, axis=0)) 

In [25]:
# handle non-word-vector values in the dataset
# return the indices of word-vector values
wv_train = []
for i, x in enumerate(X_train_bi1):
    try:
        len(x)
        wv_train.append(i)
    except:
        print(x)
        

wv_test = []
for i, x in enumerate(X_test_bi1):
    try:
        len(x)
        wv_test.append(i)
    except:
        print(x)
        
# remove the non-word-vector values from the dataset
X_train_bi1 = [X_train_bi1[i] for i in wv_train]
X_test_bi1 = [X_test_bi1[i] for i in wv_test]
y_train_bi1 = [y_train_bi[i] for i in wv_train]
y_test_bi1 = [y_test_bi[i] for i in wv_test]

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [26]:
# use pretrained word2vec features to train a perceptron
perceptron = Perceptron(random_state=1)
perceptron.fit(X_train_bi1, y_train_bi1)
y_train_predict1, y_test_predict1 = perceptron.predict(X_train_bi1), perceptron.predict(X_test_bi1)

# report accuracy, precision, recall, and f1-score on both the training and testing split
train_stats = precision_recall_fscore_support(y_train_bi1, y_train_predict1, average='binary')
precision_train, recall_train, fscore_train = train_stats[0], train_stats[1], train_stats[2]

test_stats = precision_recall_fscore_support(y_test_bi1, y_test_predict1, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of training dataset: {:2.1%}'.format(perceptron.score(X_train_bi1, y_train_bi1)))
print('The precision of training dataset: {:2.1%}'.format(precision_train))
print('The recall of training dataset: {:2.1%}'.format(recall_train))
print('The fscore of training dataset: {:2.1%}\n'.format(fscore_train))

print('The accuracy of testing dataset: {:2.1%}'.format(perceptron.score(X_test_bi1, y_test_bi1)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of training dataset: 77.7%
The precision of training dataset: 85.0%
The recall of training dataset: 67.3%
The fscore of training dataset: 75.1%

The accuracy of testing dataset: 77.4%
The precision of testing dataset: 84.6%
The recall of testing dataset: 66.7%
The fscore of testing dataset: 74.6%


In [27]:
# use pretrained word2vec features to train a SVM
# refer to https://stackoverflow.com/questions/52008548/python-running-into-x-test-y-test-fit-errors
# for why with_mean should be set to False
svm = LinearSVC(random_state=1)
svm.fit(X_train_bi1, y_train_bi1)

y_train_predict1, y_test_predict1 = svm.predict(X_train_bi1), svm.predict(X_test_bi1)

# report accuracy, precision, recall, and f1-score on both the training and testing split
train_stats = precision_recall_fscore_support(y_train_bi1, y_train_predict1, average='binary')
precision_train, recall_train, fscore_train = train_stats[0], train_stats[1], train_stats[2]

test_stats = precision_recall_fscore_support(y_test_bi1, y_test_predict1, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of training dataset: {:2.1%}'.format(svm.score(X_train_bi1, y_train_bi1)))
print('The precision of training dataset: {:2.1%}'.format(precision_train))
print('The recall of training dataset: {:2.1%}'.format(recall_train))
print('The fscore of training dataset: {:2.1%}\n'.format(fscore_train))

print('The accuracy of testing dataset: {:2.1%}'.format(svm.score(X_test_bi1, y_test_bi1)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of training dataset: 81.9%
The precision of training dataset: 83.3%
The recall of training dataset: 79.7%
The fscore of training dataset: 81.5%

The accuracy of testing dataset: 81.5%
The precision of testing dataset: 82.8%
The recall of testing dataset: 79.4%
The fscore of testing dataset: 81.1%


### Use my own word2vec model to train models (perceptron and SVM)

In [45]:
# take the average word vectors of important words (i.e. non-stop words) in
# a review as the feature of a training sample
X_train_bi2 = []
for x in X_train_bi:
    wordveclist = []
    for word in x.split(' '):
        try:
            wordvec = model2[word]
            wordveclist.append(wordvec)
        except:
            pass
    X_train_bi2.append(np.mean(wordveclist, axis=0))

# do the same to the testing dataset
X_test_bi2 = []
for x in X_test_bi:
    wordveclist = []
    for word in x.split(' '):
        try:
            wordvec = model2[word]
            wordveclist.append(wordvec)
        except:
            pass
    X_test_bi2.append(np.mean(wordveclist, axis=0)) 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [29]:
# handle non-word-vector values in the dataset
# return the indices of word-vector values
wv_train = []
for i, x in enumerate(X_train_bi2):
    try:
        len(x)
        wv_train.append(i)
    except:
        print(x)
        

wv_test = []
for i, x in enumerate(X_test_bi2):
    try:
        len(x)
        wv_test.append(i)
    except:
        print(x)
        
# remove the non-word-vector values from the dataset
X_train_bi2 = [X_train_bi2[i] for i in wv_train]
X_test_bi2 = [X_test_bi2[i] for i in wv_test]
y_train_bi2 = [y_train_bi[i] for i in wv_train]
y_test_bi2 = [y_test_bi[i] for i in wv_test]

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [30]:
# use my own word2vec features to train a perceptron
perceptron = Perceptron(random_state=1)
perceptron.fit(X_train_bi2, y_train_bi2)
y_train_predict2, y_test_predict2 = perceptron.predict(X_train_bi2), perceptron.predict(X_test_bi2)

# report accuracy, precision, recall, and f1-score on both the training and testing split
train_stats = precision_recall_fscore_support(y_train_bi2, y_train_predict2, average='binary')
precision_train, recall_train, fscore_train = train_stats[0], train_stats[1], train_stats[2]

test_stats = precision_recall_fscore_support(y_test_bi2, y_test_predict2, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of training dataset: {:2.1%}'.format(perceptron.score(X_train_bi2, y_train_bi2)))
print('The precision of training dataset: {:2.1%}'.format(precision_train))
print('The recall of training dataset: {:2.1%}'.format(recall_train))
print('The fscore of training dataset: {:2.1%}\n'.format(fscore_train))

print('The accuracy of testing dataset: {:2.1%}'.format(perceptron.score(X_test_bi2, y_test_bi2)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of training dataset: 82.0%
The precision of training dataset: 83.2%
The recall of training dataset: 80.1%
The fscore of training dataset: 81.6%

The accuracy of testing dataset: 81.8%
The precision of testing dataset: 82.9%
The recall of testing dataset: 80.1%
The fscore of testing dataset: 81.5%


In [31]:
# use my own word2vec features to train a SVM
# refer to https://stackoverflow.com/questions/52008548/python-running-into-x-test-y-test-fit-errors
# for why with_mean should be set to False
svm = LinearSVC(random_state=1, max_iter=5000)
svm.fit(X_train_bi2, y_train_bi2)

y_train_predict2, y_test_predict2 = svm.predict(X_train_bi2), svm.predict(X_test_bi2)

# report accuracy, precision, recall, and f1-score on both the training and testing split
train_stats = precision_recall_fscore_support(y_train_bi2, y_train_predict2, average='binary')
precision_train, recall_train, fscore_train = train_stats[0], train_stats[1], train_stats[2]

test_stats = precision_recall_fscore_support(y_test_bi2, y_test_predict2, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of training dataset: {:2.1%}'.format(svm.score(X_train_bi2, y_train_bi2)))
print('The precision of training dataset: {:2.1%}'.format(precision_train))
print('The recall of training dataset: {:2.1%}'.format(recall_train))
print('The fscore of training dataset: {:2.1%}\n'.format(fscore_train))

print('The accuracy of testing dataset: {:2.1%}'.format(svm.score(X_test_bi2, y_test_bi2)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))



The accuracy of training dataset: 86.2%
The precision of training dataset: 86.7%
The recall of training dataset: 85.5%
The fscore of training dataset: 86.1%

The accuracy of testing dataset: 86.2%
The precision of testing dataset: 86.6%
The recall of testing dataset: 85.6%
The fscore of testing dataset: 86.1%


## 5. Feedforward Neural Networks

https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel
https://www.kaggle.com/code/mishra1993/pytorch-multi-layer-perceptron-mnist/notebook

In [15]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import functools
from sklearn.metrics import accuracy_score 

In [16]:
# Enable CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

In [18]:
# Hyperparameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 0}
max_epochs = 50

In [116]:
# Override the Dataset class
class Dataset(Dataset):
    
    def __init__(self, list_IDs, labels):
        'Initialization'
        self.list_IDs = list_IDs
        self.labels = labels
    
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)
    
    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        ID = self.list_IDs[index]
        
        # Load data and get label
        X = torch.load('data/' + ID + '.pt')
        y = self.labels[index]
        return X, y

## Use the average of the Word2Vec vectors for each review as the input feature

### Binary Classification

In [71]:
# Since my own word2vec model performed better than the pretrained one in terms
# of training simple models, I am gonna only use features generated from my own
# word2vec model to train the FNN/MLP model.

# Split the original training dataset into one training and one validation set
# The original labels are 1 and 2, and we need to convert them to 0 and 1,
# because in pytorch for binary classification, the labels should be 0 and 1.
train_IDs = {}
y_train = []
len_train_IDs = int(0.8 * len(X_train_bi2))
for i in range(len_train_IDs):
    train_IDs[i] = 'train_bi_' + str(i)
    y_train.append(y_train_bi2[i] - 1) # Convert label from 1 and 2 to 0 and 1

valid_IDs = {}
y_valid = []
len_valid_IDs = len(X_train_bi2) - len_train_IDs
for i in range(len_valid_IDs):
    valid_IDs[i] = 'valid_bi_' + str(i)
    y_valid.append(y_train_bi2[len(y_train) + i] - 1)
    
test_IDs = {}
len_test_IDs = len(X_test_bi2)
for i in range(len_test_IDs):
    test_IDs[i] = 'test_bi_' + str(i)

for i in range(len(train_IDs)):
    torch.save(X_train_bi2[i], 'data/' + train_IDs[i] + '.pt')

for i in range(len(valid_IDs)):
    torch.save(X_train_bi2[len(train_IDs) + i], 'data/' + valid_IDs[i] + '.pt')

for i in range(len(test_IDs)):
    torch.save(X_test_bi2[i], 'data/' + test_IDs[i] + '.pt')

"for i in range(len(train_IDs)):\n    torch.save(X_train_bi2[i], 'data/' + train_IDs[i] + '.pt')\n\nfor i in range(len(valid_IDs)):\n    torch.save(X_train_bi2[len(train_IDs) + i], 'data/' + valid_IDs[i] + '.pt')\n\nfor i in range(len(test_IDs)):\n    torch.save(X_test_bi2[i], 'data/' + test_IDs[i] + '.pt')"

In [120]:
# Generate training, validation and testing datasets
train_set = Dataset(train_IDs, y_train)
valid_set = Dataset(valid_IDs, y_valid)
test_set = Dataset(test_IDs, y_test_bi2)

# Generate dataloaders for the training, validation and testing datasets
train_loader = torch.utils.data.DataLoader(train_set, **params)
valid_loader = torch.utils.data.DataLoader(valid_set, **params)
test_loader = torch.utils.data.DataLoader(test_set, **params)

In [19]:
# Define the network architecture
class Net(nn.Module):
    def __init__(self, input_dim=300, output_dim=2, hidden_1=50, hidden_2=10):
        super(Net, self).__init__()
        # dimension of inputs
        self.input_dim = input_dim 
        # number of classes (2 for binary, 3 for ternary, ...)
        self.output_dim = output_dim
        # number of nodes in each hidden layer
        self.hidden_1 = hidden_1
        self.hidden_2 = hidden_2
        # linear layer (input --> hidden_1)
        self.fc1 = nn.Linear(self.input_dim, self.hidden_1)
        # linear layer (hidden_1 --> hidden_2)
        self.fc2 = nn.Linear(self.hidden_1, self.hidden_2)
        # linear layer (hidden_2 --> 2)
        self.fc3 = nn.Linear(self.hidden_2, self.output_dim)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        # add the 1st hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer after the 1st hidden layer
        x = self.dropout(x)
        # add the 2nd hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add dropout layer after the 2nd hidden layer
        x = self.dropout(x)
        # add output layer
        x = self.fc3(x)
        return x

In [76]:
# initialize the NN
model = Net()
print(model)

Net(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [77]:
# specify cross entropy loss as loss function
criterion = nn.CrossEntropyLoss()

# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [78]:
# train the network

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(max_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model.train() # prep model for training
    for data, target in train_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model.eval() # prep model for evaluation
    for data, target in valid_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.452994 	Validation Loss: 0.355458
Validation loss decreased (inf --> 0.355458).  Saving model ...
Epoch: 2 	Training Loss: 0.369756 	Validation Loss: 0.340425
Validation loss decreased (0.355458 --> 0.340425).  Saving model ...
Epoch: 3 	Training Loss: 0.355497 	Validation Loss: 0.334846
Validation loss decreased (0.340425 --> 0.334846).  Saving model ...
Epoch: 4 	Training Loss: 0.347683 	Validation Loss: 0.331255
Validation loss decreased (0.334846 --> 0.331255).  Saving model ...
Epoch: 5 	Training Loss: 0.340256 	Validation Loss: 0.327505
Validation loss decreased (0.331255 --> 0.327505).  Saving model ...
Epoch: 6 	Training Loss: 0.336159 	Validation Loss: 0.325886
Validation loss decreased (0.327505 --> 0.325886).  Saving model ...
Epoch: 7 	Training Loss: 0.331625 	Validation Loss: 0.322672
Validation loss decreased (0.325886 --> 0.322672).  Saving model ...
Epoch: 8 	Training Loss: 0.328647 	Validation Loss: 0.320394
Validation loss decreased (0.32267

In [124]:
## Test the trained network
def predict(model, dataloader):
    prediction_list = []
    truth_list = []
    for i, batch in enumerate(dataloader):
        # batch[0] is X, and batch[1] is y
        outputs = model(batch[0])
        _, predicted = torch.max(outputs.data, 1)
        prediction_list.append(predicted)
        truth_list.append(batch[1])
    return prediction_list, truth_list

In [133]:
# Load model parameters from the trained model with the lowest validation loss
#model.load_state_dict(torch.load('model.pt'))

# Use the trained model to predict the testing samples
# The output is a list of tensors and a tensor consists of the predicted labels
# of a single batch of testing samples
predictions, truths = predict(model, test_loader)
# Convert predictions and truths from list of tensors to list of lists
predictions = [list(torch.Tensor.numpy(t)) for t in predictions]
truths = [list(torch.Tensor.numpy(t)) for t in truths]
# Convert predictions and truths from list of lists to a single list
predictions = functools.reduce(lambda a, b: a + b, predictions)
truths = functools.reduce(lambda a, b: a + b, truths)
# Convert predictions from (0, 1) to (1, 2)
predictions = [p + 1 for p in predictions]

In [134]:
# report accuracy, precision, recall, and f1-score on the testing dataset

test_stats = precision_recall_fscore_support(truths, predictions, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of testing dataset: {:2.1%}'.format(accuracy_score(truths, predictions)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of testing dataset: 86.6%
The precision of testing dataset: 87.0%
The recall of testing dataset: 85.9%
The fscore of testing dataset: 86.5%


### Ternary Classification

In [27]:
# Since my own word2vec model performed better than the pretrained one in terms
# of training simple models, I am gonna only use features generated from my own
# word2vec model to train the FNN/MLP model.

# take the average word vectors of important words (i.e. non-stop words) in
# a review as the feature of a training sample
X_train_te2 = []
for x in X_train:
    wordveclist = []
    for word in x.split(' '):
        try:
            wordvec = model2[word]
            wordveclist.append(wordvec)
        except:
            pass
    X_train_te2.append(np.mean(wordveclist, axis=0))

# do the same to the testing dataset
X_test_te2 = []
for x in X_test:
    wordveclist = []
    for word in x.split(' '):
        try:
            wordvec = model2[word]
            wordveclist.append(wordvec)
        except:
            pass
    X_test_te2.append(np.mean(wordveclist, axis=0)) 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [28]:
# handle non-word-vector values in the dataset
# return the indices of word-vector values
wv_train = []
for i, x in enumerate(X_train_te2):
    try:
        len(x)
        wv_train.append(i)
    except:
        print(x)
        

wv_test = []
for i, x in enumerate(X_test_te2):
    try:
        len(x)
        wv_test.append(i)
    except:
        print(x)
        
# remove the non-word-vector values from the dataset
X_train_te2 = [X_train_te2[i] for i in wv_train]
X_test_te2 = [X_test_te2[i] for i in wv_test]
y_train_te2 = [y_train[i] for i in wv_train]
y_test_te2 = [y_test[i] for i in wv_test]

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [29]:
# Split the original training dataset into one training and one validation set
# The original labels are 1, 2 and 3, and we need to convert them to 0, 1 and 2,
# because in pytorch for multinomial classification with n classes, the labels
# should be 0, 1, ..., n-1.
train_IDs = {}
y_train = []
len_train_IDs = int(0.8 * len(X_train_te2))
for i in range(len_train_IDs):
    train_IDs[i] = 'train_te_' + str(i)
    y_train.append(y_train_te2[i] - 1) # Convert label from 1, 2 and 3 to 0, 1 and 2

valid_IDs = {}
y_valid = []
len_valid_IDs = len(X_train_te2) - len_train_IDs
for i in range(len_valid_IDs):
    valid_IDs[i] = 'valid_te_' + str(i)
    y_valid.append(y_train_te2[len(y_train) + i] - 1)
    
test_IDs = {}
len_test_IDs = len(X_test_te2)
for i in range(len_test_IDs):
    test_IDs[i] = 'test_te_' + str(i)

for i in range(len(train_IDs)):
    torch.save(X_train_te2[i], 'data/' + train_IDs[i] + '.pt')

for i in range(len(valid_IDs)):
    torch.save(X_train_te2[len(train_IDs) + i], 'data/' + valid_IDs[i] + '.pt')

for i in range(len(test_IDs)):
    torch.save(X_test_te2[i], 'data/' + test_IDs[i] + '.pt')

In [30]:
# Generate training, validation and testing datasets
train_set = Dataset(train_IDs, y_train)
valid_set = Dataset(valid_IDs, y_valid)
test_set = Dataset(test_IDs, y_test_te2)

# Generate dataloaders for the training, validation and testing datasets
train_loader = torch.utils.data.DataLoader(train_set, **params)
valid_loader = torch.utils.data.DataLoader(valid_set, **params)
test_loader = torch.utils.data.DataLoader(test_set, **params)

In [60]:
# Define the network architecture
# The only difference between the ternary and binary classifications is the
# dimension of the output layer (2 for binary and 3 for ternary)
'''class Net(nn.Module):
    def __init__(self, input_dim=300, output_dim=2, hidden_1=50, hidden_2=10):
        super(Net, self).__init__()
        # dimension of inputs
        self.input_dim = input_dim 
        # number of classes (2 for binary, 3 for ternary, ...)
        self.output_dim = output_dim
        # number of nodes in each hidden layer
        self.hidden_1 = hidden_1
        self.hidden_2 = hidden_2
        # linear layer (input --> hidden_1)
        self.fc1 = nn.Linear(self.input_dim, self.hidden_1)
        # linear layer (hidden_1 --> hidden_2)
        self.fc2 = nn.Linear(self.hidden_1, self.hidden_2)
        # linear layer (hidden_2 --> 2)
        self.fc3 = nn.Linear(self.hidden_2, self.output_dim)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        # add the 1st hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer after the 1st hidden layer
        x = self.dropout(x)
        # add the 2nd hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add dropout layer after the 2nd hidden layer
        x = self.dropout(x)
        # add output layer
        x = self.fc3(x)
        return x'''

In [32]:
# initialize the NN
# use model_te to be distinguishable from the binary model
model_te = Net(output_dim=3)
print(model_te)

Net(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [161]:
# specify cross entropy loss as loss function
criterion = nn.CrossEntropyLoss()

# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model_te.parameters(), lr=0.01)

In [162]:
# train the network

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(max_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model_te.train() # prep model for training
    for data, target in train_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_te(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model_te.eval() # prep model for evaluation
    for data, target in valid_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_te(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model_te.state_dict(), 'model_te.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.829787 	Validation Loss: 0.740806
Validation loss decreased (inf --> 0.740806).  Saving model ...
Epoch: 2 	Training Loss: 0.752027 	Validation Loss: 0.722248
Validation loss decreased (0.740806 --> 0.722248).  Saving model ...
Epoch: 3 	Training Loss: 0.738544 	Validation Loss: 0.714632
Validation loss decreased (0.722248 --> 0.714632).  Saving model ...
Epoch: 4 	Training Loss: 0.729446 	Validation Loss: 0.707867
Validation loss decreased (0.714632 --> 0.707867).  Saving model ...
Epoch: 5 	Training Loss: 0.723327 	Validation Loss: 0.703287
Validation loss decreased (0.707867 --> 0.703287).  Saving model ...
Epoch: 6 	Training Loss: 0.718479 	Validation Loss: 0.699139
Validation loss decreased (0.703287 --> 0.699139).  Saving model ...
Epoch: 7 	Training Loss: 0.713546 	Validation Loss: 0.695830
Validation loss decreased (0.699139 --> 0.695830).  Saving model ...
Epoch: 8 	Training Loss: 0.710006 	Validation Loss: 0.694426
Validation loss decreased (0.69583

In [34]:
# Test the trained network
# Use the same predict function as the binary case
def predict(model, dataloader):
    prediction_list = []
    truth_list = []
    for i, batch in enumerate(dataloader):
        # batch[0] is X, and batch[1] is y
        outputs = model(batch[0])
        _, predicted = torch.max(outputs.data, 1)
        prediction_list.append(predicted)
        truth_list.append(batch[1])
    return prediction_list, truth_list

In [35]:
# Load model parameters from the trained model with the lowest validation loss
model_te.load_state_dict(torch.load('model_te.pt'))

# Use the trained model to predict the testing samples
# The output is a list of tensors and a tensor consists of the predicted labels
# of a single batch of testing samples
predictions, truths = predict(model_te, test_loader)
# Convert predictions and truths from list of tensors to list of lists
predictions = [list(torch.Tensor.numpy(t)) for t in predictions]
truths = [list(torch.Tensor.numpy(t)) for t in truths]
# Convert predictions and truths from list of lists to a single list
predictions = functools.reduce(lambda a, b: a + b, predictions)
truths = functools.reduce(lambda a, b: a + b, truths)
# Convert predictions from (0, 1, 2) to (1, 2, 3)
predictions = [p + 1 for p in predictions]

In [40]:
# report accuracy, precision, recall, and f1-score on the testing dataset

test_stats = precision_recall_fscore_support(truths, predictions, average='micro')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of testing dataset: {:2.1%}'.format(accuracy_score(truths, predictions)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of testing dataset: 71.1%
The precision of testing dataset: 71.1%
The recall of testing dataset: 71.1%
The fscore of testing dataset: 71.1%


## Use the concatenation of the first 10 Word2Vec vectors for each review as the input feature

### Binary Classification

In [21]:
# The function below cannot be applied to the training and testing sets due to
# the memory limitation, so I am gonna use a for loop to append words for every
# review in the datasets.

# define an appendWords function to append the word vectors of the first 10 words
'''def appendWords(s):
    ls = s.split(' ')
    wordveclist = []
    for i in range(10):
        try:
            wordveclist.append(list(model2[ls[i]]))
        except:
            pass
    return wordveclist

# append the word vectors of first 10 words together for all the reviews in both the training and testing sets
X_train_bi2 = [appendWords(x) for x in X_train_bi]
X_test_bi2 = [appendWords(x) for x in X_test_bi]'''

# Since my own word2vec model performed better than the pretrained one in terms
# of training simple models, I am gonna only use features generated from my own
# word2vec model to train the FNN/MLP model.
# concatenate the first 10 word vectors of important words (i.e. non-stop words)
# in a review as the feature of a training sample
X_train_bi2 = []
for x in X_train_bi:
    wordveclist = []
    first_10_words = x.split(' ')[:10]
    for word in first_10_words:
        try:
            wordvec = list(model2[word])
            wordveclist.append(wordvec)
        except:
            pass
            
    X_train_bi2.append(wordveclist)

# do the same to the testing dataset
X_test_bi2 = []
for x in X_test_bi:
    wordveclist = []
    first_10_words = x.split(' ')[:10]
    for word in first_10_words:
        try:
            wordvec = list(model2[word])
            wordveclist.append(wordvec)
        except:
            pass
            
    X_test_bi2.append(wordveclist)

In [22]:
# define a function that adds padding to the reviews with length less than 10 words
def add_padding(x):
    padding = [0 for _ in range(300)]
    if len(x) < 10:
        for i in range(10 - len(x)):
            x.append(padding)
    return x

# add paddings
X_train_bi2 = [add_padding(x) for x in X_train_bi2]
X_test_bi2 = [add_padding(x) for x in X_test_bi2]

In [23]:
# reshape the word vector of a review from (10, 300) to (3000,)
X_train_bi2 = [functools.reduce(lambda a, b: a + b, x) for x in X_train_bi2]
X_test_bi2 = [functools.reduce(lambda a, b: a + b, x) for x in X_test_bi2]

In [109]:
# SKIP this step because all non-word-vectors are padded with 0's.
# handle non-word-vector values in the dataset
# return the indices of word-vector values
'''wv_train = []
for i, x in enumerate(X_train_bi2):
    try:
        len(x)
        wv_train.append(i)
    except:
        print(x)
        

wv_test = []
for i, x in enumerate(X_test_bi2):
    try:
        len(x)
        wv_test.append(i)
    except:
        print(x)
        
# remove the non-word-vector values from the dataset
X_train_bi2 = [X_train_bi2[i] for i in wv_train]
X_test_bi2 = [X_test_bi2[i] for i in wv_test]
y_train_bi2 = [y_train_bi[i] for i in wv_train]
y_test_bi2 = [y_test_bi[i] for i in wv_test]'''

In [37]:
# Split the original training dataset into one training and one validation set
# The original labels are 1 and 2, and we need to convert them to 0 and 1,
# because in pytorch for binary classification, the labels should be 0 and 1.
len_train = int(0.8 * len(X_train_bi2))
X_train, y_train = X_train_bi2[:len_train], []
for i in range(len_train):
    y_train.append(y_train_bi[i] - 1) # Convert label from 1 and 2 to 0 and 1

len_valid = len(X_train_bi2) - len_train
X_valid, y_valid = X_train_bi2[len_train:], []
for i in range(len_valid):
    y_valid.append(y_train_bi[len_train + i] - 1)

X_test, y_test = X_test_bi2, y_test_bi

In [45]:
# Override the Dataset class
class Dataset(Dataset):
    
    def __init__(self, features, labels):
        'Initialization'
        self.features = features
        self.labels = labels
    
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.features)
    
    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        X = torch.tensor(self.features[index])
        y = self.labels[index]
        
        return X, y

In [46]:
# Generate training, validation and testing datasets
train_set = Dataset(X_train, y_train)
valid_set = Dataset(X_valid, y_valid)
test_set = Dataset(X_test, y_test)

# Generate dataloaders for the training, validation and testing datasets
train_loader = torch.utils.data.DataLoader(train_set, **params)
valid_loader = torch.utils.data.DataLoader(valid_set, **params)
test_loader = torch.utils.data.DataLoader(test_set, **params)

In [40]:
# Define the network architecture
# The same as before
class Net(nn.Module):
    def __init__(self, input_dim=300, output_dim=2, hidden_1=50, hidden_2=10):
        super(Net, self).__init__()
        # dimension of inputs
        self.input_dim = input_dim 
        # number of classes (2 for binary, 3 for ternary, ...)
        self.output_dim = output_dim
        # number of nodes in each hidden layer
        self.hidden_1 = hidden_1
        self.hidden_2 = hidden_2
        # linear layer (input --> hidden_1)
        self.fc1 = nn.Linear(self.input_dim, self.hidden_1)
        # linear layer (hidden_1 --> hidden_2)
        self.fc2 = nn.Linear(self.hidden_1, self.hidden_2)
        # linear layer (hidden_2 --> 2)
        self.fc3 = nn.Linear(self.hidden_2, self.output_dim)
        # dropout prevents overfitting of data
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        # add the 1st hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer after the 1st hidden layer
        x = self.dropout(x)
        # add the 2nd hidden layer, with relu activation function
        x = F.relu(self.fc2(x))
        # add dropout layer after the 2nd hidden layer
        x = self.dropout(x)
        # add output layer
        x = self.fc3(x)
        return x

In [41]:
# initialize the NN
model_bi2 = Net(input_dim=3000)
print(model_bi2)

Net(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [42]:
# specify cross entropy loss as loss function
criterion = nn.CrossEntropyLoss()

# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model_bi2.parameters(), lr=0.01)

In [None]:
# train the network

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(max_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model_bi2.train() # prep model for training
    for data, target in train_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_bi2(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model_bi2.eval() # prep model for evaluation
    for data, target in valid_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_bi2(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model_bi2.state_dict(), 'model_bi2.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.486026 	Validation Loss: 0.436483
Validation loss decreased (inf --> 0.436483).  Saving model ...
Epoch: 2 	Training Loss: 0.434841 	Validation Loss: 0.426167
Validation loss decreased (0.436483 --> 0.426167).  Saving model ...
Epoch: 3 	Training Loss: 0.414335 	Validation Loss: 0.422378
Validation loss decreased (0.426167 --> 0.422378).  Saving model ...
Epoch: 4 	Training Loss: 0.398915 	Validation Loss: 0.420361
Validation loss decreased (0.422378 --> 0.420361).  Saving model ...
Epoch: 5 	Training Loss: 0.385116 	Validation Loss: 0.421389
Epoch: 6 	Training Loss: 0.371048 	Validation Loss: 0.424653
Epoch: 7 	Training Loss: 0.358265 	Validation Loss: 0.427968
Epoch: 8 	Training Loss: 0.346016 	Validation Loss: 0.434038
Epoch: 9 	Training Loss: 0.332788 	Validation Loss: 0.442058
Epoch: 10 	Training Loss: 0.318841 	Validation Loss: 0.451648
Epoch: 11 	Training Loss: 0.308052 	Validation Loss: 0.461122


In [48]:
# Test the trained network
# The same as before
def predict(model, dataloader):
    prediction_list = []
    truth_list = []
    for i, batch in enumerate(dataloader):
        # batch[0] is X, and batch[1] is y
        outputs = model(batch[0])
        _, predicted = torch.max(outputs.data, 1)
        prediction_list.append(predicted)
        truth_list.append(batch[1])
    return prediction_list, truth_list

In [51]:
# Load model parameters from the trained model with the lowest validation loss
model_bi2.load_state_dict(torch.load('model_bi2.pt'))

# Use the trained model to predict the testing samples
# The output is a list of tensors and a tensor consists of the predicted labels
# of a single batch of testing samples
predictions, truths = predict(model_bi2, test_loader)
# Convert predictions and truths from list of tensors to list of lists
predictions = [list(torch.Tensor.numpy(t)) for t in predictions]
truths = [list(torch.Tensor.numpy(t)) for t in truths]
# Convert predictions and truths from list of lists to a single list
predictions = functools.reduce(lambda a, b: a + b, predictions)
truths = functools.reduce(lambda a, b: a + b, truths)
# Convert predictions from (0, 1) to (1, 2)
predictions = [p + 1 for p in predictions]

In [52]:
# report accuracy, precision, recall, and f1-score on the testing dataset

test_stats = precision_recall_fscore_support(truths, predictions, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of testing dataset: {:2.1%}'.format(accuracy_score(truths, predictions)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of testing dataset: 79.9%
The precision of testing dataset: 80.3%
The recall of testing dataset: 79.1%
The fscore of testing dataset: 79.7%


## 6. Recurrent Neural Networks

In [14]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import functools
from sklearn.metrics import accuracy_score 

In [15]:
# Enable CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

### Binary Classification

In [16]:
X_train_bi

['fry pan look good ha sticking problem hard clean yellow stain ',
 'like container except doe leak took one star still good nondrippy food two large boiled egg fit easily one container work well good dab hummus carrot stick frozen ice pack last several hour best insulated bag price high one containerthat vendor may include shipping help available local store amazon vendor price varycheck grand total price picture shown make lid seem hingedits',
 'love enhanced flavor water many recipe combination cant wait try',
 'thing worked well first wash machine sealant came loose water became trapped two layer guess hand wash want keep month',
 'sushi dish wonderful stoneware solid durable piece well made design doe rough texture may bother people like chopstick basic well crafted functional overall great set couple like eat sushi japanese cuisine',
 'overpricedto smalldid keep drink cold',
 'cool great money convenient way pop open ice cold nice job island dog',
 'work well look great counter c

In [17]:
# Split the original training dataset into one training and one validation set
# The original labels are 1 and 2, and we need to convert them to 0 and 1,
# because in pytorch for binary classification, the labels should be 0 and 1.
len_train = int(0.8 * len(X_train_bi))
X_train_rnn, y_train_rnn = X_train_bi[:len_train], []
for i in range(len_train):
    y_train_rnn.append(y_train_bi[i] - 1) # Convert label from 1 and 2 to 0 and 1

len_valid = len(X_train_bi) - len_train
X_valid_rnn, y_valid_rnn = X_train_bi[len_train:], []
for i in range(len_valid):
    y_valid_rnn.append(y_train_bi[len_train + i] - 1)

X_test_rnn, y_test_rnn = X_test_bi, y_test_bi

In [18]:
# Override the Dataset class
class DatasetRNN(Dataset):
    
    def __init__(self, features, labels):
        'Initialization'
        self.features = features
        self.labels = labels
    
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.features)
    
    def __getitem__(self, index):
        'Generates one sample of data'
        # Select a review (string)
        s = self.features[index]
        
        # convert the string to a list of words
        s = s.split(' ')
        
        wordVec = []
        # To reduce the computational burden, I am gonna truncate each review to
        # 20 words instead of 50 words.
        for i in range(20):
            try:
                wv = model2[s[i]]
                # convert the word vector to a tensor
                wordVec.append(wv)
            except:
                pass
        if len(wordVec) < 20:
            for _ in range(20-len(wordVec)):
                wordVec.append([0 for _ in range(300)])
        
        X = torch.Tensor(wordVec)
        y = self.labels[index]
        
        return X, y

In [19]:
# Hyperparameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 0}
max_epochs = 50

In [20]:
# Generate training, validation and testing datasets
train_set = DatasetRNN(X_train_rnn, y_train_rnn)
valid_set = DatasetRNN(X_valid_rnn, y_valid_rnn)
test_set = DatasetRNN(X_test_rnn, y_test_rnn)

# Generate dataloaders for the training, validation and testing datasets
train_loader = torch.utils.data.DataLoader(train_set, **params)
valid_loader = torch.utils.data.DataLoader(valid_set, **params)
test_loader = torch.utils.data.DataLoader(test_set, **params)

In [41]:
# Define the network architecture
class RNN(nn.Module):
    def __init__(self, input_dim=300, hidden_dim=50, output_dim=2):
        super(RNN, self).__init__()
        # dimension of inputs
        self.input_dim = input_dim 
        # number of classes (2 for binary, 3 for ternary, ...)
        self.output_dim = output_dim
        # number of nodes in the hidden layer
        self.hidden_dim = hidden_dim
        # linear layer (input --> hidden)
        self.fc1 = nn.Linear(self.input_dim + self.hidden_dim, self.hidden_dim)
        # linear layer (hidden --> output)
        self.fc2 = nn.Linear(self.input_dim + self.hidden_dim, self.output_dim)
    
    def forward(self, wordVec):
        # for each iteration, concatenate the input word vector and the output
        # of the previous hidden layer as the 'total' input of the current
        # hidden layer
        
        # only use the output layer to calculate the output when it comes to
        # the last word (i.e. the 20th word in each review)
        
        # initialize a zero tensor of shape (batch_size * hidden dimension)
        batch_size = wordVec.shape[0]
        hidden = torch.zeros(batch_size, self.hidden_dim)
        combined = torch.cat((wordVec[:,0,], hidden), 1)
        for i in range(1, 20):
            hidden = self.fc1(combined)
            combined = torch.cat((wordVec[:,i,], hidden), 1)
        output = self.fc2(combined)
       
        return output
    
    '''def initHidden(self, batch_size):
        # initialize a zero tensor of shape (batch_size * hidden dimension)
        return torch.zeros(batch_size, self.hidden_dim)'''

In [42]:
# initialize the RNN
model_rnn = RNN()
print(model_rnn)

RNN(
  (fc1): Linear(in_features=350, out_features=50, bias=True)
  (fc2): Linear(in_features=350, out_features=2, bias=True)
)


In [43]:
# specify cross entropy loss as loss function
criterion = nn.CrossEntropyLoss()

# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model_rnn.parameters(), lr=0.005)

In [44]:
# train the network

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(max_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model_rnn.train() # prep model for training
    for data, target in train_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        '''hidden = model_rnn.initHidden(64)'''
        output = model_rnn(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model_rnn.eval() # prep model for evaluation
    for data, target in valid_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        '''hidden = model_rnn.initHidden(64)'''
        output = model_rnn(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model_rnn.state_dict(), 'model_rnn.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.668292 	Validation Loss: 0.640593
Validation loss decreased (inf --> 0.640593).  Saving model ...
Epoch: 2 	Training Loss: 0.440111 	Validation Loss: 0.411271
Validation loss decreased (0.640593 --> 0.411271).  Saving model ...
Epoch: 3 	Training Loss: 0.398765 	Validation Loss: 0.395794
Validation loss decreased (0.411271 --> 0.395794).  Saving model ...
Epoch: 4 	Training Loss: 0.397220 	Validation Loss: 0.394400
Validation loss decreased (0.395794 --> 0.394400).  Saving model ...
Epoch: 5 	Training Loss: 0.396308 	Validation Loss: 0.409668
Epoch: 6 	Training Loss: 0.396186 	Validation Loss: 0.395039
Epoch: 7 	Training Loss: 0.395135 	Validation Loss: 0.398773
Epoch: 8 	Training Loss: 0.395456 	Validation Loss: 0.403708
Epoch: 9 	Training Loss: 0.394970 	Validation Loss: 0.400147
Epoch: 10 	Training Loss: 0.394706 	Validation Loss: 0.393275
Validation loss decreased (0.394400 --> 0.393275).  Saving model ...
Epoch: 11 	Training Loss: 0.393877 	Validation Lo

In [49]:
# Test the trained network
# The same as before
def predict(model, dataloader):
    prediction_list = []
    truth_list = []
    for i, batch in enumerate(dataloader):
        # batch[0] is X, and batch[1] is y
        outputs = model(batch[0])
        _, predicted = torch.max(outputs.data, 1)
        prediction_list.append(predicted)
        truth_list.append(batch[1])
    return prediction_list, truth_list

In [50]:
# Load model parameters from the trained model with the lowest validation loss
model_rnn.load_state_dict(torch.load('model_rnn.pt'))

# Use the trained model to predict the testing samples
# The output is a list of tensors and a tensor consists of the predicted labels
# of a single batch of testing samples
predictions, truths = predict(model_rnn, test_loader)
# Convert predictions and truths from list of tensors to list of lists
predictions = [list(torch.Tensor.numpy(t)) for t in predictions]
truths = [list(torch.Tensor.numpy(t)) for t in truths]
# Convert predictions and truths from list of lists to a single list
predictions = functools.reduce(lambda a, b: a + b, predictions)
truths = functools.reduce(lambda a, b: a + b, truths)
# Convert predictions from (0, 1) to (1, 2)
predictions = [p + 1 for p in predictions]

In [51]:
# report accuracy, precision, recall, and f1-score on the testing dataset

test_stats = precision_recall_fscore_support(truths, predictions, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of testing dataset: {:2.1%}'.format(accuracy_score(truths, predictions)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of testing dataset: 83.0%
The precision of testing dataset: 82.8%
The recall of testing dataset: 83.3%
The fscore of testing dataset: 83.0%


## It can be shown that RNN (with accuracy of 83.0%) outperforms FNN (with accuracy of 79.9%) on the same binary classification problem.

## 7. Gated Recurrent Units

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import functools
from sklearn.metrics import accuracy_score 

In [21]:
# Enable CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

### Binary Classification

In [22]:
X_train_bi

['ive mug le six month loved first couple month really disappointed couple thing bad another thermos mug ha lasted year first big problem terrible odor mug developed cannot get rid matter many time wash soak baking soda etc second issue lid annoying clean hollow inside fill water youre washing split two piece put back together seems like bit hassle id able overlook lid fact pick odor easily make unusable point',
 'ordered last year started using one stopped week second one lasted week amazon ha return policy expired contact manufacturer black decker authenticated year warranty date cut plug send along get another coffee maker using one month manufacturer way get year warranty honoredi say pot awesome want fresh cup coffee time take everywhere go pleased many year one even making cup per day lasted almost year',
 'gave gift wedding wish list wa appreciative',
 'thankfully mug wa cracked wa inside nothing shipping look like wa made way love color kid everybody need color ',
 'bought repl

In [23]:
# Split the original training dataset into one training and one validation set
# The original labels are 1 and 2, and we need to convert them to 0 and 1,
# because in pytorch for binary classification, the labels should be 0 and 1.
len_train = int(0.8 * len(X_train_bi))
X_train_gru, y_train_gru = X_train_bi[:len_train], []
for i in range(len_train):
    y_train_gru.append(y_train_bi[i] - 1) # Convert label from 1 and 2 to 0 and 1

len_valid = len(X_train_bi) - len_train
X_valid_gru, y_valid_gru = X_train_bi[len_train:], []
for i in range(len_valid):
    y_valid_gru.append(y_train_bi[len_train + i] - 1)

X_test_gru, y_test_gru = X_test_bi, y_test_bi

In [24]:
# Override the Dataset class
class DatasetGRU(Dataset):
    
    def __init__(self, features, labels):
        'Initialization'
        self.features = features
        self.labels = labels
    
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.features)
    
    def __getitem__(self, index):
        'Generates one sample of data'
        # Select a review (string)
        s = self.features[index]
        
        # convert the string to a list of words
        s = s.split(' ')
        
        wordVec = []
        # To reduce the computational burden, I am gonna truncate each review to
        # 20 words instead of 50 words.
        for i in range(20):
            try:
                wv = model2[s[i]]
                # convert the word vector to a tensor
                wordVec.append(wv)
            except:
                pass
        if len(wordVec) < 20:
            for _ in range(20-len(wordVec)):
                wordVec.append([0 for _ in range(300)])
        
        X = torch.Tensor(wordVec)
        y = self.labels[index]
        
        return X, y

In [25]:
# Hyperparameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 0}
max_epochs = 50

In [26]:
# Generate training, validation and testing datasets
train_set = DatasetGRU(X_train_gru, y_train_gru)
valid_set = DatasetGRU(X_valid_gru, y_valid_gru)
test_set = DatasetGRU(X_test_gru, y_test_gru)

# Generate dataloaders for the training, validation and testing datasets
train_loader = torch.utils.data.DataLoader(train_set, **params)
valid_loader = torch.utils.data.DataLoader(valid_set, **params)
test_loader = torch.utils.data.DataLoader(test_set, **params)

In [74]:
# Define the network architecture
# Refer to page 188 of ed3book for the details of GRU
class GRU(nn.Module):
    def __init__(self, input_dim=300, hidden_dim=50, output_dim=2):
        super(GRU, self).__init__()
        # dimension of inputs
        self.input_dim = input_dim 
        # number of classes (2 for binary, 3 for ternary, ...)
        self.output_dim = output_dim
        # number of nodes in the hidden layer
        self.hidden_dim = hidden_dim
        # linear layer for reset gate
        self.fc1 = nn.Linear(self.input_dim + self.hidden_dim, self.hidden_dim)
        # linear layer for update gate
        self.fc2 = nn.Linear(self.input_dim + self.hidden_dim, self.hidden_dim)
        # linear layer for intermediate hidden state
        self.fc3 = nn.Linear(self.input_dim + self.hidden_dim, self.hidden_dim)
        # linear layer for output
        self.fc4 = nn.Linear(self.input_dim + self.hidden_dim, self.output_dim)
    
    def forward(self, wordVec):
        # for each iteration, concatenate the input word vector and the output
        # of the previous hidden layer as the 'total' input of the current
        # hidden layer
        
        # only use the output layer to calculate the output when it comes to
        # the last word (i.e. the 20th word in each review)
        
        # initialize a zero tensor of shape (batch_size * hidden dimension)
        batch_size = wordVec.shape[0]
        hidden = torch.zeros(batch_size, self.hidden_dim)
        combined = torch.cat((wordVec[:,0,], hidden), 1)
        sigmoid, tanh = nn.Sigmoid(), nn.Tanh()
        for i in range(1, 20):
            # reset gate
            reset = self.fc1(combined)
            reset = sigmoid(reset)
            # update gate
            update = self.fc2(combined)
            update = sigmoid(update)
            # previous hidden state passes through reset gate
            hidden2 = torch.mul(reset, hidden)
            # intermediate hidden state
            combined2 = torch.cat((wordVec[:,0,], hidden2), 1)
            hidden_inter = self.fc3(combined2)
            hidden_inter = tanh(hidden_inter)
            # hidden state
            hidden = torch.mul(1-update, hidden) + torch.mul(update, hidden_inter)
            combined = torch.cat((wordVec[:,i,], hidden), 1)
        output = self.fc4(combined)
       
        return output

In [75]:
# initialize the RNN
model_gru = GRU()
print(model_gru)

GRU(
  (fc1): Linear(in_features=350, out_features=50, bias=True)
  (fc2): Linear(in_features=350, out_features=50, bias=True)
  (fc3): Linear(in_features=350, out_features=50, bias=True)
  (fc4): Linear(in_features=350, out_features=2, bias=True)
)


In [76]:
# specify cross entropy loss as loss function
criterion = nn.CrossEntropyLoss()

# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model_gru.parameters(), lr=0.005)

In [77]:
# train the network

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity

for epoch in range(max_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model_gru.train() # prep model for training
    for data, target in train_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_gru(data)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model_gru.eval() # prep model for evaluation
    for data, target in valid_loader:
        # transfer to GPU
        #data, target = data.to(device), target.to(device)
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_gru(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model_gru.state_dict(), 'model_gru.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.587332 	Validation Loss: 0.568845
Validation loss decreased (inf --> 0.568845).  Saving model ...
Epoch: 2 	Training Loss: 0.563591 	Validation Loss: 0.564967
Validation loss decreased (0.568845 --> 0.564967).  Saving model ...
Epoch: 3 	Training Loss: 0.559337 	Validation Loss: 0.563758
Validation loss decreased (0.564967 --> 0.563758).  Saving model ...
Epoch: 4 	Training Loss: 0.557626 	Validation Loss: 0.560760
Validation loss decreased (0.563758 --> 0.560760).  Saving model ...
Epoch: 5 	Training Loss: 0.556185 	Validation Loss: 0.562119
Epoch: 6 	Training Loss: 0.555508 	Validation Loss: 0.559465
Validation loss decreased (0.560760 --> 0.559465).  Saving model ...
Epoch: 7 	Training Loss: 0.554636 	Validation Loss: 0.558504
Validation loss decreased (0.559465 --> 0.558504).  Saving model ...
Epoch: 8 	Training Loss: 0.553820 	Validation Loss: 0.558705
Epoch: 9 	Training Loss: 0.553356 	Validation Loss: 0.558221
Validation loss decreased (0.558504 --> 0.

In [78]:
# Test the trained network
# The same as before
def predict(model, dataloader):
    prediction_list = []
    truth_list = []
    for i, batch in enumerate(dataloader):
        # batch[0] is X, and batch[1] is y
        outputs = model(batch[0])
        _, predicted = torch.max(outputs.data, 1)
        prediction_list.append(predicted)
        truth_list.append(batch[1])
    return prediction_list, truth_list

In [79]:
# Load model parameters from the trained model with the lowest validation loss
model_gru.load_state_dict(torch.load('model_gru.pt'))

# Use the trained model to predict the testing samples
# The output is a list of tensors and a tensor consists of the predicted labels
# of a single batch of testing samples
predictions, truths = predict(model_gru, test_loader)
# Convert predictions and truths from list of tensors to list of lists
predictions = [list(torch.Tensor.numpy(t)) for t in predictions]
truths = [list(torch.Tensor.numpy(t)) for t in truths]
# Convert predictions and truths from list of lists to a single list
predictions = functools.reduce(lambda a, b: a + b, predictions)
truths = functools.reduce(lambda a, b: a + b, truths)
# Convert predictions from (0, 1) to (1, 2)
predictions = [p + 1 for p in predictions]

In [80]:
# report accuracy, precision, recall, and f1-score on the testing dataset

test_stats = precision_recall_fscore_support(truths, predictions, average='binary')
precision_test, recall_test, fscore_test = test_stats[0], test_stats[1], test_stats[2]

print('The accuracy of testing dataset: {:2.1%}'.format(accuracy_score(truths, predictions)))
print('The precision of testing dataset: {:2.1%}'.format(precision_test))
print('The recall of testing dataset: {:2.1%}'.format(recall_test))
print('The fscore of testing dataset: {:2.1%}'.format(fscore_test))

The accuracy of testing dataset: 81.5%
The precision of testing dataset: 81.7%
The recall of testing dataset: 81.2%
The fscore of testing dataset: 81.4%
