## Module Import

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import pandas as pd
import re
from torch.utils.data import TensorDataset
import gensim
import gensim.downloader as api
from tqdm import tqdm


## 1. Dataset Generation 

In [None]:
def cleaning(s):
    """Normalise whitespace in a review and flag reviews that are too short.

    Every run of whitespace characters is collapsed into a single space. When
    the collapsed text splits into fewer than 15 space-separated tokens, the
    sentinel string ``'!'`` is returned instead so the caller can filter the
    row out; otherwise the collapsed text is returned.
    """
    collapsed = re.sub(r'\s+', ' ', s)
    token_count = len(collapsed.split(' '))
    return collapsed if token_count >= 15 else '!'

# Alternative input location used in HW1:
#   '../HW1/data/amazon_reviews_us_Office_Products_v1_00.tsv'
# Only the rating and the review text are read (both as strings); malformed
# lines are skipped and rows with a missing value are dropped.
df = pd.read_table('data.tsv', on_bad_lines='skip', usecols=['star_rating', 'review_body'], dtype={'star_rating':'str', 'review_body':'str'}).dropna()

# `cleaning` collapses whitespace and returns the sentinel '!' for reviews
# that are too short to keep.
df['content'] = df['review_body'].apply(cleaning)

# Binary label: ratings above 3 stars -> 1, everything else -> 0.
# Vectorized replacement for the former row-by-row iterrows() loop, which is
# very slow on a frame of this size and produced the same values.
df['class_label'] = (df['star_rating'].astype(int) > 3).astype(int)

# Drop the reviews flagged as too short and persist the cleaned frame so later
# runs can reload it instead of re-parsing the raw TSV.
df2 = df[df['content'] != '!']
df2.to_csv('cleaned_data.csv', index=False)

In [None]:
# Optional shortcut: reload the cleaned frame written by the previous cell
# instead of rebuilding it from the raw TSV.
#df2 = pd.read_csv('cleaned_data.csv')
# Class-balanced subset: 50,000 rows are sampled from each class_label group,
# with a fixed random_state so the sample is reproducible.
df3 = df2.groupby("class_label").sample(n = 50000, random_state=1)

In [None]:
# Tokenise every sampled review on single spaces; the resulting list of token
# lists is the corpus format handed to gensim's Word2Vec below.
sentences = [review.split(' ') for review in df3['content']]

## 2. Word Embedding

### (a)

In [None]:
# Pretrained embeddings loaded through gensim's downloader API. `wv` is also
# read by the feature-extraction functions defined in later cells.
wv = api.load('word2vec-google-news-300')
# Word pairs whose similarity is printed for each embedding model.
pairs = [
    ('great', 'good'),
    ('large', 'huge'),
    ('concern', 'worry')   
]
# One line per pair: both words (repr) and their similarity to two decimals.
for w1, w2 in pairs:
    print('%r, %r, %.2f' % (w1, w2, wv.similarity(w1, w2)))

### (b)

In [None]:
# Train a Word2Vec model on the tokenised reviews: 300-dimensional vectors,
# context window of 13, and words seen fewer than 9 times are ignored.
# NOTE(review): `model` is rebound to scikit-learn classifiers in later cells,
# so these embeddings are only reachable until those cells run.
model = gensim.models.Word2Vec(sentences=sentences, vector_size=300, window=13, min_count=9)
# Same pairs as for the pretrained model, so the two outputs can be compared.
for w1, w2 in pairs:
    print('%r, %r, %.2f' % (w1, w2, model.wv.similarity(w1, w2)))

### Q: What do you conclude from comparing vectors generated by yourself and the pretrained model? <br>
Ans: Comparing the similarity scores, the pretrained model assigns a higher similarity to every pair of related words than the model trained on our own data. I assume this difference comes from the training data: the pretrained model's corpus is likely more varied, whereas our Amazon review data is comparatively narrow (it consists entirely of product feedback). <br>
### Q: Which of the Word2Vec models seems to encode semantic similarities between words better?<br>
Ans: The pretrained "word2vec-google-news-300" model.

## 3. Simple models

Split dataset

In [None]:
# Labels and cleaned review texts of the balanced sample as NumPy arrays.
label_V = df3['class_label'].to_numpy()
content_V = df3['content'].to_numpy()
# TF-IDF features limited to 1000 terms (max_features); .toarray() converts
# the sparse result into a dense array.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_V = tfidf_vectorizer.fit_transform(content_V).toarray()
print(tfidf_V.shape)

In [None]:
def word2V_features(content_V, embeddings=None, dim=None):
    """Build one mean-pooled word-embedding vector per review.

    Each review is split on single spaces, every token found in the embedding
    table is looked up, and the vectors are averaged. Tokens missing from the
    table are skipped.

    Parameters
    ----------
    content_V : iterable of str
        The review texts.
    embeddings : mapping, optional
        Anything supporting ``embeddings[word]`` that raises ``KeyError`` for
        unknown words. Defaults to the notebook-level pretrained ``wv``.
    dim : int, optional
        Embedding dimensionality used for the fallback vector. Defaults to
        ``embeddings.vector_size`` when that attribute exists, else 300.

    Returns
    -------
    numpy.ndarray
        One row per review.

    Notes
    -----
    A review with no known token used to produce ``np.mean`` of an empty
    array (a scalar NaN plus a RuntimeWarning), which made the stacked result
    ragged. Such reviews now map to an all-zero float32 vector.
    """
    if embeddings is None:
        embeddings = wv
    if dim is None:
        dim = getattr(embeddings, 'vector_size', 300)

    res = []
    for s in content_V:
        vectors = []
        for w in s.split(' '):
            try:
                vectors.append(embeddings[w])
            except KeyError:
                # Out-of-vocabulary token: contributes nothing to the mean.
                pass
        if vectors:
            res.append(np.mean(np.array(vectors), axis=0))
        else:
            res.append(np.zeros(dim, dtype=np.float32))
    return np.array(res)
# Mean-pooled Word2Vec features for every sampled review, then a shape check.
word2V_V = word2V_features(content_V)
print(word2V_V.shape)

In [None]:
# 80/20 train/test split for each feature type. Both calls share the labels,
# test_size and random_state, so presumably the two feature sets are
# partitioned over the same rows — TODO confirm if the results are compared
# row by row.
word2V_train, word2V_test, word2V_label_train, word2V_label_test = train_test_split(word2V_V, label_V, test_size=0.2, random_state=1)
tfidf_train, tfidf_test, tfidf_label_train, tfidf_label_test = train_test_split(tfidf_V, label_V, test_size=0.2, random_state=1)

## Perceptron

In [None]:
def calculate_result(tn, fp, fn, tp):
    """Derive accuracy, precision, recall and F1 from confusion-matrix counts.

    Parameters
    ----------
    tn, fp, fn, tp : int
        True negatives, false positives, false negatives and true positives.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1)``.

    Notes
    -----
    Any ratio whose denominator is zero is reported as ``0.0`` rather than
    raising ``ZeroDivisionError`` (for example when the classifier predicts
    no positives at all). F1 is computed as ``2 * p * r / (p + r)``, which is
    algebraically the same harmonic mean as before but is defined when
    precision or recall is 0.
    """
    total = tp + tn + fp + fn
    acc = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    denom = precision + recall
    f1 = 2 * precision * recall / denom if denom else 0.0
    return acc, precision, recall, f1
    
# Perceptron on the mean-pooled Word2Vec features.
model = Perceptron(max_iter=1000)
model.fit(word2V_train, word2V_label_train)
predictions_test = model.predict(word2V_test)
# ravel() flattens the 2x2 confusion matrix into the four counts unpacked here.
tn, fp, fn, tp = confusion_matrix(word2V_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
# Only accuracy is reported; precision, recall and f1 are computed but unused.
print("Word2Vec Perceptron accuracy: {:.3f}%".format(acc*100))

In [None]:
# Perceptron on the TF-IDF features (same procedure as the Word2Vec cell).
model = Perceptron(max_iter=1000)
model.fit(tfidf_train, tfidf_label_train)
predictions_test = model.predict(tfidf_test)
# ravel() flattens the 2x2 confusion matrix into the four counts unpacked here.
tn, fp, fn, tp = confusion_matrix(tfidf_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
# Only accuracy is reported; precision, recall and f1 are computed but unused.
print("TF-IDF Perceptron accuracy: {:.3f}%".format(acc*100))

## SVM

In [None]:
# NOTE(review): this import sits outside the notebook's import cell; consider
# moving it there so all dependencies are visible at the top.
from sklearn.svm import LinearSVC
# Linear SVM on the mean-pooled Word2Vec features.
model = LinearSVC(max_iter=1000)
model.fit(word2V_train, word2V_label_train)
predictions_test = model.predict(word2V_test)
# ravel() flattens the 2x2 confusion matrix into the four counts unpacked here.
tn, fp, fn, tp = confusion_matrix(word2V_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
# Only accuracy is reported; precision, recall and f1 are computed but unused.
print("Word2Vec SVM accuracy: {:.3f}%".format(acc*100))

In [None]:
# Linear SVM on the TF-IDF features (same procedure as the Word2Vec cell).
model = LinearSVC(max_iter=1000)
model.fit(tfidf_train, tfidf_label_train)
predictions_test = model.predict(tfidf_test)
# ravel() flattens the 2x2 confusion matrix into the four counts unpacked here.
tn, fp, fn, tp = confusion_matrix(tfidf_label_test, predictions_test).ravel()
acc, precision, recall, f1 = calculate_result(tn, fp, fn, tp)
# Only accuracy is reported; precision, recall and f1 are computed but unused.
print("TF-IDF SVM accuracy: {:.3f}%".format(acc*100))

### Q: What do you conclude from comparing performances for the models trained using the two different feature types? <br>
Ans: TF-IDF features give better accuracy than Word2Vec features for both models. With Word2Vec features, the SVM performs better than the perceptron.

## 4. Feedforward Neural Networks

In [None]:
class myDataset(Dataset):
    """Map-style dataset that pairs feature arrays with their labels.

    ``data`` and ``labels`` are indexed in parallel. Each feature is cast to
    float32 and reshaped into a single column of shape ``(-1, 1)`` before the
    optional ``transform`` callable is applied to it.
    """

    def __init__(self, data, labels, transform=None):
        self.data, self.labels, self.transform = data, labels, transform

    def __len__(self):
        # The number of samples is the length of the feature container.
        return len(self.data)

    def __getitem__(self, index):
        column = self.data[index].astype('float32').reshape((-1, 1))
        target = self.labels[index]
        if self.transform is None:
            return column, target
        return self.transform(column), target

In [None]:
class TwoLayerMLP(nn.Module):
    """Feed-forward classifier with two hidden ReLU layers and dropout.

    Parameters
    ----------
    D_in : int
        Number of input features; inputs are flattened to ``(-1, D_in)``.
    H1, H2 : int
        Sizes of the first and second hidden layers.
    D_out : int
        Number of output classes.

    Notes
    -----
    ``forward`` returns raw class scores (logits). The previous version
    applied ``Softmax`` before returning, but this notebook trains the model
    with ``nn.CrossEntropyLoss``, which applies log-softmax itself; the
    double softmax squashed the scores and weakened the gradients. The
    arg-max prediction is unchanged by this fix. ``self.softmax`` is kept so
    callers can still obtain probabilities via ``model.softmax(model(x))``.
    """

    def __init__(self, D_in, H1, H2, D_out):
        super().__init__()
        self.D_in = D_in
        self.fc1 = nn.Linear(D_in, H1)
        self.fc2 = nn.Linear(H1, H2)
        self.fc3 = nn.Linear(H2, D_out)
        self.dropout = nn.Dropout(0.2)
        # Not used inside forward(); available for turning logits into
        # probabilities at inference time.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits of shape ``(batch, D_out)``."""
        # The dataset yields features with extra singleton dimensions, so
        # flatten everything but the batch dimension.
        x = x.view(-1, self.D_in)
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        return self.fc3(x)

In [None]:
# DataLoader settings shared by every feed-forward experiment. Note that
# shuffle=True is applied to the test loader as well.
params = {'batch_size': 32, 'shuffle': True}

# Wrap the mean-pooled Word2Vec splits; ToTensor converts each reshaped
# feature column into a torch tensor.
train_data = myDataset(word2V_train, word2V_label_train, transform=transforms.ToTensor())
test_data = myDataset(word2V_test, word2V_label_test, transform=transforms.ToTensor())
train_data_generator = DataLoader(train_data, **params)
test_data_generator = DataLoader(test_data, **params)

### (a)

In [None]:
# 300 input features -> hidden layers of 50 and 5 units -> 2 output classes.
MLPmodel = TwoLayerMLP(300, 50, 5, 2)
print(MLPmodel)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(MLPmodel.parameters(), lr=0.01)
# Reduces the learning rate when the monitored value (mode 'min') has not
# improved for 5 consecutive scheduler steps.
scheduler = ReduceLROnPlateau(optimizer, 'min', patience = 5) 

In [None]:
# Warm-start from an earlier checkpoint when one can be loaded. A failure is
# reported instead of being swallowed silently, and training then starts from
# the current weights.
try:
    MLPmodel.load_state_dict(torch.load('model_4a.pt'))
except Exception as err:
    print("Checkpoint 'model_4a.pt' not loaded, training from current weights:", err)

max_epochs = 100
min_loss = np.inf  # np.Inf was removed in NumPy 2.0
for epoch in tqdm(range(max_epochs)):
    # Re-enable dropout every epoch: the eval() call below would otherwise
    # leave the model in evaluation mode for all epochs after the first.
    MLPmodel.train()
    for data, target in train_data_generator:
        optimizer.zero_grad()
        output = MLPmodel(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

    # Held-out loss, averaged per sample. No gradients are needed here.
    MLPmodel.eval()
    test_loss = 0.0
    with torch.no_grad():
        for data, target in test_data_generator:
            output = MLPmodel(data)
            loss = criterion(output, target)
            test_loss += loss.item()*data.size(0)
    test_loss = test_loss/len(test_data_generator.dataset)

    # The loss is already a per-sample mean, so it is passed to the scheduler
    # without dividing by the number of batches a second time.
    scheduler.step(test_loss)

    # Checkpoint whenever the held-out loss improves. min_loss starts at
    # infinity, so the first epoch is saved too (it used to be skipped).
    if test_loss < min_loss:
        min_loss = test_loss
        torch.save(MLPmodel.state_dict(), 'model_4a.pt')
        print("Model saved. Loss = ", min_loss)
    

In [None]:
def cal_acc(MLPmodel, dataloader):
    """Return the fraction of samples whose arg-max prediction equals the target.

    Parameters
    ----------
    MLPmodel : callable
        Maps a batch of inputs to a 2-D tensor of per-class scores.
    dataloader : iterable
        Yields ``(data, target)`` batches; ``target`` is a 1-D tensor of
        class indices.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``; ``0.0`` when the loader yields no samples
        (previously a ``ZeroDivisionError``).
    """
    total = 0
    match = 0
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for data, target in dataloader:
            outputs = MLPmodel(data)
            _, predicted = torch.max(outputs, 1)
            # Count matches for the whole batch at once instead of looping
            # over individual elements in Python.
            match += (predicted == target).sum().item()
            total += target.size(0)
    return match / total if total else 0.0

# Evaluate the best checkpoint when it can be loaded. If loading fails, the
# reason is printed and the weights currently in memory are evaluated.
try:
    MLPmodel.load_state_dict(torch.load('model_4a.pt'))
except Exception as err:
    print("Checkpoint 'model_4a.pt' not loaded, evaluating in-memory weights:", err)
MLPmodel.eval()
acc = cal_acc(MLPmodel, test_data_generator)
print("Word2Vec Feedforward Neural Networks accuracy: {:.3f}%".format(acc*100))

### (b)

In [None]:
def concat_word2V_features(content_V, embeddings=None, max_len=10, dim=None):
    """Stack the embeddings of the first ``max_len`` known words of each review.

    Each review is split on single spaces. Tokens missing from the embedding
    table are skipped, the first ``max_len`` known tokens are kept, and
    shorter reviews are padded with zero vectors.

    Parameters
    ----------
    content_V : iterable of str
        The review texts.
    embeddings : mapping, optional
        Anything supporting ``embeddings[word]`` that raises ``KeyError`` for
        unknown words. Defaults to the notebook-level pretrained ``wv``.
    max_len : int, optional
        Number of word vectors kept per review (default 10).
    dim : int, optional
        Embedding dimensionality. Defaults to ``embeddings.vector_size`` when
        that attribute exists, else 300.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(n_reviews, max_len, dim)``.

    Notes
    -----
    Padding used to be lists of Python ints, which made NumPy upcast the
    result to float64 whenever any review needed padding (twice the memory,
    and a dtype that depended on the data). Padding is now float32 zeros, so
    the values are the same and the dtype is always float32.
    """
    if embeddings is None:
        embeddings = wv
    if dim is None:
        dim = getattr(embeddings, 'vector_size', 300)

    res = []
    for s in content_V:
        cur = []
        for w in s.split(' '):
            if len(cur) >= max_len:
                break
            try:
                cur.append(np.asarray(embeddings[w], dtype=np.float32))
            except KeyError:
                # Out-of-vocabulary token: skipped, does not use up a slot.
                pass
        # Right-pad short reviews with zero vectors.
        cur.extend(np.zeros(dim, dtype=np.float32) for _ in range(max_len - len(cur)))
        res.append(np.array(cur, dtype=np.float32))
    if not res:
        return np.zeros((0, max_len, dim), dtype=np.float32)
    return np.array(res, dtype=np.float32)

# First-10-words features for every sampled review, then a shape check.
concat_word2Vec_V = concat_word2V_features(content_V)
print(concat_word2Vec_V.shape)
# 80/20 split. Note random_state=32 here, whereas the mean-feature split
# above uses random_state=1.
concat_word2V_train, concat_word2V_test, concat_word2V_label_train, concat_word2V_label_test = train_test_split(concat_word2Vec_V, label_V, test_size=0.2, random_state=32)
# myDataset reshapes each sample into a single column before ToTensor is applied.
concat_train_data = myDataset(concat_word2V_train, concat_word2V_label_train, transform=transforms.ToTensor())
concat_test_data = myDataset(concat_word2V_test, concat_word2V_label_test, transform=transforms.ToTensor())
# Reuses the batch_size/shuffle settings from `params` defined earlier.
concat_train_data_generator = DataLoader(concat_train_data, **params)
concat_test_data_generator = DataLoader(concat_test_data, **params)

In [None]:
# 10 words x 300 dimensions = 3000 input features -> 50 -> 5 -> 2 classes.
concat_MLPmodel = TwoLayerMLP(3000, 50, 5, 2)
concat_criterion = nn.CrossEntropyLoss()
concat_optimizer = torch.optim.SGD(concat_MLPmodel.parameters(), lr=0.1)
concat_scheduler = ReduceLROnPlateau(concat_optimizer, 'min', patience = 5)

# Warm-start from an earlier checkpoint when one can be loaded. A failure is
# reported instead of being swallowed silently.
try:
    concat_MLPmodel.load_state_dict(torch.load('model_4b.pt'))
except Exception as err:
    print("Checkpoint 'model_4b.pt' not loaded, training from current weights:", err)

max_epochs = 100
min_loss = np.inf  # np.Inf was removed in NumPy 2.0
for epoch in tqdm(range(max_epochs)):
    concat_MLPmodel.train()
    for data, target in concat_train_data_generator:
        concat_optimizer.zero_grad()
        output = concat_MLPmodel(data)
        loss = concat_criterion(output, target)
        loss.backward()
        concat_optimizer.step()

    # Held-out loss, averaged per sample. No gradients are needed here.
    concat_MLPmodel.eval()
    test_loss = 0.0
    with torch.no_grad():
        for data, target in concat_test_data_generator:
            output = concat_MLPmodel(data)
            loss = concat_criterion(output, target)
            test_loss += loss.item()*data.size(0)
    test_loss = test_loss/len(concat_test_data_generator.dataset)

    # The loss is already a per-sample mean, so it is passed to the scheduler
    # without dividing by the number of batches a second time.
    concat_scheduler.step(test_loss)

    # Checkpoint whenever the held-out loss improves. min_loss starts at
    # infinity, so the first epoch is saved too (it used to be skipped).
    if test_loss < min_loss:
        min_loss = test_loss
        torch.save(concat_MLPmodel.state_dict(), 'model_4b.pt')
        print("Model saved. Loss = ", min_loss)

# Evaluate the best checkpoint; fall back to the in-memory weights (and say
# so) when it cannot be loaded.
try:
    concat_MLPmodel.load_state_dict(torch.load('model_4b.pt'))
except Exception as err:
    print("Checkpoint 'model_4b.pt' not loaded, evaluating in-memory weights:", err)

concat_MLPmodel.eval()
acc = cal_acc(concat_MLPmodel, concat_test_data_generator)
print("Concat Word2Vec Feedforward Neural Networks accuracy: {:.3f}%".format(acc*100))

### Q: What do you conclude by comparing accuracy values you obtain with those obtained in the "Simple Models" section? <br>
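Ans: The "mean" features give a better result than the "concatenate" features. According to the overall accuracy summary at the end of this notebook, the mean-feature network (79.8%) is on par with the SVM (79.9%) and above the perceptron (76.9%), while the concatenated-feature network (71.9%) is below both.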


## 5. Recurrent Neural Networks

## (a)

In [None]:
def rnn_word2V_features(content_V, embeddings=None, max_len=10, dim=None):
    """Build a fixed-length sequence of word vectors for each review.

    Produces the same layout as ``concat_word2V_features``: each review is
    split on single spaces, tokens missing from the embedding table are
    skipped, the first ``max_len`` known tokens are kept, and shorter reviews
    are right-padded with zero vectors.

    Parameters
    ----------
    content_V : iterable of str
        The review texts.
    embeddings : mapping, optional
        Anything supporting ``embeddings[word]`` that raises ``KeyError`` for
        unknown words. Defaults to the notebook-level pretrained ``wv``.
    max_len : int, optional
        Sequence length per review (default 10).
    dim : int, optional
        Embedding dimensionality. Defaults to ``embeddings.vector_size`` when
        that attribute exists, else 300.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(n_reviews, max_len, dim)``.

    Notes
    -----
    Padding used to be lists of Python ints, which made NumPy upcast the
    result to float64 whenever any review needed padding. Padding is now
    float32 zeros, so the values are the same, the dtype is always float32
    and the array uses half the memory.
    """
    if embeddings is None:
        embeddings = wv
    if dim is None:
        dim = getattr(embeddings, 'vector_size', 300)

    res = []
    for s in content_V:
        steps = []
        for w in s.split(' '):
            if len(steps) >= max_len:
                break
            try:
                steps.append(np.asarray(embeddings[w], dtype=np.float32))
            except KeyError:
                # Out-of-vocabulary token: skipped, does not use up a time step.
                pass
        # Right-pad short reviews with zero vectors.
        steps.extend(np.zeros(dim, dtype=np.float32) for _ in range(max_len - len(steps)))
        res.append(np.array(steps, dtype=np.float32))
    if not res:
        return np.zeros((0, max_len, dim), dtype=np.float32)
    return np.array(res, dtype=np.float32)

# Fixed-length word-vector sequences for every sampled review, then a shape check.
rnn_data = rnn_word2V_features(content_V)
print(rnn_data.shape)

In [None]:
class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    At every step the current input is concatenated with the previous hidden
    state; ``i2h`` maps that to the next hidden state, and ``h2o`` followed by
    log-softmax maps the new hidden state to per-class log-probabilities.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance one step; return ``(log_probs, next_hidden)``."""
        next_hidden = self.i2h(torch.cat([input, hidden], dim=1))
        log_probs = self.softmax(self.h2o(next_hidden))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state for a batch of one."""
        return torch.zeros((1, self.hidden_size))


In [None]:
def train(sentence, label, model, learning_rate, loss_fn=None):
    """Run one plain-SGD update of ``model`` on a single review.

    The word vectors of ``sentence`` are fed to the model one step at a time,
    carrying the hidden state forward; the loss is computed on the output of
    the last step only.

    Parameters
    ----------
    sentence : numpy.ndarray
        Word vectors of one review; each row is viewed as ``(-1, 300)``.
    label : int
        Target class index.
    model : nn.Module
        Recurrent model exposing ``initHidden()`` and called as
        ``model(input, hidden) -> (output, hidden)``.
    learning_rate : float
        Step size of the manual SGD update.
    loss_fn : callable, optional
        Loss applied to ``(output, label_tensor)``. Defaults to the
        notebook-level ``criterion``. Pass it explicitly to avoid depending
        on that global, which later cells rebind to a different loss.

    Returns
    -------
    tuple
        ``(output, loss)``: the last-step model output and the loss as a
        Python float.
    """
    if loss_fn is None:
        loss_fn = criterion

    model.train()
    hidden = model.initHidden()
    label_tensor = torch.tensor([label], dtype=torch.long)
    model.zero_grad()

    for i in range(len(sentence)):
        input_tensor = torch.from_numpy(sentence[i].astype('float32')).view(-1, 300)
        output, hidden = model(input_tensor, hidden)

    loss = loss_fn(output, label_tensor)
    loss.backward()

    # Manual SGD step. Done under no_grad (rather than through `.data`) so
    # autograd is bypassed explicitly; parameters that received no gradient
    # are left untouched instead of raising.
    with torch.no_grad():
        for p in model.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()

In [None]:
def test(sentence, label, model):
    model.eval()
    hidden = model.initHidden()
    label_tensor = torch.tensor([label], dtype=torch.long)

    for i in range(len(sentence)):
        input_tensor = torch.from_numpy(sentence[i].astype('float32')).view(-1, 300)
        output, hidden = model(input_tensor, hidden)

    loss = criterion(output, label_tensor)

    return output, loss.item()

In [None]:
# 80/20 split of the sequence features (random_state=32, as in section 4(b)).
rnn_train, rnn_test, rnn_label_train, rnn_label_test = train_test_split(rnn_data, label_V, test_size=0.2, random_state=32)
n_hidden = 10
rnn = RNN(300, n_hidden, 2)

# The RNN emits log-probabilities (LogSoftmax), hence NLLLoss.
criterion = nn.NLLLoss()
learning_rate = 0.01
max_epoch = 1
min_loss = np.inf  # np.Inf was removed in NumPy 2.0
load_trained = False

# Warm-start from an earlier checkpoint when one can be loaded. A failure is
# reported instead of being swallowed silently.
try:
    rnn.load_state_dict(torch.load('model_5a.pt'))
    load_trained = True
except Exception as err:
    print("Checkpoint 'model_5a.pt' not loaded, training from scratch:", err)

for epoch in tqdm(range(max_epoch)):
    # One SGD update per training review.
    for sentence, label in zip(rnn_train, rnn_label_train):
        output, loss = train(sentence, label, rnn, learning_rate)

    # Mean held-out loss. It is normalised by the number of test reviews;
    # the previous version divided by the size of the full dataset.
    cur_loss = 0
    for sentence, label in zip(rnn_test, rnn_label_test):
        output, loss = test(sentence, label, rnn)
        cur_loss += loss
    cur_loss = cur_loss / len(rnn_test)

    saved = False
    if epoch == 0:
        min_loss = cur_loss
        # A checkpoint that was loaded is not overwritten by the first epoch.
        if not load_trained:
            torch.save(rnn.state_dict(), 'model_5a.pt')
            saved = True
    elif cur_loss < min_loss:
        min_loss = cur_loss
        torch.save(rnn.state_dict(), 'model_5a.pt')
        saved = True
    # Report a save only when one actually happened.
    if saved:
        print("Model saved. Loss = ", min_loss)

        

In [None]:
def evaluate_test_acc(test_data, test_labels, model):
    """Return the fraction of reviews the recurrent model classifies correctly.

    Each review is fed to the model one word vector at a time, starting from
    a fresh hidden state; the arg-max of the last-step output is the
    predicted class.

    Parameters
    ----------
    test_data : sequence of numpy.ndarray
        One array of word vectors per review; each row is viewed as
        ``(-1, 300)``.
    test_labels : sequence of int
        Target class index for each review.
    model : callable
        Exposes ``initHidden()`` and is called as
        ``model(input, hidden) -> (output, hidden)``.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``; ``0.0`` when there are no reviews
        (previously a ``ZeroDivisionError``).
    """
    match = 0
    total = 0
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for sentence, label in zip(test_data, test_labels):
            hidden = model.initHidden()
            for i in range(len(sentence)):
                input_tensor = torch.from_numpy(sentence[i].astype('float32')).view(-1, 300)
                output, hidden = model(input_tensor, hidden)
            _, predicted = torch.max(output, 1)
            total += 1
            if predicted.item() == label:
                match += 1

    return match / total if total else 0.0
# Evaluate the saved checkpoint when it can be loaded. If loading fails, the
# reason is printed and the weights currently in memory are evaluated.
try:
    rnn.load_state_dict(torch.load('model_5a.pt'))
except Exception as err:
    print("Checkpoint 'model_5a.pt' not loaded, evaluating in-memory weights:", err)
acc = evaluate_test_acc(rnn_test, rnn_label_test, rnn)
print("RNN accuracy: {:.3f}%".format(acc*100))

## (b)

In [None]:
# Mini-batch size shared by the GRU and LSTM experiments.
batch_size = 64

# Wrap the sequence features and labels as tensors. These names replace the
# `train_data` / `test_data` datasets created in section 4.
train_data = TensorDataset(torch.from_numpy(rnn_train), torch.from_numpy(rnn_label_train))
test_data = TensorDataset(torch.from_numpy(rnn_test), torch.from_numpy(rnn_label_test))
# Note that the test loader is shuffled as well.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [None]:
class GRU(nn.Module):
    """GRU sequence classifier that emits one sigmoid score per sequence.

    The recurrent layer (stored under the attribute name ``lstm``) runs over
    the batch-first input. Every time step goes through dropout, a linear
    layer and a sigmoid; the score returned for each sequence is the last
    value of its flattened ``(time step, output unit)`` scores.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.input_size, self.output_size = input_size, output_size
        self.n_layers, self.hidden_dim = n_layers, hidden_dim
        self.lstm = nn.GRU(input_size, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Return ``(scores, hidden)`` with one score per sequence in the batch."""
        n_sequences = x.size(0)
        seq_out, hidden = self.lstm(x, hidden)
        # Merge batch and time so the linear head sees one row per time step.
        per_step = seq_out.contiguous().view(-1, self.hidden_dim)
        scores = self.sigmoid(self.fc(self.dropout(per_step)))
        # Regroup per sequence and keep only the final value.
        return scores.view(n_sequences, -1)[:, -1], hidden

    def init_hidden(self, batch_size):
        """Return a zero hidden state with the dtype and device of the parameters."""
        reference = next(self.parameters()).data
        return reference.new_zeros(self.n_layers, batch_size, self.hidden_dim)

In [None]:
batch_size = 64
max_epoch = 10
min_loss = np.inf  # np.Inf was removed in NumPy 2.0
clip = 5  # max gradient norm

# 300-dim inputs, 2 output units, hidden size 10, one recurrent layer.
gru = GRU(300, 2, 10, 1)
learning_rate = 0.005
# The model ends in a sigmoid, hence binary cross-entropy on probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(gru.parameters(), lr=learning_rate)

# Warm-start from an earlier checkpoint when one can be loaded. A failure is
# reported instead of being swallowed silently.
try:
    gru.load_state_dict(torch.load('model_5b.pt'))
except Exception as err:
    print("Checkpoint 'model_5b.pt' not loaded, training from scratch:", err)

for epoch in tqdm(range(max_epoch)):
    # Re-enable dropout every epoch: the eval() call below would otherwise
    # leave the model in evaluation mode for all epochs after the first.
    gru.train()
    for inputs, labels in train_loader:
        h = gru.init_hidden(inputs.shape[0])
        gru.zero_grad()
        x = inputs.to(torch.float32)
        output, h = gru(x, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(gru.parameters(), clip)
        optimizer.step()

    # Mean held-out loss over batches. No gradients are needed here.
    gru.eval()
    cur_loss = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            h = gru.init_hidden(inputs.shape[0])
            x = inputs.to(torch.float32)
            output, h = gru(x, h)
            test_loss = criterion(output.squeeze(), labels.float())
            cur_loss += test_loss.item()
    cur_loss = cur_loss / len(test_loader)

    # Checkpoint whenever the held-out loss improves. min_loss starts at
    # infinity, so the first epoch is saved too (it used to be skipped).
    if cur_loss < min_loss:
        min_loss = cur_loss
        torch.save(gru.state_dict(), 'model_5b.pt')
        print("Model saved. Loss = ", min_loss)

In [None]:
# Evaluate the best checkpoint when it can be loaded. If loading fails, the
# reason is printed and the weights currently in memory are evaluated.
try:
    gru.load_state_dict(torch.load('model_5b.pt'))
except Exception as err:
    print("Checkpoint 'model_5b.pt' not loaded, evaluating in-memory weights:", err)

num_correct = 0
gru.eval()
# Inference only: skip autograd bookkeeping. The loss that used to be
# computed here was never used and has been dropped.
with torch.no_grad():
    for inputs, labels in test_loader:
        h = gru.init_hidden(inputs.shape[0])
        x = inputs.to(torch.float32)
        output, h = gru(x, h)
        pred = torch.round(output.squeeze())  # Rounds the output to 0/1
        num_correct += pred.eq(labels.float().view_as(pred)).sum().item()

test_acc = num_correct/len(test_loader.dataset)
print("GRU accuracy: {:.3f}%".format(test_acc*100))

## (c)

In [None]:
class LSTM(nn.Module):
    """LSTM sequence classifier that emits one sigmoid score per sequence.

    The recurrent layer runs over the batch-first input. Every time step goes
    through dropout, a linear layer and a sigmoid; the score returned for
    each sequence is the last value of its flattened
    ``(time step, output unit)`` scores.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.input_size, self.output_size = input_size, output_size
        self.n_layers, self.hidden_dim = n_layers, hidden_dim
        self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Return ``(scores, hidden)`` with one score per sequence in the batch."""
        n_sequences = x.size(0)
        seq_out, hidden = self.lstm(x, hidden)
        # Merge batch and time so the linear head sees one row per time step.
        per_step = seq_out.contiguous().view(-1, self.hidden_dim)
        scores = self.sigmoid(self.fc(self.dropout(per_step)))
        # Regroup per sequence and keep only the final value.
        return scores.view(n_sequences, -1)[:, -1], hidden

    def init_hidden(self, batch_size):
        """Return zero ``(h_0, c_0)`` states with the dtype and device of the parameters."""
        reference = next(self.parameters()).data
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        return (reference.new_zeros(state_shape), reference.new_zeros(state_shape))

In [None]:
batch_size = 64
max_epoch = 10
min_loss = np.inf  # np.Inf was removed in NumPy 2.0
clip = 5  # max gradient norm

# 300-dim inputs, 2 output units, hidden size 10, one recurrent layer.
lstm = LSTM(300, 2, 10, 1)
learning_rate = 0.005
# The model ends in a sigmoid, hence binary cross-entropy on probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

# Warm-start from an earlier checkpoint when one can be loaded. A failure is
# reported instead of being swallowed silently.
try:
    lstm.load_state_dict(torch.load('model_5c.pt'))
except Exception as err:
    print("Checkpoint 'model_5c.pt' not loaded, training from scratch:", err)

for epoch in tqdm(range(max_epoch)):
    # Re-enable dropout every epoch: the eval() call below would otherwise
    # leave the model in evaluation mode for all epochs after the first.
    lstm.train()
    for inputs, labels in train_loader:
        # A fresh zero state per batch, so there is no history to detach.
        h = lstm.init_hidden(inputs.shape[0])
        lstm.zero_grad()
        x = inputs.to(torch.float32)
        output, h = lstm(x, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(lstm.parameters(), clip)
        optimizer.step()

    # Mean held-out loss over batches. No gradients are needed here.
    lstm.eval()
    cur_loss = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            h = lstm.init_hidden(inputs.shape[0])
            x = inputs.to(torch.float32)
            output, h = lstm(x, h)
            test_loss = criterion(output.squeeze(), labels.float())
            cur_loss += test_loss.item()
    cur_loss = cur_loss / len(test_loader)

    # Checkpoint whenever the held-out loss improves. min_loss starts at
    # infinity, so the first epoch is saved too (it used to be skipped).
    if cur_loss < min_loss:
        min_loss = cur_loss
        torch.save(lstm.state_dict(), 'model_5c.pt')
        print("Model saved. Loss = ", min_loss)
    

In [None]:
# Evaluate the best checkpoint when it can be loaded. If loading fails, the
# reason is printed and the weights currently in memory are evaluated.
try:
    lstm.load_state_dict(torch.load('model_5c.pt'))
except Exception as err:
    print("Checkpoint 'model_5c.pt' not loaded, evaluating in-memory weights:", err)

num_correct = 0
lstm.eval()
# Inference only: skip autograd bookkeeping. The loss that used to be
# computed here was never used and has been dropped.
with torch.no_grad():
    for inputs, labels in test_loader:
        h = lstm.init_hidden(inputs.shape[0])
        x = inputs.to(torch.float32)
        output, h = lstm(x, h)
        pred = torch.round(output.squeeze())  # Rounds the output to 0/1
        num_correct += pred.eq(labels.float().view_as(pred)).sum().item()

test_acc = num_correct/len(test_loader.dataset)
print("LSTM accuracy: {:.3f}%".format(test_acc*100))

## Q: What do you conclude by comparing accuracy values you obtain by GRU, LSTM, and simple RNN? <br>
Ans: GRU and LSTM give better results than the simple RNN; it appears that long-term memory does affect the accuracy.

### Overall Accuracy <br>
Perceptron: 76.9% <br>
SVM: 79.9% <br>
NN(mean): 79.8% <br>
NN(concat): 71.9% <br>
RNN: 70.9% <br>
GRU: 76.2% <br>
LSTM: 76.4%