In [None]:
### Neural network feed forward classifier with word embeddings for sentiment analysis 

# => structure:
# 1. input --> hidden: linear(input_dim, hidden_dim)
# 2. tanh activation
# 3. hidden --> output: linear(hidden_dim, 2)
# 4. log softmax

# => use pretrained glove word embeddings 

import numpy as np
from sklearn import metrics
import torch
from collections import defaultdict
import math 
import re 
import pickle
import pandas as pd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import gzip

In [None]:
# load the dataset
train_dataset = pd.read_csv("train_dataset.csv")
test_dataset = pd.read_csv("test_dataset.csv")

# `.values` is used instead of `.as_matrix()`: the latter was deprecated and
# later removed from pandas, while `.values` exists in old and new releases.
train_notes = train_dataset['note'].values
test_notes = test_dataset['note'].values

train_y = train_dataset['label'].values
test_y = test_dataset['label'].values


def clean_note(note):
    """Lower-case a note, strip all digit runs and keep only word tokens.

    Returns the remaining `\\w+` tokens joined by single spaces.
    """
    without_digits = re.sub(r"\d+", "", note.lower())
    return " ".join(re.findall(r"\w+", without_digits))


# converting train and test to clean format
# (notes are rewritten in place, element by element, as in the original cell)
print("Converting training to no numbers")
for i in range(len(train_notes)):
    train_notes[i] = clean_note(train_notes[i])

print("Converting testing to no numbers")
for i in range(len(test_notes)):
    test_notes[i] = clean_note(test_notes[i])

In [None]:
### CODE ADAPTED FROM 6.864

torch.manual_seed(1)

# hyperparameters read by the loaders, the model construction and train()
batch_size = 173
hidden_dim = 300
weight_decay = 1e-5
lr = 1e-3


def load_embeddings(path):
    """Read a gzip-compressed text file of word vectors into a dict.

    Every non-blank line is split on whitespace: the first field is the
    word, the remaining fields are parsed as floats and stored as a numpy
    array. Returns a dict mapping word -> vector.
    """
    vectors = {}
    # `with` guarantees the gzip handle is closed; lines are streamed so the
    # file is never held in memory twice.
    with gzip.open(path, 'rb') as handle:
        for raw_line in handle:
            # Under Python 3 gzip yields bytes; decode so the keys compare
            # equal to the str tokens produced when splitting notes.
            if not isinstance(raw_line, str):
                raw_line = raw_line.decode('utf-8')
            parts = raw_line.split()
            if not parts:
                continue  # tolerate blank lines instead of raising IndexError
            vectors[parts[0]] = np.array([float(v) for v in parts[1:]])
    return vectors


# extract the pretrained word vectors from the text file
embeddings = load_embeddings('word_vectors.txt.gz')

    
def extract_embeddings(data, word_vectors=None, dim=None):
    """Turn each text into the mean of its L2-normalised word vectors.

    Parameters
    ----------
    data : sequence of str
        Whitespace-tokenisable texts, one per example.
    word_vectors : dict, optional
        Mapping word -> vector. Defaults to the module-level `embeddings`.
    dim : int, optional
        Feature dimensionality. When omitted it is taken from an arbitrary
        entry of `word_vectors`, falling back to 300 if the mapping is empty.

    Returns
    -------
    numpy.ndarray of shape (len(data), dim)
        Row i is the average of the unit-length vectors of the known words
        in data[i]; it is all zeros when no word of the text is known.
    """
    if word_vectors is None:
        word_vectors = embeddings
    if dim is None:
        dim = len(next(iter(word_vectors.values()))) if word_vectors else 300

    # A numpy accumulator is required: `+=` on a Python list would *extend*
    # the list with the vector's elements instead of adding element-wise.
    features = np.zeros((len(data), dim))
    for row, text in enumerate(data):
        num_words = 0
        for word in text.split():
            vector = word_vectors.get(word)
            if vector is None:
                continue
            vector = np.asarray(vector, dtype=float)
            norm = np.linalg.norm(vector)
            if norm == 0:
                continue  # an all-zero vector cannot be normalised; skip it
            features[row] += vector / norm
            num_words += 1

        if num_words > 0:
            features[row] /= num_words

    return features

In [None]:
from sklearn.model_selection import train_test_split

### CODE ADAPTED FROM 6.864

# feature extraction and define dataset
train_x = extract_embeddings(train_notes)
test_x = extract_embeddings(test_notes)

train_y = train_y.astype(int)
test_y = test_y.astype(int)

print("datatype")
print(train_y.dtype)
print(test_y.dtype)

# hold out 10% of the training examples as a development set
train_x, dev_x, train_y, dev_y = train_test_split(train_x, train_y, test_size=0.1, random_state=42)

# NOTE: train_dataset / test_dataset are rebound here from the DataFrames
# loaded earlier to TensorDatasets.
train_dataset = torch.utils.data.TensorDataset(torch.FloatTensor(train_x), torch.LongTensor(train_y))
dev_dataset = torch.utils.data.TensorDataset(torch.FloatTensor(dev_x), torch.LongTensor(dev_y))
test_dataset = torch.utils.data.TensorDataset(torch.FloatTensor(test_x), torch.LongTensor(test_y))

# Reshuffle the training examples every epoch so mini-batches differ between
# epochs (still reproducible through torch.manual_seed).
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation accuracy does not depend on the batch size, so batch the dev/test
# sets instead of feeding one example at a time (the DataLoader default).
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

### CODE FROM 6.864
class FFN(nn.Module):
    """Feed-forward classifier: linear -> tanh -> linear -> log-softmax.

    Parameters
    ----------
    input_dim : int
        Size of each input feature vector.
    hidden_dim : int
        Width of the single hidden layer.
    output_dim : int
        Number of classes.

    The forward pass returns log-probabilities over the last dimension,
    which is the input expected by `nn.NLLLoss`.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FFN, self).__init__()
        self.seq = nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.Tanh(),
                nn.Linear(hidden_dim, output_dim),
                # No activation here: a tanh in front of the log-softmax would
                # clamp the logits to [-1, 1] and cap the model's confidence.
                # The class dimension is given explicitly because the implicit
                # choice is deprecated.
                nn.LogSoftmax(dim=-1)
                )

    def forward(self, x):
        """Return per-class log-probabilities for the inputs `x`."""
        return self.seq(x)

def evaluate(model, loader):
    """Compute the classification accuracy of `model` over `loader`.

    The model is switched to eval mode, every (data, label) batch yielded by
    `loader` is scored, and the arg-max over axis 1 of the model output is
    compared with the labels via `metrics.accuracy_score`.
    """
    model.eval()
    pred_batches = []
    actual_batches = []
    for data, label in loader:
        data, label = Variable(data), Variable(label)
        output = model(data).data.cpu().numpy()
        pred_batches.append(np.argmax(output, axis=1))
        actual_batches.append(label.data.cpu().numpy())

    # Concatenate once at the end: growing an array inside the loop copies
    # everything accumulated so far on every batch (quadratic in the number
    # of batches). An empty loader still yields empty arrays, as before.
    pred = np.concatenate(pred_batches) if pred_batches else np.array([])
    actual = np.concatenate(actual_batches) if actual_batches else np.array([])

    return metrics.accuracy_score(y_pred=pred, y_true=actual)

def train(model, loader, max_epoches, dev_loader, test_loader, verbose=False):
    """Train `model` with Adam + NLL loss and report the best dev accuracy.

    Parameters
    ----------
    model : nn.Module
        Network whose output is fed to `nn.NLLLoss`.
    loader : iterable of (data, label) batches
        Training batches.
    max_epoches : int
        Number of passes over `loader`.
    dev_loader, test_loader : iterables of (data, label) batches
        Evaluated with `evaluate` after every epoch.
    verbose : bool
        When true, print the dev/test accuracy after every epoch.

    The learning rate and weight decay come from the module-level `lr` and
    `weight_decay`. At the end the best dev accuracy and the test accuracy
    measured at that same epoch are printed as a tuple.
    """
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    best_dev = 0.0
    corresponding_test = 0.0
    for epoch in range(max_epoches):
        model.train()  # evaluate() leaves the model in eval mode
        # Iterate the loader that was passed in, not a module-level global.
        for data, label in loader:
            data, label = Variable(data), Variable(label)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

        dev = evaluate(model, dev_loader)
        test = evaluate(model, test_loader)
        if verbose:
            print("epoch %d: dev=%.4f test=%.4f" % (epoch, dev, test))
        if dev > best_dev:
            best_dev = dev
            corresponding_test = test

    # Printing an explicit tuple gives the same output on Python 2 and 3.
    print((best_dev, corresponding_test))

In [None]:
# The input size must equal the feature dimensionality produced by
# extract_embeddings, so read it from the data instead of hard-coding it
# (a hard-coded 100 does not match the 300-dimensional averaged vectors).
model = FFN(train_x.shape[1], hidden_dim, 2)
train(model, train_loader, 50, dev_loader, test_loader)