In [1]:
import pandas as pd
import csv
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import pdb
import nltk
import string
from nltk.corpus import stopwords
from random import shuffle

In [2]:
def clean_list_of_word(words, remove_stopping_word=False):
    """remove punctuations"""
    translator = str.maketrans('', '', string.punctuation)
    result = list(map(lambda x: x.translate(translator) , words))
    
    """remove stopping word"""
    if remove_stopping_word:
        result = [word for word in result if word not in stopwords.words('english')]
    return result

In [3]:
words = pd.read_table("../glove.6B/glove.6B.100d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
def vec(w):
  return words.loc[w].as_matrix()

In [16]:
vec("0")

array([-0.61491  ,  0.92734  ,  0.55826  ,  0.0057455, -0.67172  ,
        0.61189  ,  0.99226  ,  0.27641  , -0.64889  , -0.51675  ,
        1.1789   , -1.1631   , -0.90994  ,  0.5714   ,  1.0018   ,
        0.20675  ,  0.36751  ,  0.46931  ,  0.096868 ,  1.4351   ,
        1.3924   ,  0.4589   ,  0.65491  ,  1.0463   ,  0.11249  ,
        0.56988  ,  0.76595  ,  0.45246  ,  0.48627  , -0.74383  ,
       -0.21478  ,  0.7041   ,  0.068082 ,  0.54945  , -0.012204 ,
       -0.18558  ,  0.4266   ,  0.90742  , -0.62418  ,  0.56063  ,
        0.9091   , -1.4814   ,  0.39494  , -1.2353   ,  0.39864  ,
       -0.86375  ,  0.51337  ,  0.36805  ,  0.2084   ,  0.068722 ,
       -0.011695 , -0.48135  , -0.61458  ,  1.2844   , -1.2055   ,
       -1.929    , -0.48368  , -0.67811  ,  0.15408  ,  0.15402  ,
       -1.4051   , -1.5294   , -1.2464   , -0.38637  ,  0.89358  ,
        0.71173  , -1.0529   ,  0.64586  , -0.16673  ,  0.53946  ,
       -1.1164   ,  0.19146  ,  0.55717  ,  0.031212 ,  0.7696

In [4]:
def get_single_sentence_embedding(sent):
    result = []
    for item in sent:
        try:
            result.append(np.array(vec(item)))
        except:
            result.append(np.array(vec(",")))
            continue
    return np.array(result)      

In [5]:
def get_batch_embedding(sent_list):
    max_len = max(list(map(len, sent_list)))
    result = []
    for sent in sent_list:
        result.append(get_single_sentence_embedding(sent + ["."]*(max_len - len(sent))))
    return np.array(result)    

In [102]:
class RNNModel(nn.Module):
    def __init__(self, input_dimension, hidden_dimension, batch_size):
        super(RNNModel, self).__init__()
        self.input_dim = input_dimension
        self.hidden_dim = hidden_dimension
        self.batch_size = batch_size
        
        self.lstm = nn.LSTM(input_dimension, hidden_dimension, bidirectional=False)
        self.fc = nn.Linear(hidden_dimension, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x, hidden, c):
        x = x.transpose(0, 1)
        len_seq = len(x)
        outputs, last = self.lstm(x, (hidden, c))
        output = self.fc(last[0])
        output = self.sigmoid(output)
        return output.squeeze()
           
    def init_hidden(self):
        h0 = Variable(torch.randn(1, self.batch_size, self.hidden_dim))
        c0 = Variable(torch.randn(1, self.batch_size, self.hidden_dim))
        return h0, c0

In [126]:
df = pd.read_csv('../all_ter_data_dropna.csv')

In [181]:
one_list = list(df.loc[df['ht'] == 1.0][])

In [182]:
zero_list = list(df.loc[df['ht'] == 0.0]['Unnamed: 0'])[:16082]

In [183]:
total_list = one_list + zero_list

In [184]:
shuffle(total_list)

In [None]:
batch_size = 40
model = RNNModel(100, 100, batch_size)
if torch.cuda.is_available():
    model = model.cuda()
optimizer = optim.SGD(model.parameters(), lr=0.5)
criteria = nn.BCELoss()

for e in range(200):
    for i in range(800):
        ids = total_list[i*batch_size:(i+1)*batch_size]
        x_batch = []
        y_batch = []
        for index in ids:
            row = df.loc[df['Unnamed: 0'] == index]
            label = int(float(row['ht']))
            detail = clean_list_of_word(str(row['juicy_details']).split(' '), True)
            x_batch.append(detail)
            y_batch.append(label)
        x_batch = get_batch_embedding(x_batch)
        x = Variable(torch.from_numpy(x_batch).float())
        y = Variable(torch.from_numpy(np.array(y_batch)).float())
        if torch.cuda.is_available():
            x = x.cuda()
            y = y.cuda()
        optimizer.zero_grad()
        hidden, c_t = model.init_hidden()
        output = model(x, hidden, c_t)
        loss = criteria(output, y)
        loss.backward()
        optimizer.step()
        if i % 200 == 0:
            print("Epoch: {}, Step: {}, Loss: {}".format(e, i, loss.data[0]))

Epoch: 0, Step: 0, Loss: 0.6953133344650269
Epoch: 0, Step: 200, Loss: 0.6957678198814392
Epoch: 0, Step: 400, Loss: 0.6927761435508728
Epoch: 0, Step: 600, Loss: 0.6943928003311157
Epoch: 1, Step: 0, Loss: 0.6944343447685242
Epoch: 1, Step: 200, Loss: 0.6959816217422485
Epoch: 1, Step: 400, Loss: 0.6927227973937988
Epoch: 1, Step: 600, Loss: 0.6943374872207642
Epoch: 2, Step: 0, Loss: 0.6945720911026001
Epoch: 2, Step: 200, Loss: 0.6959515810012817
Epoch: 2, Step: 400, Loss: 0.6927058696746826
Epoch: 2, Step: 600, Loss: 0.694332480430603
Epoch: 3, Step: 0, Loss: 0.6946743726730347
Epoch: 3, Step: 200, Loss: 0.6959359049797058
Epoch: 3, Step: 400, Loss: 0.6926993131637573
Epoch: 3, Step: 600, Loss: 0.6943338513374329
Epoch: 4, Step: 0, Loss: 0.6947702169418335
Epoch: 4, Step: 200, Loss: 0.695929229259491
Epoch: 4, Step: 400, Loss: 0.6926904320716858
Epoch: 4, Step: 600, Loss: 0.6943276524543762
Epoch: 5, Step: 0, Loss: 0.6948450803756714
Epoch: 5, Step: 200, Loss: 0.6959184408187866
Ep

Epoch: 44, Step: 400, Loss: 0.6906159520149231
Epoch: 44, Step: 600, Loss: 0.6939963102340698
Epoch: 45, Step: 0, Loss: 0.6967888474464417
Epoch: 45, Step: 200, Loss: 0.6965638399124146
Epoch: 45, Step: 400, Loss: 0.6905968189239502
Epoch: 45, Step: 600, Loss: 0.6940005421638489
Epoch: 46, Step: 0, Loss: 0.6967501640319824
Epoch: 46, Step: 200, Loss: 0.696571946144104
Epoch: 46, Step: 400, Loss: 0.6905994415283203
Epoch: 46, Step: 600, Loss: 0.6939903497695923
Epoch: 47, Step: 0, Loss: 0.6967341303825378
Epoch: 47, Step: 200, Loss: 0.6965646743774414
Epoch: 47, Step: 400, Loss: 0.6906052827835083
Epoch: 47, Step: 600, Loss: 0.694008469581604
Epoch: 48, Step: 0, Loss: 0.6967126131057739
Epoch: 48, Step: 200, Loss: 0.6965678930282593
Epoch: 48, Step: 400, Loss: 0.6905998587608337
Epoch: 48, Step: 600, Loss: 0.6940034031867981
Epoch: 49, Step: 0, Loss: 0.6967161893844604
Epoch: 49, Step: 200, Loss: 0.6965616941452026
Epoch: 49, Step: 400, Loss: 0.6905855536460876
Epoch: 49, Step: 600, Los

Epoch: 88, Step: 600, Loss: 0.6961992979049683
Epoch: 89, Step: 0, Loss: 0.6037851572036743
Epoch: 89, Step: 200, Loss: 0.5572724342346191
Epoch: 89, Step: 400, Loss: 0.572045087814331
Epoch: 89, Step: 600, Loss: 0.6999374628067017
Epoch: 90, Step: 0, Loss: 0.6211358904838562
Epoch: 90, Step: 200, Loss: 0.5547441244125366
Epoch: 90, Step: 400, Loss: 0.5641446113586426
Epoch: 90, Step: 600, Loss: 0.7005079984664917
Epoch: 91, Step: 0, Loss: 0.6142944097518921
Epoch: 91, Step: 200, Loss: 0.5596771836280823
Epoch: 91, Step: 400, Loss: 0.5648335814476013
Epoch: 91, Step: 600, Loss: 0.7090256810188293
Epoch: 92, Step: 0, Loss: 0.613610565662384
Epoch: 92, Step: 200, Loss: 0.5522481203079224
Epoch: 92, Step: 400, Loss: 0.5579447746276855
Epoch: 92, Step: 600, Loss: 0.7096544504165649
Epoch: 93, Step: 0, Loss: 0.6134659051895142
Epoch: 93, Step: 200, Loss: 0.5331973433494568
Epoch: 93, Step: 400, Loss: 0.5550416707992554
Epoch: 93, Step: 600, Loss: 0.7090858221054077
Epoch: 94, Step: 0, Loss:

Epoch: 132, Step: 200, Loss: 0.3976813852787018
Epoch: 132, Step: 400, Loss: 0.3904567062854767
Epoch: 132, Step: 600, Loss: 0.6115921139717102
Epoch: 133, Step: 0, Loss: 0.450636088848114
Epoch: 133, Step: 200, Loss: 0.3586971163749695
Epoch: 133, Step: 400, Loss: 0.2937397062778473
Epoch: 133, Step: 600, Loss: 0.6369239091873169
Epoch: 134, Step: 0, Loss: 0.40549635887145996
Epoch: 134, Step: 200, Loss: 0.38578739762306213
Epoch: 134, Step: 400, Loss: 0.34566059708595276
Epoch: 134, Step: 600, Loss: 0.5239987969398499
Epoch: 135, Step: 0, Loss: 0.39961519837379456
Epoch: 135, Step: 200, Loss: 0.37337830662727356
Epoch: 135, Step: 400, Loss: 0.4035714268684387
Epoch: 135, Step: 600, Loss: 0.5170995593070984
Epoch: 136, Step: 0, Loss: 0.3675796389579773
Epoch: 136, Step: 200, Loss: 0.33262789249420166
Epoch: 136, Step: 400, Loss: 0.3363152742385864
Epoch: 136, Step: 600, Loss: 0.5332662463188171
Epoch: 137, Step: 0, Loss: 0.4639468193054199
Epoch: 137, Step: 200, Loss: 0.33825615048408