# Language Modelling

In [84]:
# Import Statements

import pandas as pd
import numpy as np

import torch, torch.nn as nn
from torch.autograd import Variable

import nltk
from nltk import word_tokenize
import re
from collections import Counter

np.set_printoptions(threshold=np.nan)

## Load Data

In [59]:
yelp_train = pd.read_csv('./yelp-train.txt', sep='\t', encoding='latin-1', header=None)
yelp_train_y = yelp_train[:][1]
yelp_valid = pd.read_csv('./yelp-valid.txt', sep='\t', encoding='latin-1', header=None)
yelp_valid_y = yelp_valid[:][1]
yelp_test = pd.read_csv('./yelp-test.txt', sep='\t', encoding='latin-1', header=None)
yelp_test_y = yelp_test[:][1]

print("Data loaded.")

Data loaded.


In [64]:
yelp_train[:][0][0]

"I can't believe I haven't yelped about the place yet. Several months (maybe over a year?) ago my husband read a newspaper article about the Clover coffee maker and the one place in town that had managed to procure one. I was skeptical (as is my nature). It can't be that much better right? You're just saying it's amazing because you want to talk about the new hot coffee shop you discovered, right? Well, maybe. But I love this place. And I don't think it has a whole lot to do with the Clover. They roast their own beans and they roast them way differently than that other ginormous coffee chain - all a light or medium roast. Never bitter, never oily, never yucky. The coffee they make there is, obviously, the best. But I send my husband in every week now to buy a pound of beans so that I can approximate the same coffee at home. Add an edgy (though sometimes intimidating) seating area, great local art (which we bought off the wall), and smiley service...I'm sold. Can't wait to try out the d

## Data Preparation

In [50]:

def pre_processing(data):
    new_data = []
    #i = 0
    for sentence in (data[:][0]):
        new_sentence = re.sub('<.*?>', '', sentence) # remove HTML tags
        new_sentence = re.sub(r'[^\w\s]', '', new_sentence) # remove punctuation
        new_sentence = new_sentence.lower() # convert to lower case
        if new_sentence != '':
            new_data.append(new_sentence)
    return new_data


In [69]:
train_x = pre_processing(yelp_train)

In [72]:
train_x[0]

'i cant believe i havent yelped about the place yet several months maybe over a year ago my husband read a newspaper article about the clover coffee maker and the one place in town that had managed to procure one i was skeptical as is my nature it cant be that much better right youre just saying its amazing because you want to talk about the new hot coffee shop you discovered right well maybe but i love this place and i dont think it has a whole lot to do with the clover they roast their own beans and they roast them way differently than that other ginormous coffee chain  all a light or medium roast never bitter never oily never yucky the coffee they make there is obviously the best but i send my husband in every week now to buy a pound of beans so that i can approximate the same coffee at home add an edgy though sometimes intimidating seating area great local art which we bought off the wall and smiley serviceim sold cant wait to try out the downtown location'

In [74]:

def tokenize(data):
    new_data = []
    for sentence in (data):
        new_sentence = nltk.word_tokenize(sentence)
        new_data.append(new_sentence)
    return new_data
        

In [85]:
train_x_tokenized = tokenize(train_x)

In [86]:
train_x_tokenized[0][:10]

['i',
 'cant',
 'believe',
 'i',
 'havent',
 'yelped',
 'about',
 'the',
 'place',
 'yet']

In [82]:
def takeSecond(elem):
    return elem[1]

def frequency(tokens):
    # combine all the tokens into one list
    new = sum(tokens, [])
    frequency = list(Counter(new).items())
    frequency = sorted(frequency, key=takeSecond, reverse=True)
    return frequency

In [93]:
freq = frequency(train_x_tokenized)[:10000] #specify how many words to be considered

In [103]:
def make_lookup_table(vocab):
    yelp_dic = {}
    for i in range(0,len(vocab)):
        yelp_dic[vocab[i][0]] = [i+1]
    return yelp_dic

In [106]:
# Index by frequency
dic = make_lookup_table(freq)

In [109]:
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
lookup_tensor = torch.tensor([dic["the"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[[ 0.2624, -0.8028,  0.9173, -0.7209,  0.4542]]])


## Model

In [111]:
text = train_x_tokenized[0]
seq_len = len(text)
batch_size = 1
embedding_size = 1
hidden_size = 1
output_size = 1

random_input = Variable(torch.FloatTensor(seq_len, batch_size, embedding_size).normal_(), requires_grad=False)


In [112]:
random_input[:, 0, 0]

tensor([-1.3207, -0.1310,  1.2490,  0.7213, -1.6344, -0.9395,  1.2265,
        -0.6407,  0.9623,  0.2786, -1.7617,  0.1556,  0.7590, -1.6019,
         0.8350, -1.8381,  0.2009,  1.1187, -1.4306, -0.6751, -0.9802,
         0.3807, -0.0162, -0.7463, -0.8346, -0.6716,  1.5187,  0.4423,
        -1.0981,  0.1766,  0.2496, -0.1605,  1.0259,  1.5697,  0.5326,
         0.4253,  0.5420, -0.5488, -1.0993,  1.1145,  0.3469, -1.2564,
         0.3100,  1.0544, -1.5434, -0.2634,  0.4639, -0.6840, -0.7264,
        -0.1945, -0.0794,  0.3472,  1.0317, -0.4769,  1.6535,  1.2891,
        -0.8668,  0.0804,  0.9067,  1.7320, -1.3870, -0.7153, -0.3140,
        -0.2806,  0.1906,  0.4073, -0.6895,  1.3528, -0.9858,  0.9516,
         0.1824, -0.9128,  1.5177, -1.7726, -0.0343, -1.9834, -1.8461,
         0.3753, -0.7929,  0.4515, -0.6812, -1.1121,  0.4339, -0.1815,
        -0.2282,  0.8527,  0.3436, -0.8154, -1.1563, -0.5654, -1.2219,
         0.6119,  0.4273,  1.0082,  1.2205, -0.8598,  1.0804,  0.4001,
      

In [113]:

bi_rnn = torch.nn.RNN(input_size=embedding_size, hidden_size=hidden_size, num_layers=1, batch_first=False, bidirectional=True)

bi_output, bi_hidden = bi_rnn(random_input)

# stagger
forward_output, backward_output = bi_output[:-2, :, :hidden_size], bi_output[2:, :, hidden_size:]
staggered_output = torch.cat((forward_output, backward_output), dim=-1)

linear = nn.Linear(hidden_size * 2, output_size)

# only predict on words
labels = random_input[1:-1]

# for language models, use cross-entropy :)
loss = nn.MSELoss()
output = loss(linear(staggered_output), labels)

In [114]:
labels[:,0,0]

tensor([-0.1310,  1.2490,  0.7213, -1.6344, -0.9395,  1.2265, -0.6407,
         0.9623,  0.2786, -1.7617,  0.1556,  0.7590, -1.6019,  0.8350,
        -1.8381,  0.2009,  1.1187, -1.4306, -0.6751, -0.9802,  0.3807,
        -0.0162, -0.7463, -0.8346, -0.6716,  1.5187,  0.4423, -1.0981,
         0.1766,  0.2496, -0.1605,  1.0259,  1.5697,  0.5326,  0.4253,
         0.5420, -0.5488, -1.0993,  1.1145,  0.3469, -1.2564,  0.3100,
         1.0544, -1.5434, -0.2634,  0.4639, -0.6840, -0.7264, -0.1945,
        -0.0794,  0.3472,  1.0317, -0.4769,  1.6535,  1.2891, -0.8668,
         0.0804,  0.9067,  1.7320, -1.3870, -0.7153, -0.3140, -0.2806,
         0.1906,  0.4073, -0.6895,  1.3528, -0.9858,  0.9516,  0.1824,
        -0.9128,  1.5177, -1.7726, -0.0343, -1.9834, -1.8461,  0.3753,
        -0.7929,  0.4515, -0.6812, -1.1121,  0.4339, -0.1815, -0.2282,
         0.8527,  0.3436, -0.8154, -1.1563, -0.5654, -1.2219,  0.6119,
         0.4273,  1.0082,  1.2205, -0.8598,  1.0804,  0.4001,  2.1435,
      

In [115]:
bi_output[:,0]

tensor([[ 0.7386,  0.0971],
        [-0.0982,  0.7953],
        [-0.5999,  0.9743],
        [-0.1640,  0.9055],
        [ 0.8373, -0.2321],
        [ 0.3739,  0.4133],
        [-0.6915,  0.9695],
        [ 0.6390,  0.5886],
        [-0.6501,  0.9582],
        [ 0.1407,  0.8171],
        [ 0.8286, -0.2354],
        [-0.3080,  0.8621],
        [-0.2883,  0.9156],
        [ 0.8444, -0.0997],
        [-0.6454,  0.9186],
        [ 0.9089, -0.2900],
        [-0.3603,  0.8730],
        [-0.4717,  0.9524],
        [ 0.8311, -0.0597],
        [ 0.2191,  0.4777],
        [ 0.5685,  0.3723],
        [-0.3541,  0.8962],
        [ 0.2221,  0.7858],
        [ 0.4555,  0.4396],
        [ 0.4333,  0.4114],
        [ 0.3502,  0.5759],
        [-0.7751,  0.9831],
        [ 0.0804,  0.8836],
        [ 0.6486,  0.2821],
        [-0.2602,  0.8640],
        [ 0.0181,  0.8736],
        [ 0.1819,  0.7857],
        [-0.5716,  0.9648],
        [-0.6227,  0.9847],
        [-0.0340,  0.9218],
        [-0.1772,  0

In [116]:
bi_hidden[:,0,0]

tensor(1.00000e-02 *
       [ 0.3537,  9.7054])

In [117]:
forward_output[:,0,0]

tensor([ 0.7386, -0.0982, -0.5999, -0.1640,  0.8373,  0.3739, -0.6915,
         0.6390, -0.6501,  0.1407,  0.8286, -0.3080, -0.2883,  0.8444,
        -0.6454,  0.9089, -0.3603, -0.4717,  0.8311,  0.2191,  0.5685,
        -0.3541,  0.2221,  0.4555,  0.4333,  0.3502, -0.7751,  0.0804,
         0.6486, -0.2602,  0.0181,  0.1819, -0.5716, -0.6227, -0.0340,
        -0.1772, -0.2002,  0.4754,  0.5576, -0.6884,  0.1107,  0.6989,
        -0.3554, -0.4399,  0.8493, -0.0525, -0.1951,  0.5395,  0.3476,
         0.0856,  0.1061, -0.1775, -0.4795,  0.5170, -0.8280, -0.4240,
         0.6675, -0.2074, -0.4052, -0.7143,  0.8488,  0.2378,  0.2011,
         0.1930, -0.1085, -0.1392,  0.5274, -0.7571,  0.7640, -0.6718,
         0.2092,  0.5406, -0.8011,  0.9113, -0.2210,  0.8978,  0.7453,
        -0.4063,  0.6361, -0.4145,  0.5927,  0.5334, -0.3732,  0.3281,
         0.1144, -0.4717,  0.0341,  0.5409,  0.5667,  0.2428,  0.6609,
        -0.5041, -0.0087, -0.5144, -0.4795,  0.6762, -0.6993,  0.0802,
      

In [118]:
backward_output[:,0,0]

tensor([ 0.9743,  0.9055, -0.2321,  0.4133,  0.9695,  0.5886,  0.9582,
         0.8171, -0.2354,  0.8621,  0.9156, -0.0997,  0.9186, -0.2900,
         0.8730,  0.9524, -0.0597,  0.4777,  0.3723,  0.8962,  0.7858,
         0.4396,  0.4114,  0.5759,  0.9831,  0.8836,  0.2821,  0.8640,
         0.8736,  0.7857,  0.9648,  0.9847,  0.9218,  0.9079,  0.9101,
         0.5409,  0.3019,  0.9677,  0.8593,  0.1681,  0.8920,  0.9467,
        -0.0910,  0.7456,  0.8968,  0.4954,  0.5109,  0.7607,  0.8041,
         0.8984,  0.9601,  0.6709,  0.9868,  0.9706,  0.4391,  0.8485,
         0.9575,  0.9822, -0.0180,  0.5093,  0.7102,  0.7366,  0.8678,
         0.8908,  0.5647,  0.9727,  0.3800,  0.9579,  0.8388,  0.4312,
         0.9720, -0.2814,  0.6763, -0.5846, -0.2940,  0.8819,  0.4929,
         0.8932,  0.4581,  0.2797,  0.9033,  0.7612,  0.7629,  0.9515,
         0.8686,  0.3478,  0.1659,  0.5158,  0.2029,  0.9308,  0.9102,
         0.9635,  0.9676,  0.4641,  0.9666,  0.9076,  0.9939,  0.9367,
      

In [119]:
output

tensor(1.7871)