In [221]:
import torch
import spacy
import re
import string
import torch.nn as nn
import numpy as np

from collections import Counter

import pandas as pd

from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import torch.nn.functional as F

In [64]:
#input
x = torch.tensor([[1,2, 12,34, 56,78, 90,80],
                 [12,45, 99,67, 6,23, 77,82],
                 [3,24, 6,99, 12,56, 21,22]])
print(x.shape)

torch.Size([3, 8])


In [65]:
model1 = nn.Embedding(num_embeddings=100, embedding_dim=7, padding_idx=0)  # map each index in a fixed-size dictionary of size 100 to a dense vector of size 7
model2 = nn.LSTM(input_size=7, hidden_size=3, num_layers=1, batch_first=True)

In [66]:
out1 = model1(x)
print(type(out1))  # torch.Tensor
print(out1.shape)  # ([3, 8, 7]) --> each element in x is mapped to a vector of size 7
out2 = model2(out1)
print(type(out2))  # tuple
print(type(out2[0]))  # torch.Tensor --> consolidated output of all hidden states in the sequence --> shape [batch_size, seq_len, hidden_size]
print(type(out2[1][0]))  # torch.Tensor --> hidden state of the last LSTM unit - the final output
print(type(out2[1][1]))  # torch.Tensor --> cell state


<class 'torch.Tensor'>
torch.Size([3, 8, 7])
<class 'tuple'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>


## Loading the data

In [67]:
df = pd.read_csv("data/train.csv")
print(df.shape)
df.head()

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Preprocessing (tokenization)

In [70]:
!python -m spacy download en_core_web_sm
tok = spacy.load('en_core_web_sm')

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 653.6 kB/s eta 0:00:20
     - -------------------------------------- 0.4/12.8 MB 2.6 MB/s eta 0:00:05
     -- ------------------------------------- 0.7/12.8 MB 3.6 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 4.7 MB/s eta 0:00:03
     ---- ----------------------------------- 1.5/12.8 MB 5.3 MB/s eta 0:00:03
     ------ --------------------------------- 2.0/12.8 MB 5.9 MB/s eta 0:00:02
     ------- -------------------------------- 2.5/12.8 MB 6.7 MB/s eta 0:00:02
     --------- ------------------------------ 3.1/12.8 MB 7.1 MB/s eta 0:00:02
     ----------- ---------------------------- 3

In [77]:
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [80]:
print(tokenize("hugo!!!hase"))

['hugo', '  ', 'hase']


In [84]:
#count number of occurences of each word
counts = Counter()
for index, row in df.iterrows():
    counts.update(tokenize(row['text']))

In [85]:
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 22129
num_words after: 7019


In [86]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}  # map words to their index
words = ["", "UNK"]  # list of words indexed according to their appearance order
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [172]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return [encoded]

In [173]:
df['encoded'] = df['text'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df.head()

Unnamed: 0,id,keyword,location,text,target,encoded
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[[16, 17, 18, 19, 1, 1, 9, 20, 0, 0, 0, 0, 0, ..."
2,5,,,All residents asked to 'shelter in place' are ...,1,"[[15, 21, 22, 23, 9, 24, 25, 26, 9, 4, 27, 1, ..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[[36, 37, 38, 9, 39, 32, 34, 25, 40, 0, 0, 0, ..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[[41, 42, 43, 8, 44, 45, 1, 9, 46, 47, 48, 45,..."


## Dataset

In [248]:
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        print(X[3][0])
        self.X = X
        self.y = Y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx]
        #x_tensor = torch.tensor(self.X[idx][0], dtype=torch.int32)
        #return x_tensor, self.y[idx], self.X[idx][1]

In [249]:
X = list(df['encoded'])
y = list(df['target'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [250]:
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)

[ 127  231    9 1538    9  461   23 3311    9 2204 1623   87  804 2017
    9  473    9   50   83  165   83  150  166 5095    1    1    9 5391
    9 3857 5396  165   83  150  166 1442    9    1    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
[ 220  180    8  236 5976   76    9 4195   85 3399    9 1728  273   25
    9 1567    1    9  165   83  150  166 3954    1    9 3959  165   83
  150  166    1  150    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [251]:
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

## Training loop

In [271]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [272]:
class LSTM_fixed_len(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 2)  # change to 5 for 5 classes
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [273]:
model_fixed = LSTM_fixed_len(7100, 70, 70)
train_model(model_fixed, epochs=30, lr=0.01)

train loss 0.685, val loss 0.684, val accuracy 0.570, and val rmse 0.655
train loss 0.683, val loss 0.684, val accuracy 0.570, and val rmse 0.655
train loss 0.684, val loss 0.684, val accuracy 0.570, and val rmse 0.655
train loss 0.683, val loss 0.683, val accuracy 0.570, and val rmse 0.655
train loss 0.683, val loss 0.683, val accuracy 0.570, and val rmse 0.655
train loss 0.683, val loss 0.683, val accuracy 0.570, and val rmse 0.655
