In [1]:
import nltk

from collections import Counter

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import torch 
import torch.nn as nn
import torch.nn.functional as F

from tqdm.auto import tqdm

import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
read_json = False
parquet_path = 'D:/Documentos/Estudos/Projeto-NLP/dataset/yelp_academic_dataset_review.parquet'
json_path = 'D:/Documentos/Estudos/Projeto-NLP/dataset/yelp_academic_dataset_review.json'
glove_path = 'D:/Documentos/Estudos/Projeto-NLP/dataset/glove.twitter.27B.100d.txt'

if read_json:

    df = pd.DataFrame()
    for chunk in pd.read_json(json_path, lines= True, chunksize= 100000):
        df = pd.concat([df, chunk[['stars', 'text']]])

    df.to_parquet(parquet_path, compression='gzip')

In [10]:
df = next(pd.read_json(json_path, lines= True, chunksize= 1000, ))[['stars', 'text']]

In [11]:
tokenizer = nltk.tokenize.WordPunctTokenizer()

def clean_text(text):
    text = re.sub(r"[^\w\s']", '', str(text))
    return ' '.join(tokenizer.tokenize(str(text))).lower()



df['text'] = df['text'].apply(clean_text)

In [13]:
data_val, data_test = train_test_split(df, test_size= 0.1, random_state= 42)
data_train, data_val = train_test_split(data_val, test_size= 0.222)

print(data_val.shape, data_test.shape, data_train.shape)

(19980, 2) (10000, 2) (70020, 2)


In [14]:
data_train.head()

Unnamed: 0,stars,text
36209,5,stromboli 795 look at the size of this and wit...
86874,1,if i could give no stars i would we order iced...
64802,5,i was looking for a lunch spot near the boeing...
56609,5,karma has been catering delicious salads for m...
51581,5,this place is amazing every time my only thoug...


In [15]:
all_text = ' '.join(data_train['text'].values)

all_tokens = tokenizer.tokenize(all_text)

tokens_count = Counter(all_tokens)

In [16]:
tokens_count.most_common(10)

[('the', 365921),
 ('and', 252126),
 ('i', 206887),
 ('a', 186817),
 ('to', 163291),
 ("'", 133860),
 ('was', 130913),
 ('it', 106521),
 ('of', 103077),
 ('is', 88560)]

In [17]:

tokens =  sorted(t for t, c in tokens_count.items() if c >= 10)
                 
UNK, PAD = "UNK", "PAD"

tokens = [UNK, PAD] + tokens

In [18]:
len(tokens)

14228

In [19]:
token_to_id = {k: i for i, k in enumerate(tokens)}

In [20]:
embedding_index = {}

with open(glove_path, 'r', encoding= 'utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        if word in token_to_id:
            coefs = np.asarray(values[1:], dtype= 'float32')
            embedding_index[word] = coefs

In [21]:
embedding_dim = 100
tokens_size = len(token_to_id)

embedding_matrix = np.zeros((tokens_size, embedding_dim))

UNK_VEC = np.array(list(embedding_index.values())).mean(axis= 0)
PAD_VEC = np.zeros(100)

embedding_matrix[0] = UNK_VEC
embedding_matrix[1] = PAD_VEC

for word, i in token_to_id.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = UNK_VEC
    

In [26]:
type(UNK_IX)

int

In [22]:
UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])

def as_matrix(sequences, max_len=None):
    """ Convert a list of tokens into a matrix with padding """
    if isinstance(sequences[0], str):
        sequences = list(map(str.split, sequences))

    max_len = min(max(map(len, sequences)), max_len or float('inf'))
    
    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))
    for i,seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix
    
    return matrix

In [43]:
data_train.head()

Unnamed: 0,stars,text
3121,5,best haircut i ' ve ever had ! i recently move...
9531,4,our second time to cafe fleur de lis ! we went...
14930,1,absolutely terrible in every aspect . my fianc...
62636,5,"my husband thoughtfully purchased the "" empres..."
34212,1,what a crock . my wife and i had two vehicles ...


In [54]:
device = "cuda" if torch.cuda.is_available() else "cpu"
TARGET_COLUMN = 'stars'

def to_tensors(batch, device):
    batch_tensors = dict()
    for key, arr in batch.items():
        if key == 'text':
            batch_tensors[key] = torch.tensor(arr, device= device, dtype= torch.int64)
        else:
            batch_tensors[key] = torch.tensor(arr, device= device)

    return batch_tensors

def apply_word_dropout(matrix, keep_prop, replace_with=UNK_IX, pad_ix=PAD_IX,):
    dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1 - keep_prop])
    dropout_mask &= matrix != pad_ix
    return np.choose(dropout_mask, [matrix, np.full_like(matrix, replace_with)])

def make_batch(data, max_len=None, word_dropout=0, device=device):
    """
    Creates a keras-friendly dict from the batch data.
    :param word_dropout: replaces token index with UNK_IX with this probability
    :returns: a dict with {'title' : int64[batch, title_max_len]
    """
    batch = {}
    batch["text"] = as_matrix(data["text"].values, max_len)
    
    if word_dropout != 0:
        batch["text"] = apply_word_dropout(batch["text"], 1. - word_dropout)
    
    if TARGET_COLUMN in data.columns:
        batch[TARGET_COLUMN] = data[TARGET_COLUMN].values
    
    return to_tensors(batch, device)

In [45]:
data_train[:3]

Unnamed: 0,stars,text
3121,5,best haircut i ' ve ever had ! i recently move...
9531,4,our second time to cafe fleur de lis ! we went...
14930,1,absolutely terrible in every aspect . my fianc...


In [46]:
class LSTMPredictor(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(LSTMPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype= torch.float32))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_size= hidden_dim, batch_first= True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        

    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cel) = self.lstm(x)
        feat = hidden[-1]
        out_fc = self.fc(feat)
        output = F.relu(out_fc)

        return output


In [55]:
def iterate_minibatches(data, batch_size=256, shuffle=True, cycle=False, device=device, **kwargs):
    """ iterates minibatches of data in random order """
    while True:
        indices = np.arange(len(data))
        if shuffle:
            indices = np.random.permutation(indices)

        for start in range(0, len(indices), batch_size):
            batch = make_batch(data.iloc[indices[start : start + batch_size]], device=device, **kwargs)
            print(batch)
            yield batch
        
        if not cycle: break

In [48]:
BATCH_SIZE = 128
EPOCHS = 10

In [49]:
def print_metrics(model, data, batch_size=BATCH_SIZE, name="", device=torch.device('cpu'), **kw):
    squared_error = abs_error = num_samples = 0.0
    model.eval()
    with torch.no_grad():
        for batch in iterate_minibatches(data, batch_size=batch_size, shuffle=False, device=device, **kw):
            batch_pred = model(batch)
            squared_error += torch.sum(torch.square(batch_pred - batch[TARGET_COLUMN]))
            abs_error += torch.sum(torch.abs(batch_pred - batch[TARGET_COLUMN]))
            num_samples += len(batch_pred)
    mse = squared_error.detach().cpu().numpy() / num_samples
    mae = abs_error.detach().cpu().numpy() / num_samples
    print("%s results:" % (name or ""))
    print("Mean square error: %.5f" % mse)
    print("Mean absolute error: %.5f" % mae)
    return mse, mae


In [50]:
hidden_dim = 128
output_dim = 1

model = LSTMPredictor(tokens_size, embedding_dim, hidden_dim, output_dim, embedding_matrix).to(device)

criterion = nn.MSELoss(reduction= 'sum')
optimizer = torch.optim.SGD(model.parameters(), lr= 1e-4)


In [57]:
for epoch in range(EPOCHS):
    print(f"Epoch: {epoch}")
    model.train()
    for i, batch in tqdm(enumerate(
        iterate_minibatches(data_train, batch_size= BATCH_SIZE, device= device)),
        total = len(data_train) // BATCH_SIZE
    ):
        pred = model(batch)
        loss = criterion(pred, batch[TARGET_COLUMN])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print_metrics(model, data_val, device= device)        
        

Epoch: 0


  0%|          | 0/547 [00:00<?, ?it/s]

{'text': tensor([[ 5289, 13831,  5738,  ...,     1,     1,     1],
        [12861, 13831,  8470,  ...,     1,     1,     1],
        [13879, 12210,  2165,  ...,     1,     1,     1],
        ...,
        [13879,  5973,  8970,  ...,     1,     1,     1],
        [ 6875,    57, 10918,  ...,     1,     1,     1],
        [13928,  6204, 11047,  ...,     1,     1,     1]], device='cuda:0'), 'stars': tensor([3, 5, 1, 4, 4, 4, 5, 4, 4, 3, 2, 4, 5, 5, 4, 1, 5, 5, 5, 4, 3, 3, 5, 5,
        5, 3, 5, 4, 3, 4, 3, 1, 5, 2, 5, 3, 4, 4, 5, 1, 4, 5, 5, 5, 2, 5, 1, 5,
        5, 5, 4, 4, 3, 5, 4, 3, 4, 5, 4, 5, 4, 5, 1, 1, 5, 3, 5, 5, 1, 3, 1, 4,
        1, 5, 1, 2, 5, 5, 5, 1, 5, 4, 5, 3, 4, 5, 1, 3, 5, 1, 3, 4, 5, 5, 5, 4,
        2, 5, 5, 4, 4, 3, 5, 1, 5, 3, 5, 5, 1, 4, 4, 4, 4, 3, 2, 5, 5, 2, 1, 2,
        1, 5, 4, 5, 5, 4, 4, 5], device='cuda:0')}





TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not dict