In [106]:
import torch
import numpy as np
import pandas as pd
from collections import Counter
import string
import collections
import re

# yelp reviews

In [107]:
from argparse import Namespace

args = Namespace(
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv='yelp_small.csv',
    save_dir='model_storage/yelp/',
    vectorizer_file="vectorizer.json",
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    train_proportion=80,
    val_proportion=10,
    test_proportion=10,
    
)

In [108]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

def split_data(review_subset):
    by_rating = collections.defaultdict(list)
    for _, row in review_subset.iterrows():
        by_rating[row.stars].append(row.to_dict())

    final_list = []
    np.random.seed(args.seed)

    for _, item_list in sorted(by_rating.items()):
        np.random.shuffle(item_list)

        n_total = len(item_list)
        n_train = int(args.train_proportion * n_total)
        n_val = int(args.val_proportion * n_total)
        n_test = int(args.test_proportion * n_total)

        for item in item_list[:n_train]:
            item['split'] = 'train'
        for item in item_list[n_train:n_train+n_val]:
            item['split'] = 'val'
        for item in item_list[n_train+n_val:n_train+n_val+n_test]:
            item['split'] = 'test'

        final_list.extend(item_list)

    final_reviews = pd.DataFrame(final_list)
    final_reviews.text = final_reviews.text.apply(preprocess_text)
    return final_reviews

In [109]:
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, review_df, vectorizer):
        """
        Args: 
            review_df (pendas.DataFrame) the dataset
            vectorizer (reviewVectorizer) vectorizer instantiated from dataset
        """
        self.review_df = split_data(review_df)
        self._vectorizer = vectorizer
        
        self.train_df = self.review_df[self.review_df.split=="train"]
        self.train_size = len(self.train_df)
        
        self.val_df = self.review_df[self.review_df.split=="val"]
        self.validation_size = len(self.val_df)
        
        self.test_df = self.review_df[self.review_df.split=="test"]
        self.test_size = len(self.test_df)
        
        self._lookup_dict = {"train": (self.train_df, self.train_size),
                             "val": (self.val_df, self.validation_size),
                             "test": (self.test_df, self.test_size)}
        
        self.set_split('train')
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        review_df = pd.read_csv(review_csv)
        return cls(review_df, ReviewVectorizer.from_dataframe(review_df))
    
    def get_vectorizer(self):
        return self._vectorizer
    
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        return self._target_size
    
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        
        review_vector = self._vectorizer.vectorize(row.text)
        
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.stars)
        
        return {'x_data': review_vector,
                'y_target': rating_index}
    
    def get_num_batches(self, batch_size):
        return len(self) // batch_size
    
    

In [110]:
class Vocabulary(object):
    """Class to process text and extract Vocabulary for mapping"""
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        self.unk_token = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
    
    def to_serializable(self):
        return {'token_to_idx': self.token_to_idx, 'add_unk': self._add_unk,
                'unk_token': self._unk_token}
    
    @classmethod
    def from_serializable(cls, content):
        return cls(**content)
    
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    
    def lookup_token(self, token):
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
        
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    
    def __str__(self):
        return "<Vocabulary(size=%d)" % len(self)
    
    def __len__(self):
        return len(self._token_to_idx)

In [111]:
class ReviewVectorizer(object):
    def __init__(self, review_vocab, rating_vocab):
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab
        
    def vectorize(self, review):
        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)
        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1
        return one_hot
    
    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)
        
        for rating in sorted(set(review_df.stars)):
            rating_vocab.add_token(rating)
            
        word_counts = Counter()
        for review in review_df.text:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)
                
        return cls(review_vocab, rating_vocab)
    
    @classmethod
    def from_serializable(cls, contents):
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])
        
        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)
    
    def to_serializable(self):
        return {'review_vocab': self.review_vocab.to_serializable(), 
                'rating_vocab': self.rating_vocab.to_serializable()}

In [112]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                           shuffle=False, drop_last=drop_last)
    
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [113]:
import torch.nn as nn
import torch.nn.functional as F

class ReviewClassifier(nn.Module):
    def __init__(self, num_features):
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)
        
    def forward(self, x_in, apply_sigmoid=False):
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = F.sigmoid(y_out)
        return y_out

In [114]:
import torch.optim as optim

def make_train_state(args):
    return {'epoch_index': 0, 
           'train_loss': [],
            'train_acc': [],
           'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1}
train_state = make_train_state(args)

args.device = "cpu"

dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
vectorizer = dataset.get_vectorizer()

classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
classifier = classifier.to(args.device)

loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

In [115]:
from torch.utils.data import DataLoader

for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index
    
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=args.batch_size,)
    running_loss=0.0
    running_acc =0.0
    classifier.train()
    
    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()
        
        y_pred = classifier(x_in=batch_dict['x_data'].float())
        
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        
        loss.backward()
        
        optimizer.step()
        
        #acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        #running_acc (acc_batch - running_acc) / (batch_index + 1)
    
    #train_state['train_loss'].append(running_loss)
    #train_state['train_acc'].append(running_acc)
    
    dataset.set_split("val")
    batch_generator = generate_batches(dataset, batch_size=args.batch_size,)
    running_loss=0.0
    running_acc =0.0
    classifier.eval()
    
    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()
        
        y_pred = classifier(x_in=batch_dict['x_data'].float())
        
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        
        
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc = (acc_batch - running_acc) / (batch_index + 1)
    #train_state['val_loss'].append(running_loss)
    #train_state['val_acc'].append(running_acc)

In [116]:
dataset.set_split('train')
batch_generator = generate_batches(dataset, batch_size=args.batch_size,)
for batch_index, batch_dict in enumerate(batch_generator):
    print(batch_dict)

{'x_data': tensor([[1., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]]), 'y_target': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1])}
{'x_data': tensor([[1., 0., 1.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.]]), 'y_target': tensor([1,

In [117]:
def predict_rating(review, classifier, vectorizer, decision_threshold=0.5):
    """
    review: text to review
    classifier: trained model,
    
    """
    review = preprocess_text(review)
    vectorized_review = torch.tensor(vectorizer.vectorize(review))
    result =classifier (vectorized_review.view(1, -1))
    
    probability_value = F.sigmoid(result).item()
    
    index = 1
    if probability_value < decision_threshold:
        index = 0
    
    return vectorizer.rating_vocab.lookup_index(index)


In [118]:
test_review = "this is so bad, I hate it"
prediction = predict_rating(test_review, classifier, vectorizer)
print("{} -> {}".format(test_review, prediction))

this is so bad, I hate it -> 2




In [125]:
test_review = "this is so good, I like the ambiance and the nice tables"
prediction = predict_rating(test_review, classifier, vectorizer)
print("{} -> {}".format(test_review, prediction))

this is so good, I like the ambiance and the nice tables -> 2


In [122]:
fc1_weights = classifier.fc1.weight.detach()[0]
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

print(" most best:")
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

 most best:
meal
fan
typical
spot
although
liked
chips
salad
little
pretty
aussi
packed
though
ambiance
side
menu
quite
tables
nice
sauce


In [124]:
print("bad")
indices.reverse()
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

bad
well,
On
place!
delicious.


This
go.
menu,
cheese.
in,
2
I'll
Our
now,
très
Les
Un
you.
Il
it,
couldn't
