In [48]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

## Get Yelp Reviews Lite dataset

In [49]:

# Notebook-wide configuration, collected in one Namespace so later cells can
# read settings as attributes (args.seed, args.train_proportion, ...).
args = Namespace(
    # Raw training CSV; read into `train_reviews` by the next cell.
    raw_train_dataset_csv="data/yelp/train.csv",
    # Raw test CSV. NOTE(review): not read by any cell visible here.
    raw_test_dataset_csv="data/yelp/test.csv",
    # Fraction of each rating class kept when building the "lite" subset.
    proportion_subset_of_train=0.1,
    # Fractions of that per-rating subset labelled 'train', 'val' and 'test'.
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # Presumably the destination for the processed frame -- TODO confirm; no
    # visible cell writes to this path.
    output_munged_csv="data/yelp/reviews_with_splits_lite.csv",
    # Passed to np.random.seed before the per-rating shuffles.
    seed=1337
)


In [50]:
# Read raw data
# header=None tells pandas the file has no header row, so its first line is
# data; the two columns are named 'rating' and 'review' here.
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])


In [51]:
# Summary statistics of the raw frame; only the numeric 'rating' column is
# reported in the output below.
train_reviews.describe()

Unnamed: 0,rating
count,650000.0
mean,3.0
std,1.414215
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


In [52]:

# Bucket the raw reviews by star rating so that every rating class can be
# subsampled and split in the same proportions.
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    record = row.to_dict()
    by_rating[row.rating].append(record)

# Seed NumPy's global RNG so the per-rating shuffles below are reproducible.
np.random.seed(args.seed)

# Build the split data one rating class at a time, in ascending rating order.
final_list = []
for _, item_list in sorted(by_rating.items()):
    np.random.shuffle(item_list)

    # Sizes: keep a fraction of the class, then carve that subset into splits.
    n_total = len(item_list)
    n_subset = int(args.proportion_subset_of_train * n_total)
    n_train = int(args.train_proportion * n_subset)
    n_val = int(args.val_proportion * n_subset)
    n_test = int(args.test_proportion * n_subset)

    # Consecutive [start, stop) slices of the shuffled list, one per split.
    val_start = n_train
    test_start = val_start + n_val
    subset_end = test_start + n_test
    split_bounds = (
        ('train', 0, val_start),
        ('val', val_start, test_start),
        ('test', test_start, subset_end),
    )

    # Give each data point in the subset its split attribute.
    for split_name, start, stop in split_bounds:
        for item in item_list[start:stop]:
            item['split'] = split_name

    # Items past subset_end are left out of the final list.
    final_list.extend(item_list[:subset_end])

final_reviews = pd.DataFrame(final_list)

In [53]:
# minimal clean
def preprocess_text(text):
    """Apply minimal cleaning to a review string.

    The text is lower-cased, each of ``.``, ``,``, ``!`` and ``?`` is padded
    with a space on both sides, and every run of characters other than ASCII
    letters and those four punctuation marks is collapsed to a single space.

    Args:
        text (str): the raw review text
    Returns:
        str: the cleaned text
    """
    lowered = text.lower()
    # Pad punctuation so each mark ends up separated from neighbouring words.
    padded = re.sub(r"([.,!?])", r" \1 ", lowered)
    # Digits, apostrophes, whitespace runs, etc. all become one space.
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", padded)

In [54]:
# Replace the raw review text with its cleaned form, row by row.
final_reviews['review'] = final_reviews['review'].apply(preprocess_text)

In [55]:
# Show the processed review frame (rating, review, split) as the cell output.
final_reviews

Unnamed: 0,rating,review,split
0,1,"the sushi was fine , but after last night s ex...",train
1,1,rude and obnoxious people work here ! i will n...,train
2,1,stay away from olympic garden unless you are a...,train
3,1,horrible horrible experience . is selling pupp...,train
4,1,where do i start . . . if you are looking for ...,train
...,...,...,...
64995,5,as good as it gets for the dennys ihave been t...,test
64996,5,we enjoyed lillie s teppanyaki or as they call...,test
64997,5,we just purchased a vehicle from centennial la...,test
64998,5,they make great vegetarian thai food they do t...,test


## PyTorch Dataset class

In [56]:
from torch.utils.data import Dataset

In [57]:
class ReviewDataset(Dataset):
    """Map-style PyTorch dataset over a review DataFrame with named splits.

    The frame is partitioned by its 'split' column into 'train', 'val' and
    'test' views. Exactly one view is active at a time (see `set_split`);
    `__len__` and `__getitem__` operate on the active view only.
    """

    def __init__(self, review_df, vectorizer):
        """
        Args:
            review_df (pandas.DataFrame): the dataset; must have the columns
                'split', 'review' and 'rating'
            vectorizer (ReviewVectorizer): vectorizer used to encode reviews;
                must provide `vectorize(review)` and
                `rating_vocab.lookup_token(rating)`
        """
        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df.split == 'val']
        self.val_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df.split == 'test']
        self.test_size = len(self.test_df)

        # split name -> (frame, number of rows); consulted by set_split
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.val_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        """ Load dataset and make a new vectorizer from scratch

        The vectorizer is built from the 'train' rows only, so that nothing
        from the validation or test reviews leaks into what it learns.

        Args:
            review_csv (str): location of the dataset
        Returns:
            an instance of ReviewDataset
        """
        review_df = pd.read_csv(review_csv)
        train_review_df = review_df[review_df.split == 'train']
        return cls(review_df, ReviewVectorizer.from_dataframe(train_review_df))

    def get_vectorizer(self):
        """ Returns the vectorizer """
        return self._vectorizer

    def set_split(self, split="train"):
        """ Select the active split using the 'split' column of the dataframe

        Args:
            split (str): one of "train", "val", or "test"
        Raises:
            KeyError: if `split` is not one of the known split names
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        """ Number of rows in the active split """
        return self._target_size

    def __getitem__(self, index):
        """ the primary entry point method for PyTorch datasets

        Args:
            index (int): the index to the data point
        Returns:
            a dict of the data point's features (x_data) and label (y_target)

        """
        # Positional lookup within the active split only
        row = self._target_df.iloc[index]

        review_vector = self._vectorizer.vectorize(row.review)
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.rating)

        return {'x_data': review_vector,
                'y_target': rating_index}

    def get_num_batches(self, batch_size):
        """ Given a batch size, return the number of batches in the dataset

        Uses floor division, so a trailing partial batch is not counted.
        """
        return len(self) // batch_size


In [None]:
class Vocabulary(object):
    """ Class to process text and extract Vocabulary for mapping

    Maintains two mirrored dictionaries, token -> index and index -> token,
    and optionally reserves an index for an unknown-token placeholder.
    """

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the vocabulary
        """
        # Create a fresh dict per instance rather than using a mutable default
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        # Reverse map, kept in sync with _token_to_idx by add_token
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token

        # -1 means "no UNK token in this vocabulary"
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary

        Args:
            contents (dict): a dictionary with the keys produced by
                `to_serializable`; it is passed to the constructor as
                keyword arguments
        """
        return cls(**contents)

    def add_token(self, token):
        """ Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            # Already known: reuse its index and leave both maps untouched
            index = self._token_to_idx[token]
        else:
            # New token: assign the next index (current vocabulary size)
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index