In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
import pyarrow.parquet as pq
from transformers import BertTokenizerFast, BertForSequenceClassification, DistilBertForSequenceClassification 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm, trange

In [2]:
DATA_PATH = r"C:\Users\zoopy\code\review-assist\data\yelp_academic_dataset_review.json"
WRITE_PATH = r"C:\Users\zoopy\code\review-assist\data\yelp_academic_dataset_review_short.json"
FINAL_WRITE_PATH = r"C:\Users\zoopy\code\review-assist\data\yelp_restaurant_reviews.parquet"
FINAL_WRITE_PATH_TRAIN = r"C:\Users\zoopy\code\review-assist\data\yelp_restaurant_reviews_train.parquet"
FINAL_WRITE_PATH_TEST = r"C:\Users\zoopy\code\review-assist\data\yelp_restaurant_reviews_test.parquet"
ALREADY_DATA = True
ACTUALLY_TRAIN = True
DISTIL = True

In [3]:
if not ALREADY_DATA:
    with open(DATA_PATH, mode='r', encoding='utf8') as in_file, \
        open(WRITE_PATH, mode='w', encoding='utf8') as out_file:
            for i in range(300000):
                out_file.write(next(in_file))
    df = pd.read_json(WRITE_PATH, lines=True)
    print(df["stars"].value_counts())
    print("-"*50)
    print(df.count())

    df['text_length'] = df['text'].apply(len)
    sns.displot(df,x='text_length')
    
    df_train = df.loc[df['text_length'] <= 2500]
    df_train["stars"] -= 1
    
    sns.displot(df_train,x='text_length')
    train, test = train_test_split(df_train, test_size=0.1)
    train.to_parquet(FINAL_WRITE_PATH_TRAIN)
    test.to_parquet(FINAL_WRITE_PATH_TEST)
else:
    df_train = pd.read_parquet(FINAL_WRITE_PATH_TRAIN)
    df_test = pd.read_parquet(FINAL_WRITE_PATH_TEST)

In [184]:
if not ACTUALLY_TRAIN:
    df = df_train.sample(n=5000)

In [21]:
df = df_train.sample(n=100000)

In [22]:
df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,text_length
114941,41INyvULcU96e4cHfMXVhw,MIxQkyyvKg1ZgLvdihSigQ,PnJOVC9WGuMrNi5vQ04gMA,4,3,0,0,"Great atmosphere....professional, upscale and ...",2013-06-27 11:41:54,916
234394,k3dywgFifSpAaQGJ8rpk1g,5zNVELV8Huei5LEvAdUuXg,WbA5ud4InNWkizW7HE5kRQ,3,0,0,0,Gluten-free paradise and good value. I'm not ...,2014-12-13 20:38:42,683
142658,RvDWy-1pcT84cPUMvkaZ1Q,ysV_DyxdGTyxngavPMCayA,hJrhf5N_B8ifYqHQJwFAEw,1,1,0,0,"The meats were pretty tasty, but the portions ...",2017-07-22 23:46:11,276
254688,cqdw8L8cQVeSOU8zxKBoEQ,5ASyvGMmI-gwmfrlWKc2WQ,1CCaaunP7_hKs7wXSSJsKA,3,0,0,0,This chain is one of my favorite chains to sta...,2019-04-12 13:10:12,126
220527,35-0SuFXrRxIaJeQJM0_SA,txwFfvGkFEfqnaxVmFGOsw,0bPLkL0QhhPO5kt1_EXmNQ,3,3,0,0,"When I was a kid, growing up in New York - the...",2009-12-03 00:38:28,855
...,...,...,...,...,...,...,...,...,...,...
82712,B8k0mQAkrx9arVMEDNKsvA,7WZ1YuYYW5Ez3yzzulA7Kw,S3QHy1sshUeZwXOYviVsXQ,3,1,0,0,I know they have a good cappuccino:) waffles n...,2015-01-06 22:59:46,187
47017,R8FzJEytGgkLzzTBWZ-JUQ,u5WMfKHWsVsYCIEatr6UAw,tr366vgAkbcpJBVKSdBxZg,4,2,0,1,They're reopened as of 10/27/16!!\n\nAmazing p...,2016-10-28 01:17:35,119
157797,mp_YYfuBClBMYh0MhyF-lA,C0h0_bwIAUIuml-8n5rAvA,u7_3L1NBWgxhBM_B-cmmnA,3,0,0,0,Awesome pizza...the crust is perfect. It's lig...,2014-05-30 20:48:21,204
211528,mS2FkYCJKYrQB8i-7VMdcA,5smqsNwhlRFu32YJxMgCIw,q0Fi4n7shUTmlxl-mMPVXA,4,0,0,1,I've never been to an urgent care but a sick d...,2015-12-30 01:20:07,242


In [23]:
def preprocess(input_text, tokenizer):
    return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        padding = "max_length",
                        max_length=256,
                        truncation=True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

# Initialize Bert Tokenizer
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True)

text = df.text.values
labels = df.stars.values

token_id = []
attention_masks = []

for sample in tqdm(text):
    encoding_dict = preprocess(sample, tokenizer)
    token_id.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])

token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = F.one_hot(torch.tensor(labels), num_classes=5).to(torch.float)

100%|████████████████████████████████████████████████████████████████████████| 100000/100000 [00:38<00:00, 2615.61it/s]


In [24]:
# Load the BertForSequenceClassification model
if not DISTIL:
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels = 5,
        output_attentions = False,
        output_hidden_states = False,
    )

else:
    # Load the DistilBertForSequenceClassification model
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels = 5,
        output_attentions = False,
        output_hidden_states = False,
    )

NameError: name 'DISTIL' is not defined

In [8]:
validation_ratio = 0.2
batch_size = 16

train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = 0.2,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [9]:
def compute_batch_accuracy(logits, labels):
    preds = np.argmax(logits, axis = 1).flatten()
    truth = np.argmax(labels, axis=1).flatten()
    return accuracy_score(truth, preds)


In [15]:
# Pytorch training loop

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 2e-5,
                              eps = 1e-08)

# Run on GPU
model.cuda()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 3

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    train_loss = 0
    nb_train_examples, nb_train_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids,  
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        # Update tracking variables
        train_loss += train_output.loss.item()
        nb_train_examples += b_input_ids.size(0)
        nb_train_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
    eval_loss = []
    
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              attention_mask = b_input_mask)
        eval_loss.append(eval_output.loss)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        batch_accuracy = compute_batch_accuracy(logits, label_ids)
        val_accuracy.append(batch_accuracy)
    
    print('\n\t - Train loss: {:.4f}'.format(train_loss / nb_train_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))

Epoch:  33%|█████████████████████████▎                                                  | 1/3 [15:11<30:23, 911.79s/it]


	 - Train loss: 0.1984
	 - Validation Accuracy: 0.6855


Epoch:  67%|██████████████████████████████████████████████████▋                         | 2/3 [30:22<15:10, 910.91s/it]


	 - Train loss: 0.1559
	 - Validation Accuracy: 0.6814


Epoch: 100%|████████████████████████████████████████████████████████████████████████████| 3/3 [45:32<00:00, 910.76s/it]


	 - Train loss: 0.1198
	 - Validation Accuracy: 0.6770





## Parameters

### Model 1
50,000 Samples
256 tokens
3 epochs 
lr = 5e-5,
eps = 1e-08
Train loss: 0.2157
Validation Accuracy: 0.6926

### Model 2

100,000 Samples
256 tokens
3 epochs 
lr = 5e-5,
eps = 1e-08
Train loss: 0.2251
Validation Accuracy: 0.6970

### Model 3

100,000 Samples
256 tokens
2 epochs 
lr = 5e-5,
eps = 1e-08
Train loss: 0.2296
Validation Accuracy: 0.7033



In [19]:
inputs = tokenizer("The waiters were really nice and great food.", return_tensors="pt")
inputs.pop("token_type_ids")
model.cpu()
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
print(f"The number of stars: {predicted_class_id + 1}")

The number of stars: 5


In [18]:
print(inputs)

{'input_ids': tensor([[  101,  1996, 15610,  2015,  2020,  2428,  3835,  1998,  2307,  2833,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [20]:
import os
os.makedirs("models/distil_50k_samples")
model.save_pretrained("models/distil_50k_samples")

In [None]:
# class YelpRestaurantDataset(Dataset):
#     def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_length: int = 512):
#         self.data = data
#         self.tokenizer = tokenizer
#         self.max_token_length = max_token_length
    
#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index: int):
#         row = self.data.iloc[index]
#         input_text = row.text
#         labels = row.stars
#         encoding = tokenizer.encode_plus(
#                             input_text,
#                             add_special_tokens = True,
#                             padding = "max_length",
#                             max_length = self.max_token_length,
#                             truncation = True,
#                             return_attention_mask = True,
#                             return_tensors = 'pt'
#                        )
#         print(labels)
#         return dict(
#             input_text = input_text,
#             input_ids = encoding["input_ids"].flatten(),
#             attention_mask = encoding["attention_mask"].flatten(),
#             labels = torch.tensor(labels)
#         )

# dataset = YelpRestaurantDataset(df, tokenizer)

# Hugging Face Training (Not used for now)
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=5,
#     weight_decay=0.01,
# )

# trainer = Trainer(
#     model,
#     args,
#     train_dataset=encoded_dataset["train"],
#     eval_dataset=encoded_dataset[validation_key],
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

## TFIDF approach

In [None]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def preprocess_text(text, stem=False, lemmatize=True, remove_urls, stopwords=None):
    if remove_urls:
        text = remove_urls(text)
    
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize
    text_list = word_tokenize()
    ## remove Stopwords
    if stopwords is not None:
        text_list = [word for word in text_list if word not in stopwords]
                
    ## Stemming
    if stem:
        ps = nltk.stem.porter.PorterStemmer()
        text_list = [ps.stem(word) for word in text_list]
                
    ## Lemmatisation
    if lemmatize:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        text_list = [lem.lemmatize(word) for word in text_list]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [None]:
df['preprocessed_text'] = df['text'].apply(preprocess_text)