In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
import pyarrow.parquet as pq
from transformers import BertTokenizerFast, BertForSequenceClassification, DistilBertForSequenceClassification 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm, trange

In [25]:
import os

# Directory holding the raw Yelp dump and every derived file. Set the
# REVIEW_ASSIST_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
DATA_DIR = os.environ.get("REVIEW_ASSIST_DATA_DIR", r"C:\Users\zoopy\code\review-assist\data")

# Raw Yelp review dump (JSON lines) and the truncated copy made from its first lines.
DATA_PATH = os.path.join(DATA_DIR, "yelp_academic_dataset_review.json")
WRITE_PATH = os.path.join(DATA_DIR, "yelp_academic_dataset_review_short.json")
# Parquet outputs. FINAL_WRITE_PATH is not referenced by the cells visible in this notebook.
FINAL_WRITE_PATH = os.path.join(DATA_DIR, "yelp_restaurant_reviews.parquet")
FINAL_WRITE_PATH_TRAIN = os.path.join(DATA_DIR, "yelp_restaurant_reviews_train.parquet")
FINAL_WRITE_PATH_TEST = os.path.join(DATA_DIR, "yelp_restaurant_reviews_test.parquet")

# True: load the existing train/test parquet files instead of rebuilding them from the raw dump.
ALREADY_DATA = True
# False: work on a small debug sample instead of the full training sample.
ACTUALLY_TRAIN = True
# True: fine-tune DistilBERT; False: fine-tune BERT.
DISTIL = True

In [3]:
# Build (or load) the train/test splits. Both branches leave `df_train` and
# `df_test` defined, so the cells below behave the same either way.
if not ALREADY_DATA:
    # Copy at most the first 300000 lines of the raw dump into a smaller file.
    # zip() stops cleanly if the dump is shorter, where next() would raise
    # StopIteration.
    with open(DATA_PATH, mode='r', encoding='utf8') as in_file, \
        open(WRITE_PATH, mode='w', encoding='utf8') as out_file:
            for _, line in zip(range(300000), in_file):
                out_file.write(line)
    df = pd.read_json(WRITE_PATH, lines=True)
    print(df["stars"].value_counts())
    print("-"*50)
    print(df.count())

    df['text_length'] = df['text'].apply(len)
    sns.displot(df,x='text_length')

    # Keep reviews of at most 2500 characters. The explicit copy makes the
    # in-place label shift below act on an independent frame rather than on a
    # slice of `df` (which triggers pandas' SettingWithCopyWarning).
    df_filtered = df.loc[df['text_length'] <= 2500].copy()
    # Shift star ratings down by one so they can be used as zero-based class ids
    # (the inference cell adds 1 back when printing).
    df_filtered["stars"] -= 1

    sns.displot(df_filtered,x='text_length')
    # Hold out 10% as the test split and persist both parts.
    df_train, df_test = train_test_split(df_filtered, test_size=0.1)
    df_train.to_parquet(FINAL_WRITE_PATH_TRAIN)
    df_test.to_parquet(FINAL_WRITE_PATH_TEST)
else:
    df_train = pd.read_parquet(FINAL_WRITE_PATH_TRAIN)
    df_test = pd.read_parquet(FINAL_WRITE_PATH_TEST)

In [184]:
# Debug mode: work on a small random subset of the training split.
# n is capped at the frame size because DataFrame.sample raises a ValueError
# when asked for more rows than exist (sampling without replacement).
if not ACTUALLY_TRAIN:
    df = df_train.sample(n=min(5000, len(df_train)))

In [39]:
# Full run: draw the 200000-review training sample. Guarded by ACTUALLY_TRAIN so
# that it does not overwrite the small debug sample taken when the flag is False;
# n is capped at the frame size so a smaller dataset does not raise.
if ACTUALLY_TRAIN:
    df = df_train.sample(n=min(200000, len(df_train)))

In [38]:
# Preview the training split; pandas abbreviates the middle rows of a large frame.
df_train

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,text_length
31096,zbF_yFyW0KDdizoY7bwWXQ,GrlOQH1w3jIAhrziW6iuyQ,5ewePyN_mEE_79OmYyYBEA,0,4,1,0,I work until midnight by the time I get to sah...,2014-12-02 16:33:29,529
251347,1m-e2lA0wj6UdiH8yFa1nA,oKEcgplgzi4Nid6bexXY9Q,1Pxg1AMf0rEn9QF__ZYoWw,4,6,19,4,"I am a spent ronin, balding, emaciated, with t...",2015-01-14 21:12:32,1979
74260,0Y48NTxXzEbQQsZDwZesQQ,UXbCcmkYGl3DH_Py5UOtbQ,UCMSWPqzXjd7QHq7v8PJjQ,4,8,6,10,We had lunch here today and loved it. We were...,2015-01-22 05:21:20,2007
3668,8QTfi0GOXvK5Ig4-4q9Waw,nRBuwri2nzmFHUdOX9xO1Q,wQWhY5vA3ESMh6qFHMYvrg,4,0,0,0,My mani pedi at Queen Nails was great! I've be...,2014-03-17 15:42:54,199
83771,FjRjpxUYZ-ntIQo-1Wa1Aw,bRCS17rhy_hif-X8Q8i_Dw,xkTjLbBC7uB-rAIDqAm-sw,4,1,0,0,Best tacos in Tampa I've tasted so far! I've b...,2015-06-09 21:41:28,412
...,...,...,...,...,...,...,...,...,...,...
273519,7vemddPMpFDMHLeFCX6mlg,8pLZzJUhW_q13sKUKd-uZA,_O_KBH1MStcUkS1xk5pxsg,3,2,0,1,- Really cute and quirky coffee shop that serv...,2016-09-26 01:04:22,359
287570,qPXfjzFULcvYRbGWxPbqTg,pUNaC4U5JuY2TIoM6rsmmw,7apWV3_bxbRcC2MemII9dQ,4,0,0,0,"Excellent food, very reasonable prices, nice a...",2016-08-24 03:31:54,95
31175,AF80XkMNM0jvPTKSxuzNKw,h9vwGgymTKanLx4DZj_ZaQ,kgeiJzWSiXPnf-3wx7LHIQ,3,2,0,0,"Welcome to Meridian, Panera! Love the Pick Two...",2016-02-01 15:53:53,827
85668,WLEKpsGdq8eXhwqjNO2zhA,2tiS7fqBIwXMTHJe2GHUrg,mm4gSCCJXuAZFWDkESMjmw,0,0,0,0,I hate to do this. My coworkers and I ate the...,2012-06-30 11:34:00,254


In [40]:
def preprocess(input_text, tokenizer, max_length=512):
    """Encode one review as fixed-length model inputs.

    Args:
        input_text: The raw review text to encode.
        tokenizer: A tokenizer exposing ``encode_plus`` (a ``BertTokenizerFast``
            in this notebook).
        max_length: Number of tokens every encoding is padded or truncated to.
            Defaults to 512, the value previously hard-coded here; the
            hyperparameter notes below also compare 256.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these options; callers in
        this notebook read its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        # Pad every sample to the same length so the encodings can be
        # concatenated into one tensor afterwards.
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

# Uncased BERT WordPiece tokenizer (lower-cases its input).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)

text = df.text.values
labels = df.stars.values

# Encode the reviews one at a time, collecting the per-review tensors.
input_id_rows, mask_rows = [], []
for review in tqdm(text):
    encoded = preprocess(review, tokenizer)
    input_id_rows.append(encoded['input_ids'])
    mask_rows.append(encoded['attention_mask'])

# Stack the per-review tensors along the batch dimension.
token_id = torch.cat(input_id_rows, dim=0)
attention_masks = torch.cat(mask_rows, dim=0)

# One-hot encode the five classes as floats.
# NOTE(review): with float targets the Hugging Face classification heads appear
# to pick a multi-label (BCE-with-logits) loss rather than cross-entropy;
# confirm this is the intended objective.
one_hot_labels = F.one_hot(torch.tensor(labels), num_classes=5)
labels = one_hot_labels.to(torch.float)

100%|████████████████████████████████████████████████████████████████████████| 200000/200000 [01:30<00:00, 2202.29it/s]


In [41]:
# Choose the architecture and its matching pretrained checkpoint from the DISTIL
# flag, then build a 5-class sequence classifier on top of it.
if DISTIL:
    model_class, checkpoint_name = DistilBertForSequenceClassification, 'distilbert-base-uncased'
else:
    model_class, checkpoint_name = BertForSequenceClassification, 'bert-base-uncased'

model = model_class.from_pretrained(
    checkpoint_name,
    num_labels=5,
    # Only logits (and the loss) are needed downstream.
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

In [42]:
validation_ratio = 0.2
batch_size = 16

# Split row positions (not the tensors themselves) into train / validation,
# stratified on the labels.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size=validation_ratio,
    shuffle=True,
    stratify=labels,
)

# Materialize one TensorDataset per split: (input ids, attention mask, labels).
train_set, val_set = (
    TensorDataset(token_id[idx], attention_masks[idx], labels[idx])
    for idx in (train_idx, val_idx)
)

# Training batches are drawn in random order; validation batches in order.
train_dataloader = DataLoader(train_set, batch_size=batch_size, sampler=RandomSampler(train_set))
validation_dataloader = DataLoader(val_set, batch_size=batch_size, sampler=SequentialSampler(val_set))

In [43]:
def compute_batch_accuracy(logits, labels):
    """Return the fraction of samples whose arg-max logit matches the label.

    Args:
        logits: Array-like of per-class scores, one row per sample.
        labels: Either one row of per-class targets per sample (e.g. one-hot,
            as used by the training loop), in which case the arg-max of each
            row is the true class, or a 1-D array of class indices.

    Returns:
        Accuracy over the batch as a Python float in [0, 1].
    """
    logits = np.asarray(logits)
    labels = np.asarray(labels)
    preds = logits.argmax(axis=1)
    # Per-class targets are reduced to class indices; 1-D labels already are indices.
    truth = labels.argmax(axis=1) if labels.ndim > 1 else labels
    return float(np.mean(preds == truth))


In [44]:
# Pytorch training loop

# Choose the device first and move the model with .to(device), so the model and
# every batch live in the same place and the loop also runs on CPU-only machines
# (an unconditional model.cuda() fails there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 2e-5,
                              eps = 1e-08)

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Running sum of per-batch losses and number of batches seen
    train_loss = 0.0
    nb_train_steps = 0

    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        # Forward pass; passing labels makes the model return a loss.
        # NOTE(review): the labels are float one-hot vectors, for which the
        # Hugging Face heads appear to use a multi-label BCE loss; confirm
        # that is intended.
        train_output = model(b_input_ids,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass, with the gradient norm clipped to 1.0
        train_output.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Update tracking variables
        train_loss += train_output.loss.item()
        nb_train_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_loss = 0.0
    val_correct = 0.0
    nb_val_examples, nb_val_steps = 0, 0

    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            # Labels are passed here too: without them the output carries no
            # loss, so a validation loss could not be reported.
            eval_output = model(b_input_ids,
                                attention_mask = b_input_mask,
                                labels = b_labels)
        val_loss += eval_output.loss.item()
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Weight each batch's accuracy by its size so a smaller final batch
        # does not skew the epoch-level figure.
        batch_accuracy = compute_batch_accuracy(logits, label_ids)
        val_correct += batch_accuracy * len(label_ids)
        nb_val_examples += len(label_ids)
        nb_val_steps += 1

    # max(..., 1) guards against division by zero for an empty dataloader.
    print('\n\t - Train loss: {:.4f}'.format(train_loss / max(nb_train_steps, 1)))
    print('\t - Validation loss: {:.4f}'.format(val_loss / max(nb_val_steps, 1)))
    print('\t - Validation Accuracy: {:.4f}'.format(val_correct / max(nb_val_examples, 1)))

Epoch:  25%|█████████████████▊                                                     | 1/4 [1:24:57<4:14:51, 5097.24s/it]


	 - Train loss: 0.2618
	 - Validation Accuracy: 0.7116


Epoch:  50%|███████████████████████████████████▌                                   | 2/4 [2:35:07<2:32:30, 4575.34s/it]


	 - Train loss: 0.2230
	 - Validation Accuracy: 0.7222


Epoch:  75%|█████████████████████████████████████████████████████▎                 | 3/4 [3:43:44<1:12:46, 4366.23s/it]


	 - Train loss: 0.1942
	 - Validation Accuracy: 0.7168


Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 4/4 [4:52:21<00:00, 4385.42s/it]


	 - Train loss: 0.1616
	 - Validation Accuracy: 0.7117





## Hyperparameter Search

### Model 1
- 50,000 Samples
- 256 tokens
- 3 epochs 
- lr = 5e-5,
- eps = 1e-08
- Train loss: 0.2157
- Validation Accuracy: 0.6926

### Model 2

- 100,000 Samples
- 256 tokens
- 3 epochs 
- lr = 5e-5,
- eps = 1e-08
- Train loss: 0.2251
- Validation Accuracy: 0.6970

### Model 3

- 100,000 Samples
- 256 tokens
- 2 epochs 
- lr = 2e-5,
- eps = 1e-08
- Train loss: 0.2296
- Validation Accuracy: 0.7033

### Model 4
- DistilBERT
- 50,000 Samples
- 256 tokens
- 3 epochs 
- lr = 2e-5,
- eps = 1e-08
- Train loss: 0.1198
- Validation Accuracy: 0.6770

### Model 5
- DistilBERT
- 100,000 samples
- 256 tokens
- 3 epochs 
- lr = 2e-5,
- eps = 1e-08
- Train loss: 0.1949
- Validation Accuracy: 0.6957

### Model 6
- DistilBERT
- 100,000 samples
- 512 tokens
- 3 epochs 
- lr = 2e-5,
- eps = 1e-08
- Train loss: 0.1936
- Validation Accuracy: 0.7076

### Model 7
- DistilBERT
- 200,000 samples
- 512 tokens
- 4 epochs 
- lr = 2e-5,
- eps = 1e-08
- Train loss: 0.1616
- Validation Accuracy: 0.7117

In [19]:
# Classify one hand-written review with the fine-tuned model.
inputs = tokenizer("The waiters were really nice and great food.", return_tensors="pt")
# The BERT tokenizer emits token_type_ids, which DistilBERT's forward() does not
# accept. Popping with a default avoids a KeyError if the key is absent.
inputs.pop("token_type_ids", None)
# Run on CPU in evaluation mode so dropout is disabled regardless of what ran before.
model.cpu()
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

# Arg-max over the class dimension; class ids are zero-based, stars are one-based.
predicted_class_id = logits.argmax(dim=-1).item()
print(f"The number of stars: {predicted_class_id + 1}")

The number of stars: 5


In [18]:
# Inspect the tokenizer output for the example sentence.
print(inputs)

{'input_ids': tensor([[  101,  1996, 15610,  2015,  2020,  2428,  3835,  1998,  2307,  2833,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [45]:
import os

# Persist the fine-tuned weights and config. exist_ok=True lets this cell be
# re-run without raising FileExistsError when the directory is already there.
model_dir = "models/distil_200k_samples_512_tokens"
os.makedirs(model_dir, exist_ok=True)
model.save_pretrained(model_dir)

In [None]:
# class YelpRestaurantDataset(Dataset):
#     def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_length: int = 512):
#         self.data = data
#         self.tokenizer = tokenizer
#         self.max_token_length = max_token_length
    
#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index: int):
#         row = self.data.iloc[index]
#         input_text = row.text
#         labels = row.stars
#         encoding = tokenizer.encode_plus(
#                             input_text,
#                             add_special_tokens = True,
#                             padding = "max_length",
#                             max_length = self.max_token_length,
#                             truncation = True,
#                             return_attention_mask = True,
#                             return_tensors = 'pt'
#                        )
#         print(labels)
#         return dict(
#             input_text = input_text,
#             input_ids = encoding["input_ids"].flatten(),
#             attention_mask = encoding["attention_mask"].flatten(),
#             labels = torch.tensor(labels)
#         )

# dataset = YelpRestaurantDataset(df, tokenizer)

# Hugging Face Training (Not used for now)
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=5,
#     weight_decay=0.01,
# )

# trainer = Trainer(
#     model,
#     args,
#     train_dataset=encoded_dataset["train"],
#     eval_dataset=encoded_dataset[validation_key],
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

## TF-IDF approach

In [None]:
import re

# Matches http(s):// links and bare www. links up to the next whitespace.
# Compiled once here instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def remove_urls(text):
    """Return ``text`` with every http(s):// or www. URL deleted."""
    return _URL_PATTERN.sub(r'', text)


def preprocess_text(text, stem=False, lemmatize=True, remove_urls=True, stopwords=None):
    """Normalize a review into a space-joined string of cleaned tokens.

    Steps, in order: optional URL removal, lower-casing and stripping,
    punctuation removal, tokenization, optional stop-word filtering, optional
    stemming, optional lemmatization.

    Args:
        text: The raw text; it is converted with ``str()`` first.
        stem: If true, apply NLTK's Porter stemmer to every token.
        lemmatize: If true, apply NLTK's WordNet lemmatizer to every token.
        remove_urls: If true, delete URLs before any other cleaning. It now
            defaults to True so the function can be called with the text alone.
        stopwords: Optional collection of tokens to drop; a ``set`` gives the
            fastest membership tests. ``None`` disables filtering.

    Returns:
        The processed tokens joined by single spaces.
    """
    # NLTK is imported lazily so that defining these helpers does not require it.
    # NOTE(review): word_tokenize and the WordNet lemmatizer presumably need
    # their NLTK data packages downloaded beforehand; confirm in the environment.
    from nltk.tokenize import word_tokenize

    text = str(text)
    if remove_urls:
        # The boolean parameter shadows the module-level remove_urls() helper
        # inside this function, so the shared compiled pattern is used directly.
        text = _URL_PATTERN.sub('', text)

    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', text.lower().strip())

    ## Tokenize
    text_list = word_tokenize(text)

    ## remove Stopwords
    if stopwords is not None:
        text_list = [word for word in text_list if word not in stopwords]

    ## Stemming
    if stem:
        from nltk.stem import PorterStemmer
        ps = PorterStemmer()
        text_list = [ps.stem(word) for word in text_list]

    ## Lemmatisation
    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        lem = WordNetLemmatizer()
        text_list = [lem.lemmatize(word) for word in text_list]

    ## back to string from list
    return " ".join(text_list)

In [None]:
# Add a cleaned-text column by running preprocess_text on every review in df['text'].
df['preprocessed_text'] = df['text'].apply(preprocess_text)