In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
import pyarrow.parquet as pq
from transformers import BertTokenizerFast, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm, trange

In [116]:
DATA_PATH = r"C:\Users\zoopy\code\review-assist\data\yelp_academic_dataset_review.json"
WRITE_PATH = r"C:\Users\zoopy\code\review-assist\data\yelp_academic_dataset_review_short.json"
FINAL_WRITE_PATH = r"C:\Users\zoopy\code\review-assist\data\yelp_restaurant_reviews.parquet"
FINAL_WRITE_PATH_TRAIN = r"C:\Users\zoopy\code\review-assist\data\yelp_restaurant_reviews_train.parquet"
FINAL_WRITE_PATH_TEST = r"C:\Users\zoopy\code\review-assist\data\yelp_restaurant_reviews_test.parquet"
ALREADY_DATA = True
ACTUALLY_TRAIN = True

In [118]:
if not ALREADY_DATA:
    with open(DATA_PATH, mode='r', encoding='utf8') as in_file, \
        open(WRITE_PATH, mode='w', encoding='utf8') as out_file:
            for i in range(300000):
                out_file.write(next(in_file))
    df = pd.read_json(WRITE_PATH, lines=True)
    print(df["stars"].value_counts())
    print("-"*50)
    print(df.count())

    df['text_length'] = df['text'].apply(len)
    sns.displot(df,x='text_length')
    
    df_train = df.loc[df['text_length'] <= 2500]
    df_train["stars"] -= 1
    
    sns.displot(df_train,x='text_length')
    train, test = train_test_split(df_train, test_size=0.1)
    train.to_parquet(FINAL_WRITE_PATH_TRAIN)
    test.to_parquet(FINAL_WRITE_PATH_TEST)
else:
    df_train = pd.read_parquet(FINAL_WRITE_PATH_TRAIN)
    df_test = pd.read_parquet(FINAL_WRITE_PATH_TEST)

In [119]:
if not ACTUALLY_TRAIN:
    df = df-train.sample(n=5000)

In [120]:
df = df_train.sample(n=5000)

In [127]:
def preprocess(input_text, tokenizer):
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        padding = "max_length",
                        max_length=64,
                        truncation=True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

# Initialize Bert Tokenizer
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True)

text = df.text.values
labels = df.stars.values

token_id = []
attention_masks = []

for sample in tqdm(text):
    encoding_dict = preprocess(sample, tokenizer)
    token_id.append(encoding_dict['input_ids']) attention_masks.append(encoding_dict['attention_mask'])

token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = F.one_hot(torch.tensor(labels), num_classes=5).to(torch.float)

100%|████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2338.43it/s]


In [128]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 5,
    output_attentions = False,
    output_hidden_states = False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [129]:
validation_ratio = 0.2
batch_size = 16

train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = 0.2,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [130]:
def compute_batch_accuracy(logits, labels):
    preds = np.argmax(logits, axis = 1).flatten()
    truth = np.argmax(labels, axis=1).flatten()
    return accuracy_score(truth, preds)


In [None]:
# Pytorch training
# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08)

# Run on GPU
model.cuda()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 3

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    train_loss = 0
    nb_train_examples, nb_train_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
    eval_loss = []
    
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        eval_loss.append(eval_output.loss)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        batch_accuracy = compute_batch_accuracy(logits, label_ids)
        val_accuracy.append(batch_accuracy)
    
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))

Epoch:  33%|█████████████████████████▋                                                   | 1/3 [00:31<01:02, 31.22s/it]


	 - Train loss: 0.1943
	 - Validation Accuracy: 0.5685


In [107]:
inputs = tokenizer("The waiters were really nice and great food.", return_tensors="pt")
model.cpu()
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()

In [108]:
predicted_class_id

4

In [None]:
# class YelpRestaurantDataset(Dataset):
#     def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_length: int = 512):
#         self.data = data
#         self.tokenizer = tokenizer
#         self.max_token_length = max_token_length
    
#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index: int):
#         row = self.data.iloc[index]
#         input_text = row.text
#         labels = row.stars
#         encoding = tokenizer.encode_plus(
#                             input_text,
#                             add_special_tokens = True,
#                             padding = "max_length",
#                             max_length = self.max_token_length,
#                             truncation = True,
#                             return_attention_mask = True,
#                             return_tensors = 'pt'
#                        )
#         print(labels)
#         return dict(
#             input_text = input_text,
#             input_ids = encoding["input_ids"].flatten(),
#             attention_mask = encoding["attention_mask"].flatten(),
#             labels = torch.tensor(labels)
#         )

# dataset = YelpRestaurantDataset(df, tokenizer)

# Hugging Face Training (Not used for now)
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=5,
#     weight_decay=0.01,
# )

# trainer = Trainer(
#     model,
#     args,
#     train_dataset=encoded_dataset["train"],
#     eval_dataset=encoded_dataset[validation_key],
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )