In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
import pyarrow.parquet as pq
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
DATA_PATH = r"C:\Users\zoopy\code\review-assist\data\yelp_academic_dataset_review.json"
WRITE_PATH = r"C:\Users\zoopy\code\review-assist\data\yelp_academic_dataset_review_short.json"
FINAL_WRITE_PATH = r"C:\Users\zoopy\code\review-assist\data\yelp_restaurant_reviews.parquet"
ALREADY_DATA = True

In [3]:
if not ALREADY_DATA:
    with open(DATA_PATH, mode='r', encoding='utf8') as in_file, \
        open(WRITE_PATH, mode='w', encoding='utf8') as out_file:
            for i in range(250000):
                out_file.write(next(in_file))
    df = pd.read_json(WRITE_PATH, lines=True)
    print(df["stars"].value_counts())
    print("-"*50)
    print(df.count())

    df['text_length'] = df['text'].apply(len)
    sns.displot(df,x='text_length')

    df = df.loc[df['text_length'] <= 2500]

    sns.displot(df,x='text_length')
    df.to_parquet(FINAL_WRITE_PATH)
else:
    df = pd.read_parquet(FINAL_WRITE_PATH)

In [4]:
def preprocess(input_text, tokenizer):
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        padding = True,
                        max_length=512,
                        truncation=True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

In [9]:
text = df.text.values
labels = df.stars.values

In [10]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True)

In [11]:
token_id = []
attention_masks = []

for sample in tqdm(text):
  encoding_dict = preprocess(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])

100%|█████████████████████████████████████████████████████████████████████████| 247444/247444 [06:05<00:00, 677.27it/s]


In [46]:
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 118 but got size 186 for tensor number 1 in the list.

In [40]:
# Not used for now
# class YelpRestaurantDataset(Dataset):
#     def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_length: int = 512):
#         self.data = data
#         self.tokenizer = tokenizer
#         self.max_token_length = max_token_length
    
#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index: int):
#         row = self.data.iloc[index]
#         input_text = row.text
#         labels = row.stars
#         encoding = tokenizer.encode_plus(
#                             input_text,
#                             add_special_tokens = True,
#                             padding = "max_length",
#                             max_length = self.max_token_length,
#                             truncation = True,
#                             return_attention_mask = True,
#                             return_tensors = 'pt'
#                        )
#         print(labels)
#         return dict(
#             input_text = input_text,
#             input_ids = encoding["input_ids"].flatten(),
#             attention_mask = encoding["attention_mask"].flatten(),
#             labels = torch.tensor(labels)
#         )

# dataset = YelpRestaurantDataset(df, tokenizer)

In [38]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 5,
    output_attentions = False,
    output_hidden_states = False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
def train_val_dataset(dataset, val_split=0.25):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=val_split)
    datasets = {}
    datasets['train'] = Subset(dataset, train_idx)
    datasets['val'] = Subset(dataset, val_idx)
    return datasets

In [43]:
train_idx

array([ 94649, 225431,   9729, ..., 206943,  86079, 159025])

In [45]:
validation_ratio = 0.2
batch_size = 32

train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = 0.2,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)