In [1]:
import glob
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer

In [2]:
# Test sentence
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
input_sentence = "Hello my name is Jin"
encoded = tokenizer(input_sentence)
print('Input sentence: ', input_sentence, '\n')
print('Encoded: ', encoded, '\n')
print('Decoded: ', tokenizer.decode(encoded['input_ids']), '\n')


TypeError: 'BertTokenizer' object is not callable

In [None]:
# Get all review files
files = glob.glob(r'./Reviews/*[0-9].csv')
print(files)

In [None]:
# Concat all review data from different products into one big dataframe
df_list = []

for file in files:
    df = pd.read_csv(file)
    df_list.append(df)
    
df = pd.concat(df_list, axis=0, ignore_index=True)
df

In [None]:
# Check for missing values
df.isnull().any()

In [None]:
# Find reviews with missing comments
missing_indices = df[df['comment'].isnull()].index.tolist()
print('Number of reviews missing comments: ', len(missing_indices))
print('Missing indices: ', missing_indices)

In [None]:
# See what an example review with missing comment looks like
print(df.iloc[120116])

In [None]:
print('Max comment length (of all products): ', int(df.comment.str.len().max()))

In [None]:
# Get only the comments and star (labels) data
comment_df = df[['comment', 'stars']]
comment_df

In [None]:
# print first 5 reviews
for idx, row in comment_df[:5].iterrows():
    print(row['comment'] + '\n')

In [None]:
class ReviewsDataset:
    def __init__(self, df, max_length=1024):
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length 
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # input=review, label=rating
        review = self.df.loc[idx, 'comment']
        rating = int(self.df.loc[idx, 'stars'])
        
        encoded = self.tokenizer(
            review,                      # review to encode
            max_length=self.max_length,  # Truncate all segments to max_length
            padding='max_length',        # pad all reviews with the [PAD] token to the max_length
            return_attention_mask=True,  # Construct attention masks.
            return_tensors='pt'
        )
        
        input_ids = encoded['input_ids']
        attn_mask = encoded['attention_mask']
        
        return {
            'input_ids': input_ids, 
            'attn_mask': attn_mask, 
            'rating': rating
        }

In [None]:
data_set = ReviewsDataset(df, 1024)
data_set[0]