In [14]:
import torch
import evaluate
import accelerate
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datasets import Dataset


Preparing the dataset

In [2]:
df = pd.read_csv('fake_reviews_dataset.csv')
df

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...


In [3]:
df.describe(include='object')

Unnamed: 0,category,label,text_
count,40432,40432,40432
unique,10,2,40412
top,Kindle_Store_5,CG,Easy to put together and looks nice and the fi...
freq,4730,20216,2


In [4]:
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['category', 'rating', 'label', 'text_'],
    num_rows: 40432
})

Using a DistilBERT tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")



In [6]:
def tokenize_data(x):
    return tokenizer(x['text_'])

In [7]:
tokenized_data = ds.map(tokenize_data, batched=True)
tokenized_data


Map:   0%|          | 0/40432 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['category', 'rating', 'label', 'text_', 'input_ids', 'attention_mask'],
    num_rows: 40432
})

Make batches of the data

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Adding a metric to evaluate the model

In [15]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)