# Sentiment Analysis using Multilingual DistilBERT (distilled mBERT)

## Import required libraries

In [1]:
import pandas as pd
from IPython.display import display, HTML
from transformers import BertTokenizer

## Import the data

In [2]:
# Read the gzip-compressed CSV of translated tweets into a DataFrame.
translated_tweets_df = pd.read_csv(
    'data/translated_tweets.csv.gz',
    compression='gzip',
)

In [56]:
# heureka_reviews_df = pd.read_json('data/reviews.json.gz', compression='gzip')

In [57]:
# gpt4_reviews_df = pd.read_csv('data/GPT4_reviews.csv')

In [58]:
# gpt35_reviews_df = pd.read_csv('data/gpt_3.5_reviews.csv')

## EDA

In [63]:

def show_frame_overview(title, frame):
    """Display the first rows and the ``info()`` summary of a DataFrame.

    Args:
        title: Dataset name inserted into the two HTML headings.
        frame: DataFrame to summarise.
    """
    display(HTML(f"<h3>{title} DataFrame Head</h3>"))
    display(frame.head())

    display(HTML(f"<h3>{title} DataFrame Info</h3>"))
    # DataFrame.info() prints its report and returns None, so wrapping it in
    # display()/HTML() only adds an empty HTML object or a stray "None".
    frame.info()


# Heading -> name of the variable expected to hold that dataset. Frames are
# looked up by name because the cells that load the review datasets are
# commented out; referencing those variables directly raises NameError on a
# fresh kernel.
eda_frames = {
    "Translated Tweets": "translated_tweets_df",
    "Heureka Reviews": "heureka_reviews_df",
    "GPT-4 Reviews": "gpt4_reviews_df",
    "GPT-3.5 Reviews": "gpt35_reviews_df",
}

for title, variable_name in eda_frames.items():
    frame = globals().get(variable_name)
    if frame is None:
        # Report the gap instead of aborting the whole cell.
        print(f"Skipping {title}: `{variable_name}` is not loaded.")
        continue
    show_frame_overview(title, frame)

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,Je to tak smutné pre môjho APL priateľa..........
1,2,0,Sentiment140,Chýbal mi nový trailer...
2,3,1,Sentiment140,Omg je už 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. som bol u toh...
4,5,0,Sentiment140,"Myslím si, že mi BF podvádza na mňa!!!"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1138103 entries, 0 to 1138102
Data columns (total 4 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   ItemID           1138103 non-null  int64 
 1   Sentiment        1138103 non-null  int64 
 2   SentimentSource  1138103 non-null  object
 3   SentimentText    1138103 non-null  object
dtypes: int64(2), object(2)
memory usage: 34.7+ MB


<IPython.core.display.HTML object>

Unnamed: 0,review_text,sentiment
0,Som spokojná s tovarom aj rýchlosťou doručenia...,
1,"Kvalita ,rychlost",
2,Doposiaľ najrýchlejšie dodanie tovaru,1.0
3,Neskora donaśka,0.0
4,Dopravca s nemožnosťou platby kartou,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048277 entries, 0 to 3048276
Data columns (total 2 columns):
 #   Column       Dtype  
---  ------       -----  
 0   review_text  object 
 1   sentiment    float64
dtypes: float64(1), object(1)
memory usage: 46.5+ MB


<IPython.core.display.HTML object>

Unnamed: 0,review_id,review_text,sentiment
0,1,Tento produkt je úžasný a veľmi užitočný!,1
1,2,Som sklamaný z kvality tohto produktu.,0
2,3,"Výborne funguje, odporúčam!",1
3,4,"Nevydrží dlho, nie je to stojí za peniaze.",0
4,5,Excelentná kvalita za rozumnú cenu.,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1226 entries, 0 to 1225
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_id    1226 non-null   int64 
 1   review_text  1226 non-null   object
 2   sentiment    1226 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 28.9+ KB


None

Unnamed: 0,review_text,sentiment
0,Skvelý výkon a dizajn!,1
1,Slabá batéria,0
2,Veľmi praktický produkt.,1
3,Občasné spomalenia,0
4,Skvelá kvalita za tú cenu.,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4052 entries, 0 to 4051
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  4052 non-null   object
 1   sentiment    4052 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 63.4+ KB


None

In [3]:
# Keep only the label column and the tweet text; every other column is dropped.
# Note: this rebinds the name, so re-running the cell after the columns have
# been renamed raises KeyError.
translated_tweets_df = translated_tweets_df[['Sentiment', 'SentimentText']]

In [4]:
# Rename to the `labels` / `text` column names used by the tokenization and
# training cells. Reassigning the result (instead of `inplace=True`) avoids
# mutating a frame that was derived by column selection.
translated_tweets_df = translated_tweets_df.rename(
    columns={'Sentiment': 'labels', 'SentimentText': 'text'}
)

## Train DistilBERT

In [6]:
def compute_metrics(eval_pred):
    """Score predictions with accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: Pair unpacked as ``(logits, labels)``.

    Returns:
        dict: Keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        labels,
        predicted_classes,
        average='weighted',
    )
    accuracy = accuracy_score(labels, predicted_classes)
    return dict(
        accuracy=accuracy,
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

# Choose the compute device: Apple MPS first, then CUDA, otherwise the CPU.
# `is_built()` only reports that this PyTorch binary was compiled with MPS
# support; `is_available()` checks that the backend is usable on this machine,
# so it is the correct guard before creating an `mps` device.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')

# Work on a copy so the loaded frame is left untouched.
df = translated_tweets_df.copy()

# Normalise a lowercase `sentiment` column to `labels`; this is a no-op when
# the frame has no such column. Reassigned rather than mutated in place.
df = df.rename(columns={'sentiment': 'labels'})

# Stratified 80/20 split with a fixed seed so the split is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=42)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        Whatever ``tokenizer`` returns for those strings, requested with
        ``max_length`` padding and truncation enabled.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Convert each split to a Hugging Face Dataset and tokenize it in batches.
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

# Return torch tensors for exactly the columns listed below.
train_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)
test_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)




Map:   0%|          | 0/910482 [00:00<?, ? examples/s]

Map:   0%|          | 0/227621 [00:00<?, ? examples/s]

In [None]:
# Multilingual DistilBERT with a two-label sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=2)
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    # Was 2e-3, which is roughly 100x above the range normally used to
    # fine-tune pretrained transformers (about 2e-5 to 5e-5) and tends to make
    # the loss diverge or wipe out the pretrained weights.
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # compute_metrics=compute_metrics
)


In [14]:
# Start training with the current `trainer` object; the call's return value is
# shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Save the trainer's model and the tokenizer to separate directories
# (models/ and tokenizers/ respectively).
trainer.save_model("models/distilbert-slovak-sentiment")
tokenizer.save_pretrained("tokenizers/distilbert-slovak-sentiment")

In [5]:
from transformers import (
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DistilBertTokenizerFast
)
from datasets import Dataset, DatasetDict
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split
import torch
import numpy as np

# Pick the compute device: Apple MPS first, then CUDA, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Fast tokenizer and a two-label classification model from the multilingual
# DistilBERT checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=2,
)
model.to(device)  # move the weights onto the selected device
print(f"Model is using device: {model.device}")

# Stratified hold-out: 10% of the rows become the test split.
train_df, test_df = train_test_split(
    translated_tweets_df,
    test_size=0.1,
    stratify=translated_tweets_df['labels'],
    random_state=42,
)

# Keep a stratified 20% subsample of the training split; the rest is discarded.
train_df, _ = train_test_split(
    train_df,
    train_size=0.2,
    stratify=train_df['labels'],
    random_state=42,
)

# Convert DataFrame to Hugging Face Dataset
def df_to_dataset(df, tokenizer):
    """Turn a DataFrame with a ``text`` column into a tokenized Dataset.

    Args:
        df: Frame passed to ``Dataset.from_pandas``.
        tokenizer: Callable applied to each batch of ``text`` values.

    Returns:
        The dataset after a batched ``map`` (4 worker processes) that calls
        ``tokenizer`` with ``max_length`` padding and truncation at 512.
    """
    def _encode_batch(examples):
        return tokenizer(
            examples['text'],
            max_length=512,
            truncation=True,
            padding="max_length",
        )

    raw_dataset = Dataset.from_pandas(df)
    return raw_dataset.map(_encode_batch, batched=True, num_proc=4)

# Tokenize both splits with the shared tokenizer.
train_dataset = df_to_dataset(train_df, tokenizer)
test_dataset = df_to_dataset(test_df, tokenizer)

# Bundle the splits under the 'train' / 'test' keys read by the Trainer setup.
tokenized_dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

# Return torch tensors for the model inputs and the labels.
tokenized_dataset.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

# Define metrics for evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Pair unpacked as ``(logits, labels)``. Each item may be a
            NumPy array (what ``Trainer`` passes by default) or a torch tensor.

    Returns:
        dict: Keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    def _as_numpy(values):
        # Tensors are detached and copied to host memory. NumPy arrays have no
        # `.cpu()`, so calling it unconditionally raised AttributeError.
        if hasattr(values, "cpu"):
            return values.detach().cpu().numpy()
        return values

    logits, labels = eval_pred
    logits = _as_numpy(logits)
    labels = _as_numpy(labels)

    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Trainer configuration; tune `batch_size` to what the device can hold.
batch_size = 16
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching and data loading
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=4,
    # Logging, evaluation and checkpoint cadence
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Half precision is enabled only when CUDA is present
    fp16=torch.cuda.is_available(),
)

# Wire the model, arguments, tokenized splits and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

# Train the model and evaluate
# trainer.train()
# trainer.save_model("./best_model")
# results = trainer.evaluate()
# print(f"Validation Results: {results}")

Using device: mps


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is using device: mps:0


Map (num_proc=4):   0%|          | 0/182096 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/227621 [00:00<?, ? examples/s]

In [6]:
# Train with the current `trainer`, write its model to ./best_model, then run
# evaluation and print the returned metrics.
trainer.train()
trainer.save_model("./best_model")
results = trainer.evaluate()
print(f"Validation Results: {results}")

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 