# Evaluation of DistilBERT and LSTM models for sentiment analysis

## Import required libraries

In [1]:
import pandas as pd
from tqdm import tqdm
from prettytable import PrettyTable
import numpy as np
import torch
from IPython.display import display, HTML
from transformers import BertTokenizer
from transformers import (
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DistilBertTokenizerFast
)
from datasets import Dataset, DatasetDict
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import pickle

## Import data

In [5]:
# Load the three raw corpora used throughout the notebook.
# The tweet and Heureka files are gzip-compressed; the GPT-3.5 file is plain CSV.
translated_tweets_df = pd.read_csv(
    'data/translated_tweets.csv.gz',
    compression='gzip',
)
heureka_reviews_df = pd.read_json(
    'data/reviews.json.gz',
    compression='gzip',
)
gpt35_reviews_df = pd.read_csv('data/gpt_3.5_reviews.csv')

## Prepare the data

In [6]:
# Keep only the label/text columns and give them the shared column names
# ('labels', 'text') used by every dataset in this notebook.
# Selecting and renaming in a single chained expression produces a fresh
# DataFrame, so no in-place mutation is performed on a column slice.
translated_tweets_df = (
    translated_tweets_df[['Sentiment', 'SentimentText']]
    .rename(columns={'Sentiment': 'labels', 'SentimentText': 'text'})
)

In [7]:
# Clean the Heureka reviews and bring them to the shared ('text', 'labels') schema.
# Every step returns a new DataFrame, so nothing is assigned into a filtered slice.

# Drop rows with any missing value.
heureka_reviews_df = heureka_reviews_df.dropna()

# Lower-case the review text and strip everything that is neither a word
# character nor whitespace.
heureka_reviews_df = heureka_reviews_df.assign(
    review_text=heureka_reviews_df['review_text']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

heureka_reviews_df = (
    # Reviews that became empty after stripping carry no signal.
    heureka_reviews_df.loc[heureka_reviews_df['review_text'] != '']
    .reset_index(drop=True)
    .rename(columns={'sentiment': 'labels', 'review_text': 'text'})
    # Cast straight to int64; the former intermediate int8 cast was redundant.
    .astype({'labels': 'int64'})
)

In [8]:
# Rename to the shared ('labels', 'text') schema, then discard incomplete rows.
gpt35_reviews_df = (
    gpt35_reviews_df
    .rename(columns={'sentiment': 'labels', 'review_text': 'text'})
    .dropna()
)

In [11]:
# Stack the text/label columns of all three corpora into a single frame
# with a fresh 0..n-1 index.
shared_columns = ['text', 'labels']
combined_reviews_df = pd.concat(
    [
        frame[shared_columns]
        for frame in (translated_tweets_df, heureka_reviews_df, gpt35_reviews_df)
    ],
    ignore_index=True,
)

# The per-source frames are not used again; drop the references.
del translated_tweets_df, heureka_reviews_df, gpt35_reviews_df

## DistilBERT

### Functions for testing DistilBERT

In [12]:
# Convert DataFrame to Hugging Face Dataset and tokenize
def df_to_dataset(df, tokenizer, max_length=512, num_proc=4):
    """Convert a DataFrame into a tokenized Hugging Face ``Dataset``.

    Args:
        df: DataFrame containing a ``text`` column; it is converted with
            ``Dataset.from_pandas``.
        tokenizer: Callable applied to batches of ``text`` values; it is
            invoked with ``padding="max_length"`` and ``truncation=True``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 512, the value that used to be hard-coded.
        num_proc: Number of worker processes passed to ``Dataset.map``.
            Defaults to 4, the value that used to be hard-coded.

    Returns:
        The result of mapping the tokenizer over the dataset in batches.
    """
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    dataset = Dataset.from_pandas(df)
    return dataset.map(tokenize_function, batched=True, num_proc=num_proc)

# Define evaluation function
def evaluate(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect predictions and labels.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``
            whose output exposes a ``logits`` attribute. It is switched to
            eval mode and is not moved to ``device`` here.
        dataloader: Iterable of batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the model inputs are moved to before the forward pass.

    Returns:
        Tuple ``(predictions, labels)`` of NumPy arrays, where predictions
        are the argmax of the logits over the last dimension.
    """
    model.eval()
    all_predictions, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            all_predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            # Labels are only compared on the host, so they are not copied
            # to the device and back; .cpu() covers already-on-device input.
            all_labels.extend(batch['labels'].cpu().numpy())
    return np.array(all_predictions), np.array(all_labels)

### Test DistilBERT model trained on GPT-3.5 generated reviews

In [14]:
# Check if a GPU is available and set the device accordingly
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the saved model
model_dir = "./data/gpt35_reviews_best_model"
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
print(f"Model loaded from {model_dir} and using device: {model.device}")

# Split the DataFrame into train and test sets; only the held-out 5% is
# evaluated in this cell.
train_df, test_df = train_test_split(combined_reviews_df, test_size=0.05, stratify=combined_reviews_df['labels'], random_state=42)

test_dataset = df_to_dataset(test_df, tokenizer)
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Create DataLoader for the test dataset
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Perform evaluation
predictions, labels = evaluate(model, test_dataloader, device)

# Compute metrics.
# precision_recall_fscore_support returns four values
# (precision, recall, f-score, support); all four must be unpacked.
precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
accuracy = accuracy_score(labels, predictions)

# Create a DataFrame for results
results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall]
})

# Print the results in a nice table format
table = PrettyTable()
table.field_names = ["Info", "Details"]
table.add_row(["Model", "distilbert-base-multilingual-cased"])
# The checkpoint loaded above was trained on the GPT-3.5 reviews.
table.add_row(["Trained on", "gpt35_reviews"])
table.add_row(["Tested on", "combined_reviews (0.05)"])
for metric, value in zip(results_df['Metric'], results_df['Value']):
    table.add_row([metric, value])

print(table)

# If you just want to print predictions:
print(f"Predictions: {predictions}")

Using device: mps




Model loaded from ./data/gpt35_reviews_best_model and using device: mps:0


Map (num_proc=4):   0%|          | 0/3104 [00:00<?, ? examples/s]

Evaluating: 100%|█████████████████████████████| 194/194 [01:04<00:00,  2.98it/s]

+-----------+------------------------------------+
|    Info   |              Details               |
+-----------+------------------------------------+
|   Model   | distilbert-base-multilingual-cased |
| Tested on |      combined_reviews (0.001)      |
|  Accuracy |         0.7419458762886598         |
|  F1 Score |         0.746779761550987          |
| Precision |         0.7589421293817251         |
|   Recall  |         0.7419458762886598         |
+-----------+------------------------------------+
Predictions: [0 1 1 ... 1 0 1]





### Test DistilBERT model trained on translated tweets

In [None]:
# Check if a GPU is available and set the device accordingly
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Base checkpoint name, used for the tokenizer and the report
model_name = "distilbert-base-multilingual-cased"

# Load the saved model
model_dir = "./data/translated_tweets_best_model"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
print(f"Model loaded from {model_dir} and using device: {model.device}")

# Split the DataFrame into train and test sets; only the held-out 5% is
# evaluated in this cell.
train_df, test_df = train_test_split(combined_reviews_df, test_size=0.05, stratify=combined_reviews_df['labels'], random_state=42)

test_dataset = df_to_dataset(test_df, tokenizer)
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Create DataLoader for the test dataset
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Perform evaluation
predictions, labels = evaluate(model, test_dataloader, device)

# Compute metrics
precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
accuracy = accuracy_score(labels, predictions)

# Create a DataFrame for results (recall is reported alongside the others)
results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall]
})

# Print the results in a nice table format
table = PrettyTable()
table.field_names = ["Info", "Details"]
table.add_row(["Model", model_name])
table.add_row(["Trained on", "translated_tweets"])
# The test split above is drawn from combined_reviews_df, not from the tweets alone.
table.add_row(["Tested on", "combined_reviews (0.05)"])
for metric, value in zip(results_df['Metric'], results_df['Value']):
    table.add_row([metric, value])

print(table)

# If you just want to print predictions:
print(f"Predictions: {predictions}")

### Test DistilBERT model trained on heureka reviews

In [None]:
# Check if a GPU is available and set the device accordingly
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the saved model
# NOTE(review): this cell previously loaded "./data/translated_tweets_best_model",
# so it reported the tweet model's metrics under the Heureka label. The path
# below follows the naming of the sibling model directories; confirm it
# matches the directory the Heureka model was actually saved to.
model_dir = "./data/heureka_reviews_best_model"
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.to(device)
print(f"Model loaded from {model_dir} and using device: {model.device}")

# Split the DataFrame into train and test sets; only the held-out 5% is
# evaluated in this cell.
train_df, test_df = train_test_split(combined_reviews_df, test_size=0.05, stratify=combined_reviews_df['labels'], random_state=42)

test_dataset = df_to_dataset(test_df, tokenizer)
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Create DataLoader for the test dataset
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Perform evaluation
predictions, labels = evaluate(model, test_dataloader, device)

# Compute metrics
precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
accuracy = accuracy_score(labels, predictions)

# Create a DataFrame for results (recall is reported alongside the others)
results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall]
})

# Print the results in a nice table format
table = PrettyTable()
table.field_names = ["Info", "Details"]
table.add_row(["Model", "distilbert-base-multilingual-cased"])
table.add_row(["Trained on", "heureka_reviews"])
table.add_row(["Tested on", "combined_reviews (0.05)"])
for metric, value in zip(results_df['Metric'], results_df['Value']):
    table.add_row([metric, value])

print(table)

# If you just want to print predictions:
print(f"Predictions: {predictions}")

## LSTM

### Function for evaluating LSTM model

In [None]:
# Evaluate the model
def evaluate(model, X_test, Y_test):
    """Score a classifier on a held-out set.

    NOTE: this rebinds the name ``evaluate`` that the DistilBERT section
    defined earlier with a different signature; re-run that definition
    before re-running the DistilBERT cells.

    Args:
        model: Object whose ``predict(X, batch_size=...)`` returns one row
            of per-class scores for each sample.
        X_test: Inputs passed to ``model.predict``.
        Y_test: True class labels for ``X_test``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and
        F1 are weighted averages over the classes.
    """
    class_scores = model.predict(X_test, batch_size=16)
    # NOTE(review): argmax over axis 1 assumes one output unit per class;
    # confirm the saved models do not end in a single sigmoid unit.
    predicted_labels = np.argmax(class_scores, axis=1)
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        Y_test, predicted_labels, average='weighted'
    )
    overall_accuracy = accuracy_score(Y_test, predicted_labels)
    return overall_accuracy, weighted_precision, weighted_recall, weighted_f1


### Test LSTM model trained on GPT-3.5 generated reviews

In [None]:
# Load the tokenizer
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("lstm_gpt_tokenizer.pickle", "rb") as f:
    tokenizer = pickle.load(f)

# Maximum number of features
max_features = 5000

# Prepare the data
X = tokenizer.texts_to_sequences(combined_reviews_df['text'].values)
# NOTE(review): max_features reads like a vocabulary size but is used here as
# the padded sequence length; confirm it matches the length used in training.
X = pad_sequences(X, maxlen=max_features)
Y = combined_reviews_df['labels'].values

# Stratify on the labels, as the DistilBERT cells do, so the held-out set keeps
# the overall class balance and the two model families are compared fairly.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05, stratify=Y, random_state=42)

# Load the LSTM model
model = tf.keras.models.load_model('lstm_gpt.keras')

# Perform evaluation
accuracy, precision, recall, f1 = evaluate(model, X_test, Y_test)

# Create a DataFrame for results (recall is reported alongside the others)
results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall]
})

# Print the results in a nice table format
table = PrettyTable()
table.field_names = ["Info", "Details"]
table.add_row(["Model", "LSTM"])
table.add_row(["Trained on", "gpt35_reviews"])
table.add_row(["Tested on", "combined_reviews (0.05)"])
for metric, value in zip(results_df['Metric'], results_df['Value']):
    table.add_row([metric, value])

print(table)

# evaluate() returns only the metrics, so this cell has no predictions of its
# own to print; the former print reused a variable left over from other cells.

2024-06-24 21:43:40.624178: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-06-24 21:43:40.624356: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-06-24 21:43:40.624376: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-06-24 21:43:40.624424: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-24 21:43:40.624468: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Test LSTM model trained on translated tweets

In [None]:
# Load the tokenizer
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("tweets_lstm_tokenizer.pickle", "rb") as f:
    tokenizer = pickle.load(f)

# Maximum number of features
max_features = 5000

# Prepare the data
X = tokenizer.texts_to_sequences(combined_reviews_df['text'].values)
# NOTE(review): max_features reads like a vocabulary size but is used here as
# the padded sequence length; confirm it matches the length used in training.
X = pad_sequences(X, maxlen=max_features)
Y = combined_reviews_df['labels'].values

# Stratify on the labels, as the DistilBERT cells do, so the held-out set keeps
# the overall class balance and the two model families are compared fairly.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05, stratify=Y, random_state=42)

# Load the LSTM model
model = tf.keras.models.load_model('trained_lstm.keras')

# Perform evaluation
accuracy, precision, recall, f1 = evaluate(model, X_test, Y_test)

# Create a DataFrame for results (recall is reported alongside the others)
results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall]
})

# Print the results in a nice table format
table = PrettyTable()
table.field_names = ["Info", "Details"]
table.add_row(["Model", "LSTM"])
table.add_row(["Trained on", "translated_tweets"])
table.add_row(["Tested on", "combined_reviews (0.05)"])
for metric, value in zip(results_df['Metric'], results_df['Value']):
    table.add_row([metric, value])

print(table)

# evaluate() returns only the metrics, so this cell has no predictions of its
# own to print; the former print reused a variable left over from other cells.

### Test LSTM model trained on heureka reviews

In [None]:
# Load the tokenizer
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("lstm_heureka_tokenizer.pickle", "rb") as f:
    tokenizer = pickle.load(f)

# Maximum number of features
max_features = 5000

# Prepare the data
X = tokenizer.texts_to_sequences(combined_reviews_df['text'].values)
# NOTE(review): max_features reads like a vocabulary size but is used here as
# the padded sequence length; confirm it matches the length used in training.
X = pad_sequences(X, maxlen=max_features)
Y = combined_reviews_df['labels'].values

# Stratify on the labels, as the DistilBERT cells do, so the held-out set keeps
# the overall class balance and the two model families are compared fairly.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05, stratify=Y, random_state=42)

# Load the LSTM model
# NOTE(review): "heaureka" looks like a misspelling of "heureka" (compare the
# tokenizer file name above); the path is left as-is because it must match
# the file on disk. Confirm and rename both together if it is a typo.
model = tf.keras.models.load_model('lstm_heaureka.keras')

# Perform evaluation
accuracy, precision, recall, f1 = evaluate(model, X_test, Y_test)

# Create a DataFrame for results (recall is reported alongside the others)
results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score', 'Precision', 'Recall'],
    'Value': [accuracy, f1, precision, recall]
})

# Print the results in a nice table format
table = PrettyTable()
table.field_names = ["Info", "Details"]
table.add_row(["Model", "LSTM"])
table.add_row(["Trained on", "heureka_reviews"])
table.add_row(["Tested on", "combined_reviews (0.05)"])
for metric, value in zip(results_df['Metric'], results_df['Value']):
    table.add_row([metric, value])

print(table)

# evaluate() returns only the metrics, so this cell has no predictions of its
# own to print; the former print reused a variable left over from other cells.