Process to follow

Step 1: Run a field taste with OvR, OvO with TFIDF, then with Word2vec, BERT and then also Truncated SVD. Choose the best performing one for tuning.

Step 2: Do same for XGBoost

Step 3: NEURAL NETWORKS

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import optuna
import xgboost as xgb
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier
from gensim.models import Word2Vec
from xgboost import XGBClassifier
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.preprocessing import LabelEncoder

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Load your Excel data into a pandas DataFrame
df = pd.read_csv('Review.csv')

# Select only the 'review' and 'rating' columns
df_filtered = df[['Review', 'Rating']]

df_filtered = df_filtered[df_filtered['Rating'] != 0]

# Removing nan values
df_filtered.dropna(inplace=True)

# Display the first few rows of the final DataFrame
print(df_filtered.head())

                                              Review  Rating
0  I love this car.\nGas mileage, suspension, and...       5
1  I purchased my 2013 ILX from the dealer used w...       5
2  I recently purchased a 2013 ILX with the Tech ...       4
3  We bought our ILX used and have been incredibl...       4
4  In April of 2015 we were in need of another ca...       5


In [5]:
def review_cleaner(review):
  stopwords = nltk.corpus.stopwords.words("english")
  porter = PorterStemmer()
  # Make sure the reviews are not case sensitive
  review = review.lower()
  # Tokenize the words from the review
  words = nltk.word_tokenize(review)
  # Stemming and stopwords removal
  processed_words = [porter.stem(word) for word in words if word not in stopwords]
  # Join back to a single string
  return ' '.join(processed_words)

In [6]:
# Since Rating 1 has the least frequency, we use the same number for all other classes.
val = df_filtered['Rating'].value_counts()[1]

# Balancing class frequencies
df_rating_1 = df_filtered[df_filtered['Rating'] == 1].sample(n=val, random_state=1)
df_rating_2 = df_filtered[df_filtered['Rating'] == 2].sample(n=val, random_state=1)
df_rating_3 = df_filtered[df_filtered['Rating'] == 3].sample(n=val, random_state=1)
df_rating_4 = df_filtered[df_filtered['Rating'] == 4].sample(n=val, random_state=1)
df_rating_5 = df_filtered[df_filtered['Rating'] == 5].sample(n=val, random_state=1)

# Combine the samples into a single DataFrame
balanced_df = pd.concat([df_rating_1, df_rating_2, df_rating_3, df_rating_4, df_rating_5])

# Reset the index for neatness
balanced_df.reset_index(drop=True, inplace=True)

# Removing nan values
balanced_df.dropna(inplace=True)

# Display the first few rows of the final DataFrame
print(balanced_df.head())

                                              Review  Rating
0  Our 2008 Town & Country shuts off while drivin...       1
1  I purchased this new in 2012 and paid cash for...       1
2  Update:  12/28/2019 - GPS/INFOTAINMENT SCREEN ...       1
3  I thought I was getting a good deal. A mint fu...       1
4  I have had a rattle in my new VW atlas after t...       1


In [7]:
balanced_df.shape

(54950, 2)

#**Algorithms with T-IDF Vectors**

In [8]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(balanced_df['Review'], balanced_df['Rating'], stratify =balanced_df['Rating'], shuffle = True, test_size=0.2, random_state=3)
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_train_1, y_train_1, stratify = y_train_1, shuffle = True, test_size=0.25, random_state=3)

tfidf2 = TfidfVectorizer(preprocessor=review_cleaner, max_features = 7000, ngram_range = (1, 2))
X_train_tfidf_2 = tfidf2.fit_transform(X_train_1)
X_test_tfidf_2 = tfidf2.transform(X_test_1)

In [9]:
lr = OneVsRestClassifier(LogisticRegression(max_iter=200, class_weight='balanced', solver='saga', penalty = 'elasticnet', C = 2, l1_ratio = 0.22405730364701912))
lr.fit(X_train_tfidf_2, y_train_1)
y_pred_lr = lr.predict(X_test_tfidf_2)
y_pred_probs_lr = lr.predict_proba(X_test_tfidf_2)

print(roc_auc_score(y_test_1, y_pred_probs_lr, multi_class='ovr'))
print(roc_auc_score(y_test_1, y_pred_probs_lr, average='macro', multi_class='ovr'))
print(classification_report(y_test_1, y_pred_lr))


0.8055264277807354
0.8055264277807354
              precision    recall  f1-score   support

           1       0.52      0.57      0.55      2198
           2       0.37      0.35      0.36      2198
           3       0.39      0.37      0.38      2198
           4       0.46      0.44      0.45      2198
           5       0.55      0.59      0.57      2198

    accuracy                           0.46     10990
   macro avg       0.46      0.46      0.46     10990
weighted avg       0.46      0.46      0.46     10990



In [10]:
svc = OneVsOneClassifier(SVC(class_weight = 'balanced', max_iter = 1000))
svc.fit(X_train_tfidf_2, y_train_1)
y_pred_svc = svc.predict(X_test_tfidf_2)



In [11]:
print(classification_report(y_test_1, y_pred_svc))

              precision    recall  f1-score   support

           1       0.45      0.55      0.49      2198
           2       0.34      0.27      0.30      2198
           3       0.33      0.30      0.31      2198
           4       0.40      0.35      0.37      2198
           5       0.49      0.60      0.54      2198

    accuracy                           0.41     10990
   macro avg       0.40      0.41      0.41     10990
weighted avg       0.40      0.41      0.41     10990



In [12]:
xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

le = LabelEncoder()
y_train_xgb = le.fit_transform(y_train_1)

xgb_model.fit(X_train_tfidf_2, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test_tfidf_2)
y_pred_xgb = le.inverse_transform(y_pred_xgb)
print(classification_report(y_test_1, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           1       0.52      0.58      0.55      2198
           2       0.38      0.37      0.37      2198
           3       0.39      0.32      0.36      2198
           4       0.45      0.41      0.43      2198
           5       0.52      0.61      0.56      2198

    accuracy                           0.46     10990
   macro avg       0.45      0.46      0.45     10990
weighted avg       0.45      0.46      0.45     10990



#**Algorithms with Word2Vec Embeddings**

In [13]:
# Tokenize your reviews for Word2Vec training
def tokenize_reviews(reviews):
    return [nltk.word_tokenize(review.lower()) for review in reviews]

# Tokenize the reviews
tokenized_reviews = tokenize_reviews(balanced_df['Review'])

# Train a Word2Vec model on your dataset
word2vec_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=300,   # Embedding size
    window=5,          # Context window size
    min_count=2,       # Ignore words with frequency < 2
    sg=1,              # Skip-gram model
    epochs=10          # Training iterations
)

# Save the model for later use
word2vec_model.save("custom_word2vec.model")

# Generate averaged Word2Vec embeddings for each review
def get_avg_word2vec_embeddings(reviews, model):
    embeddings = []
    for review in tokenize_reviews(reviews):
        vectors = [model.wv[word] for word in review if word in model.wv]
        if vectors:
            avg_vector = np.mean(vectors, axis=0)
        else:
            avg_vector = np.zeros(model.vector_size)
        embeddings.append(avg_vector)
    return np.array(embeddings)

# Generate embeddings for train and test
X_train_word2vec = get_avg_word2vec_embeddings(X_train_1, word2vec_model)
X_test_word2vec = get_avg_word2vec_embeddings(X_test_1, word2vec_model)

In [14]:
lr_w2v = OneVsRestClassifier(LogisticRegression(max_iter=200, class_weight='balanced', solver='saga', penalty = 'elasticnet', C = 1, l1_ratio = 0.2))
lr_w2v.fit(X_train_word2vec, y_train_1)
y_pred_lrw2v = lr_w2v.predict(X_test_word2vec)
y_pred_probs_lrw2v = lr_w2v.predict_proba(X_test_word2vec)

print(roc_auc_score(y_test_1, y_pred_probs_lrw2v, multi_class='ovr'))
print(roc_auc_score(y_test_1, y_pred_probs_lrw2v, average='macro', multi_class='ovr'))
print(classification_report(y_test_1, y_pred_lrw2v))

0.8018057920965458
0.8018057920965458
              precision    recall  f1-score   support

           1       0.51      0.62      0.56      2198
           2       0.38      0.33      0.35      2198
           3       0.39      0.35      0.37      2198
           4       0.45      0.44      0.45      2198
           5       0.57      0.60      0.59      2198

    accuracy                           0.47     10990
   macro avg       0.46      0.47      0.46     10990
weighted avg       0.46      0.47      0.46     10990



In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_word2vec)
X_test_scaled = scaler.transform(X_test_word2vec)

svc_w2v = OneVsOneClassifier(SVC(class_weight = 'balanced', max_iter = 3000, C = 1))
svc_w2v.fit(X_train_scaled, y_train_1)
y_pred_svcw2v = svc_w2v.predict(X_test_scaled)
print(classification_report(y_test_1, y_pred_svcw2v))



              precision    recall  f1-score   support

           1       0.55      0.46      0.50      2198
           2       0.37      0.36      0.36      2198
           3       0.38      0.46      0.42      2198
           4       0.42      0.41      0.42      2198
           5       0.54      0.56      0.55      2198

    accuracy                           0.45     10990
   macro avg       0.45      0.45      0.45     10990
weighted avg       0.45      0.45      0.45     10990



In [16]:
xgb_w2v = XGBClassifier(
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

xgb_w2v.fit(X_train_word2vec, y_train_xgb)
y_pred_xgbw2v = xgb_w2v.predict(X_test_word2vec)
y_pred_xgbw2v = le.inverse_transform(y_pred_xgbw2v)
print(classification_report(y_test_1, y_pred_xgbw2v))

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           1       0.52      0.58      0.55      2198
           2       0.39      0.37      0.38      2198
           3       0.36      0.33      0.35      2198
           4       0.44      0.44      0.44      2198
           5       0.56      0.55      0.56      2198

    accuracy                           0.46     10990
   macro avg       0.45      0.46      0.45     10990
weighted avg       0.45      0.46      0.45     10990



#**Alogorithms with BERT Embeddings**

In [None]:
# Install required libraries
!pip install transformers onnxruntime onnx

# Import necessary libraries
import torch
import onnxruntime as ort


from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertModel, pipeline

Collecting onnxruntime
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting onnx
  Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 

In [None]:
# Check GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load your data (replace 'data.csv' with your dataset path)
# Assuming the DataFrame has 'reviews' (text) and 'ratings' (integer between 1 and 5)
new_df = pd.read_csv('Review.csv')  # Replace with your data file
new_df = new_df.dropna()  # Ensure no missing values

# Train-test split
X = new_df['Review'].tolist()
y = new_df['Rating'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y to one-hot for One-vs-Rest classifier
lb = LabelBinarizer()
y_train_ovr = lb.fit_transform(y_train)
y_test_ovr = lb.transform(y_test)

# Load BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name).to(device)

# Function to generate BERT embeddings
def get_bert_embeddings(texts, batch_size=512):
    """Generate BERT embeddings for a list of texts."""
    bert_model.eval()
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
        with torch.no_grad():
            outputs = bert_model(**tokens)
        # Use [CLS] token embeddings
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(cls_embeddings)
    return np.array(embeddings)

# Generate embeddings for training and testing
print("Generating BERT embeddings for training data...")
X_train_embeddings = get_bert_embeddings(X_train)

print("Generating BERT embeddings for testing data...")
X_test_embeddings = get_bert_embeddings(X_test)

# Logistic Regression (One-vs-Rest)
print("Training One-vs-Rest Logistic Regression...")
ovr_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))
ovr_clf.fit(X_train_embeddings, y_train_ovr)

# Predict and evaluate Logistic Regression
y_pred_ovr = ovr_clf.predict(X_test_embeddings)
print("Logistic Regression Evaluation:")
print(classification_report(lb.inverse_transform(y_test_ovr), lb.inverse_transform(y_pred_ovr)))

# XGBoost Classifier
print("Training XGBoost Classifier...")
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

le = LabelEncoder()
y_train_xgb = le.fit_transform(y_train)

xgb_clf.fit(X_train_embeddings, y_train_xgb)

y_pred_xgb = xgb_clf.predict(X_test_embeddings)
y_pred_xgb = le.inverse_transform(y_pred_xgb)

print("XGBoost Evaluation:")
print(classification_report(y_test, y_pred_xgb))

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating BERT embeddings for training data...
Generating BERT embeddings for testing data...
Training One-vs-Rest Logistic Regression...
Logistic Regression Evaluation:
              precision    recall  f1-score   support

           1       0.11      0.86      0.19      2250
           2       0.38      0.06      0.10      3538
           3       0.45      0.07      0.12      6296
           4       0.63      0.68      0.65     28688
           5       0.65      0.30      0.41     19005

    accuracy                           0.47     59777
   macro avg       0.44      0.39      0.30     59777
weighted avg       0.58      0.47      0.47     59777

Training XGBoost Classifier...


Parameters: { "use_label_encoder" } are not used.



XGBoost Evaluation:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.42      0.27      0.33      2249
           2       0.33      0.23      0.27      3538
           3       0.37      0.28      0.32      6296
           4       0.59      0.77      0.67     28688
           5       0.58      0.41      0.48     19005

    accuracy                           0.55     59777
   macro avg       0.38      0.33      0.35     59777
weighted avg       0.54      0.55      0.54     59777



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Train-test split
X = balanced_df['Review'].tolist()
y = balanced_df['Rating'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y to one-hot for One-vs-Rest classifier
lb = LabelBinarizer()
y_train_ovr = lb.fit_transform(y_train)
y_test_ovr = lb.transform(y_test)

# Load BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name).to(device)

# Function to generate BERT embeddings
def get_bert_embeddings(texts, batch_size=512):
    """Generate BERT embeddings for a list of texts."""
    bert_model.eval()
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
        with torch.no_grad():
            outputs = bert_model(**tokens)
        # Use [CLS] token embeddings
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(cls_embeddings)
    return np.array(embeddings)

# Generate embeddings for training and testing
print("Generating BERT embeddings for training data...")
X_train_embeddings = get_bert_embeddings(X_train)

print("Generating BERT embeddings for testing data...")
X_test_embeddings = get_bert_embeddings(X_test)

# Logistic Regression (One-vs-Rest)
print("Training One-vs-Rest Logistic Regression...")
ovr_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))
ovr_clf.fit(X_train_embeddings, y_train_ovr)

# Predict and evaluate Logistic Regression
y_pred_ovr = ovr_clf.predict(X_test_embeddings)
print("Logistic Regression Evaluation:")
print(classification_report(lb.inverse_transform(y_test_ovr), lb.inverse_transform(y_pred_ovr)))

# XGBoost Classifier
print("Training XGBoost Classifier...")
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

le = LabelEncoder()
y_train_xgb = le.fit_transform(y_train)

xgb_clf.fit(X_train_embeddings, y_train_xgb)

y_pred_xgb = xgb_clf.predict(X_test_embeddings)
y_pred_xgb = le.inverse_transform(y_pred_xgb)

print("XGBoost Evaluation:")
print(classification_report(y_test, y_pred_xgb))

Generating BERT embeddings for training data...
Generating BERT embeddings for testing data...
Training One-vs-Rest Logistic Regression...
Logistic Regression Evaluation:
              precision    recall  f1-score   support

           1       0.28      0.92      0.42      2243
           2       0.48      0.10      0.16      2190
           3       0.49      0.11      0.18      2234
           4       0.53      0.29      0.38      2156
           5       0.63      0.41      0.50      2167

    accuracy                           0.37     10990
   macro avg       0.48      0.36      0.33     10990
weighted avg       0.48      0.37      0.33     10990

Training XGBoost Classifier...


Parameters: { "use_label_encoder" } are not used.



XGBoost Evaluation:
              precision    recall  f1-score   support

           1       0.51      0.54      0.53      2243
           2       0.37      0.37      0.37      2190
           3       0.38      0.34      0.36      2234
           4       0.44      0.43      0.44      2156
           5       0.54      0.57      0.56      2167

    accuracy                           0.45     10990
   macro avg       0.45      0.45      0.45     10990
weighted avg       0.45      0.45      0.45     10990



In [None]:
# Train-test split
balanced_df = balanced_df[balanced_df['Rating'] != 3]

X = balanced_df['Review'].tolist()
y = balanced_df['Rating'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y to one-hot for One-vs-Rest classifier
lb = LabelBinarizer()
y_train_ovr = lb.fit_transform(y_train)
y_test_ovr = lb.transform(y_test)

# Load BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name).to(device)

# Function to generate BERT embeddings
def get_bert_embeddings(texts, batch_size=512):
    """Generate BERT embeddings for a list of texts."""
    bert_model.eval()
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
        with torch.no_grad():
            outputs = bert_model(**tokens)
        # Use [CLS] token embeddings
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(cls_embeddings)
    return np.array(embeddings)

# Generate embeddings for training and testing
print("Generating BERT embeddings for training data...")
X_train_embeddings = get_bert_embeddings(X_train)

print("Generating BERT embeddings for testing data...")
X_test_embeddings = get_bert_embeddings(X_test)

# Logistic Regression (One-vs-Rest)
print("Training One-vs-Rest Logistic Regression...")
ovr_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))
ovr_clf.fit(X_train_embeddings, y_train_ovr)

# Predict and evaluate Logistic Regression
y_pred_ovr = ovr_clf.predict(X_test_embeddings)
print("Logistic Regression Evaluation:")
print(classification_report(lb.inverse_transform(y_test_ovr), lb.inverse_transform(y_pred_ovr)))

# XGBoost Classifier
print("Training XGBoost Classifier...")
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

le = LabelEncoder()
y_train_xgb = le.fit_transform(y_train)

xgb_clf.fit(X_train_embeddings, y_train_xgb)

y_pred_xgb = xgb_clf.predict(X_test_embeddings)
y_pred_xgb = le.inverse_transform(y_pred_xgb)

print("XGBoost Evaluation:")
print(classification_report(y_test, y_pred_xgb))

Generating BERT embeddings for training data...
Generating BERT embeddings for testing data...
Training One-vs-Rest Logistic Regression...
Logistic Regression Evaluation:
              precision    recall  f1-score   support

           1       0.41      0.82      0.54      2228
           2       0.59      0.30      0.40      2159
           4       0.58      0.45      0.51      2188
           5       0.66      0.45      0.53      2217

    accuracy                           0.51      8792
   macro avg       0.56      0.50      0.50      8792
weighted avg       0.56      0.51      0.50      8792

Training XGBoost Classifier...


Parameters: { "use_label_encoder" } are not used.



XGBoost Evaluation:
              precision    recall  f1-score   support

           1       0.57      0.58      0.57      2228
           2       0.51      0.51      0.51      2159
           4       0.52      0.50      0.51      2188
           5       0.58      0.59      0.59      2217

    accuracy                           0.55      8792
   macro avg       0.54      0.54      0.54      8792
weighted avg       0.54      0.55      0.54      8792

