In [2]:
import pandas as pd

In [3]:
# Lift pandas' column-width cap so long cell values are displayed untruncated
pd.set_option('display.max_colwidth', None)

# Read the sampled train/test splits (with their embedding column) from disk
train_csv_path = 'data/train_sampled_with_embeddings.csv'
test_csv_path = 'data/test_sampled_with_embeddings.csv'
df_train_sampled = pd.read_csv(train_csv_path)
df_test_sampled = pd.read_csv(test_csv_path)

In [None]:
# Preview the first five rows of the training sample
df_train_sampled.head(n=5)

### Convert to 2D NumPy Array

In [None]:
import ast

import numpy as np


def parse_embedding(raw):
    """Convert one stored embedding into a float NumPy array.

    Parameters
    ----------
    raw : str or array-like
        Either the text read from the CSV (presumably a Python list literal
        of numbers -- confirm against the file) or an already-parsed
        sequence/array.

    Returns
    -------
    numpy.ndarray
        The embedding values with dtype float.

    Raises
    ------
    ValueError, SyntaxError
        If a string value is not a valid Python literal.
    """
    if isinstance(raw, str):
        # literal_eval only accepts Python literals, so a corrupted or
        # malicious CSV cell cannot execute arbitrary code the way eval() can.
        raw = ast.literal_eval(raw)
    # Non-string input is passed straight through, which keeps this cell safe
    # to re-run after the column has already been converted.
    return np.asarray(raw, dtype=float)


# Convert embedding strings to NumPy arrays
df_train_sampled['text_embedding'] = df_train_sampled['text_embedding'].apply(parse_embedding)
df_test_sampled['text_embedding'] = df_test_sampled['text_embedding'].apply(parse_embedding)

# Convert the embeddings column to a 2D NumPy array (each row is an embedding)
X_train = np.vstack(df_train_sampled['text_embedding'].values)
y_train = df_train_sampled['label'].values

X_test = np.vstack(df_test_sampled['text_embedding'].values)
y_test = df_test_sampled['label'].values

# The models below are fit on X_train and applied to X_test, so both splits
# must have the same number of embedding dimensions.
assert X_train.shape[1] == X_test.shape[1], "train/test embedding dimensions differ"

### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build the classifier with a fixed seed and a generous iteration budget,
# then fit it on the training embeddings (fit() returns the estimator itself)
lr_model = LogisticRegression(random_state=42, max_iter=10000).fit(X_train, y_train)

# Predict labels for the held-out test embeddings
y_pred_lr = lr_model.predict(X_test)

# Report per-class precision / recall / F1 on the test set
lr_report = classification_report(y_test, y_pred_lr)
print("Logistic Regression Performance:")
print(lr_report)


Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.69      0.69      0.69      1000
           1       0.69      0.69      0.69      1000

    accuracy                           0.69      2000
   macro avg       0.69      0.69      0.69      2000
weighted avg       0.69      0.69      0.69      2000



### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Build a seeded 100-tree forest and fit it on the training embeddings
# (fit() returns the estimator itself)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out test embeddings
y_pred_rf = rf_model.predict(X_test)

# Report per-class precision / recall / F1 on the test set
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Performance:")
print(rf_report)

Random Forest Performance:
              precision    recall  f1-score   support

           0       0.74      0.70      0.72      1000
           1       0.72      0.76      0.74      1000

    accuracy                           0.73      2000
   macro avg       0.73      0.73      0.73      2000
weighted avg       0.73      0.73      0.73      2000



### XGBoost

In [None]:
!pip install xgboost

In [17]:
from xgboost import XGBClassifier

# Initialize and train the model.
# `use_label_encoder` is deliberately not passed: current XGBoost releases
# ignore it and only emit a "Parameters ... are not used" warning.
xgb_model = XGBClassifier(n_estimators=200, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
print("XGBoost Performance:")
print(classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



XGBoost Performance:
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      1000
           1       0.72      0.74      0.73      1000

    accuracy                           0.72      2000
   macro avg       0.73      0.72      0.72      2000
weighted avg       0.73      0.72      0.72      2000



### Hyperparameter Tuning Using Grid Search

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate Random Forest settings; every combination is tried
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Exhaustive search with 3-fold cross-validation, ranking candidates by F1
rf_base_estimator = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_base_estimator,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Show the winning configuration and its mean cross-validated score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


Best parameters found:  {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation score:  0.7350550516396049
