TF-IDF Dimensionality Reduction: Consider reducing dimensionality while retaining essential information by applying PCA. This could help mitigate overfitting and improve model performance. 

## Performance Evaluation of CatBoost Regressor

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from catboost import CatBoostRegressor
import time

# List of datasets to process
datasets = [
    'combined_features_exp_2.csv',
    'combined_features_exp_2_pca_1000.csv',
    'combined_features_exp_2_pca_700.csv',
    'combined_features_exp_2_pca_500.csv'
]

results = []

# Function to discretize predictions
def discretize_predictions(predictions, target_classes):
    bins = np.linspace(min(target_classes) - 0.5, max(target_classes) + 0.5, num=len(target_classes) + 1)
    discretized = np.digitize(predictions, bins) - 1
    discretized = np.clip(discretized, 0, len(target_classes) - 1)
    return discretized + 1

# Process each dataset
for dataset in datasets:
    # Load the datasets
    combined_features_df = pd.read_csv(dataset)
    df_transformed = pd.read_csv('transformed_data_exp_2.csv')

    print(f"Working on Split, Train, Validate for {dataset}")
    start_time = time.time()

    # Split Data
    X = combined_features_df
    y = df_transformed['score']
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    # Check the distribution of the target classes in the training data
    print("Distribution of target classes in the training data:")
    print(y_train.value_counts())

    # Check the distribution of the target classes in the test data
    print("Distribution of target classes in the test data:")
    print(y_test.value_counts())

    # Define the CatBoost Regressor model
    model = CatBoostRegressor(iterations=600, depth=4, learning_rate=0.1, random_seed=42, silent=True)

    target_classes = np.sort(np.unique(y))

    print(f"Working on CatBoost Regressor for {dataset}...")
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = time.time()
    print(f"Elapsed time for CatBoost Regressor on {dataset}: {end_time - start_time} seconds")

    # Predictions on training set
    y_train_pred = model.predict(X_train)
    y_train_pred_scaled = (y_train_pred - y_train_pred.min()) / (y_train_pred.max() - y_train_pred.min()) * (y_train.max() - y_train.min()) + y_train.min()
    y_train_discretized = discretize_predictions(y_train, target_classes)
    y_train_pred_discretized = discretize_predictions(y_train_pred_scaled, target_classes)
    kappa_train_score = cohen_kappa_score(y_train_discretized, y_train_pred_discretized, weights='quadratic')

    # Predictions on test set
    y_test_pred = model.predict(X_test)
    y_test_pred_scaled = (y_test_pred - y_test_pred.min()) / (y_test_pred.max() - y_test_pred.min()) * (y_train.max() - y_train.min()) + y_train.min()
    y_test_discretized = discretize_predictions(y_test, target_classes)
    y_test_pred_discretized = discretize_predictions(y_test_pred_scaled, target_classes)
    kappa_test_score = cohen_kappa_score(y_test_discretized, y_test_pred_discretized, weights='quadratic')

    results.append({'Dataset': dataset, 'QWK Score (Train)': kappa_train_score, 'QWK Score (Test)': kappa_test_score})



Working on Split, Train, Validate for combined_features_exp_2.csv
Distribution of target classes in the training data:
score
2    4294
3    4017
4    2513
5    2194
1     854
6     568
Name: count, dtype: int64
Distribution of target classes in the test data:
score
2    1074
3    1005
4     628
5     548
1     214
6     142
Name: count, dtype: int64
Working on CatBoost Regressor for combined_features_exp_2.csv...
Elapsed time for CatBoost Regressor on combined_features_exp_2.csv: 15.531715869903564 seconds
Working on Split, Train, Validate for combined_features_exp_2_pca_1000.csv
Distribution of target classes in the training data:
score
2    4294
3    4017
4    2513
5    2194
1     854
6     568
Name: count, dtype: int64
Distribution of target classes in the test data:
score
2    1074
3    1005
4     628
5     548
1     214
6     142
Name: count, dtype: int64
Working on CatBoost Regressor for combined_features_exp_2_pca_1000.csv...
Elapsed time for CatBoost Regressor on combined_featu

In [4]:
results_df = pd.DataFrame(results)
print(results_df)

                                Dataset  QWK Score (Train)  QWK Score (Test)
0           combined_features_exp_2.csv           0.904795          0.861679
1  combined_features_exp_2_pca_1000.csv           0.906795          0.862143
2   combined_features_exp_2_pca_700.csv           0.906436          0.862702
3   combined_features_exp_2_pca_500.csv           0.902056          0.867482
