In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, make_scorer
from catboost import CatBoostRegressor
import joblib
import time

# Function to discretize predictions
def discretize_predictions(predictions, target_classes):
    """Snap continuous regression outputs to the nearest target class label.

    Decision boundaries are placed at the midpoints between consecutive
    (sorted, de-duplicated) class labels. Values below the first boundary map
    to the smallest class, values at or above the last boundary map to the
    largest class, and a value lying exactly on a boundary goes to the higher
    of the two neighbouring classes.

    For contiguous integer labels starting at 1 this yields the same result as
    the previous ``bin index + 1`` formulation; for any other label set
    (e.g. labels starting at 0, or unevenly spaced labels) it returns the
    actual labels instead of 1-based bin positions.

    Parameters
    ----------
    predictions : array-like of float
        Continuous model outputs to discretize.
    target_classes : array-like of numbers
        Class labels that may be returned. Order and duplicates are ignored.

    Returns
    -------
    numpy.ndarray
        Class labels taken from ``target_classes``, one per prediction, with
        the same shape as ``predictions``.

    Raises
    ------
    ValueError
        If ``target_classes`` is empty.
    """
    # np.unique both sorts and de-duplicates, so callers may pass labels in any order.
    classes = np.unique(np.asarray(target_classes))
    if classes.size == 0:
        raise ValueError("target_classes must contain at least one class label")

    predictions = np.asarray(predictions)
    if classes.size == 1:
        # Only one possible label: every prediction maps to it.
        return np.full(predictions.shape, classes[0])

    # Midpoints between neighbouring labels; written as "left + gap / 2" so the
    # arithmetic is done in floating point even for integer labels.
    boundaries = classes[:-1] + np.diff(classes) / 2.0

    # With K-1 boundaries np.digitize returns an index in 0..K-1, so no extra
    # clipping is required for out-of-range predictions.
    class_index = np.digitize(predictions, boundaries)
    return classes[class_index]

# Custom scorer
def qwk_scorer(y_true, y_pred):
    """Quadratic weighted kappa between true labels and discretized predictions.

    The set of classes is taken from the labels present in ``y_true``; the
    continuous ``y_pred`` values are binned with ``discretize_predictions``
    against that set before the kappa score is computed.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like of float
        Continuous model outputs.

    Returns
    -------
    float
        Result of ``cohen_kappa_score`` with ``weights='quadratic'``.
    """
    # np.unique already returns its values in ascending order.
    labels_seen = np.unique(y_true)
    return cohen_kappa_score(
        y_true,
        discretize_predictions(y_pred, labels_seen),
        weights='quadratic',
    )

# Dataset and hyperparameters
dataset = 'combined_features_exp_2_pca_500.csv'
hyperparameters = {'learning_rate': 0.01, 'l2_leaf_reg': 7, 'iterations': 500, 'depth': 4}

# Load the feature matrix and the frame that carries the target column.
# NOTE(review): features and targets come from two separate CSVs and are paired
# by row position only — confirm both files were written in the same row order.
combined_features_df = pd.read_csv(dataset)
df_transformed = pd.read_csv('transformed_data_exp_2.csv')

print(f"Working on Split, Train, Validate for {dataset}")

# Split Data (stratified on the target so both splits keep the class balance)
X = combined_features_df
y = df_transformed['score']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Class labels used to discretize the regressor output. Defined here, before
# its first use, so the cell runs on a fresh kernel (Restart & Run All) instead
# of relying on a value left over from an earlier execution.
target_classes = np.sort(np.unique(y))

# Check the distribution of the target classes in the training data
print("Distribution of target classes in the training data:")
print(y_train.value_counts())

# Check the distribution of the target classes in the test data
print("Distribution of target classes in the test data:")
print(y_test.value_counts())

# Initialize and train the CatBoost Regressor model with specified hyperparameters
model = CatBoostRegressor(**hyperparameters, random_seed=42, silent=True)

print(f"Training CatBoost Regressor with specified parameters on {dataset}...")
# Only the fit call is timed.
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()
print(f"Elapsed time for training on {dataset}: {end_time - start_time} seconds")

# Predict on the training set
y_train_pred = model.predict(X_train)
y_train_pred_discretized = discretize_predictions(y_train_pred, target_classes)
qwk_train_score = cohen_kappa_score(y_train, y_train_pred_discretized, weights='quadratic')

# Predict on the test set
y_test_pred = model.predict(X_test)
y_test_pred_discretized = discretize_predictions(y_test_pred, target_classes)
qwk_test_score = cohen_kappa_score(y_test, y_test_pred_discretized, weights='quadratic')

# Save the trained model
model_filename = 'catboost_model_exp2_pca_500.pkl'
joblib.dump(model, model_filename)
print(f"Model saved as '{model_filename}'")

# Print the results
print(f"QWK Score (Train) for {dataset}: {qwk_train_score}")
print(f"QWK Score (Test) for {dataset}: {qwk_test_score}")

# One-row summary of this run, kept as a DataFrame for display in a later cell.
results = {
    'Dataset': dataset,
    'Params': hyperparameters,
    'QWK Score (Train)': qwk_train_score,
    'QWK Score (Test)': qwk_test_score
}

results_df = pd.DataFrame([results])


Working on Split, Train, Validate for combined_features_exp_2_pca_500.csv
Distribution of target classes in the training data:
score
2    4294
3    4017
4    2513
5    2194
1     854
6     568
Name: count, dtype: int64
Distribution of target classes in the test data:
score
2    1074
3    1005
4     628
5     548
1     214
6     142
Name: count, dtype: int64
Training CatBoost Regressor with specified parameters on combined_features_exp_2_pca_500.csv...
Elapsed time for training on combined_features_exp_2_pca_500.csv: 18.714754343032837 seconds
Model saved as 'catboost_model_exp2_pca_500.pkl'
QWK Score (Train) for combined_features_exp_2_pca_500.csv: 0.8392364665721874
QWK Score (Test) for combined_features_exp_2_pca_500.csv: 0.8369415811018865


In [8]:
# Display the results summary (at most the first five rows).
results_df.head(n=5)

Unnamed: 0,Dataset,Params,QWK Score (Train),QWK Score (Test)
0,combined_features_exp_2_pca_500.csv,"{'learning_rate': 0.01, 'l2_leaf_reg': 7, 'ite...",0.839236,0.836942
