In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import f_oneway

%matplotlib inline

In [None]:
# Read the semicolon-separated training set and preview its first rows.
data = pd.read_csv('./data/training_data.csv', sep=';')
data.head()

In [None]:
# Indicator lookup: maps each CODE in the column-name dictionary file to its INDICATOR NAME.
col_dict_df = pd.read_csv('data/column_names_dictionary.csv', delimiter=';')
col_dict = {
    code: indicator_name
    for code, indicator_name in zip(col_dict_df['CODE'], col_dict_df['INDICATOR NAME'])
}

# Not the last expression of the cell, so nothing is displayed for it;
# it still raises KeyError if "I1" is missing from the dictionary.
col_dict["I1"]

# Sector lookup: maps each CODE in the group dictionary file to its SECTOR.
group_dict_df = pd.read_csv('data/group_dictionary.csv', delimiter=';')
group_dict = {
    code: sector
    for code, sector in zip(group_dict_df['CODE'], group_dict_df['SECTOR'])
}
group_dict["G1"]


In [None]:
# Every text-typed column after the first one is parsed as float,
# treating "," as the decimal mark.
object_columns = [name for name in data.columns[1:] if data[name].dtype == 'object']
for name in object_columns:
    with_decimal_point = data[name].str.replace(",", ".")
    data[name] = with_decimal_point.astype(float)

data.dtypes

In [None]:
# Invariant check: once the decimal-comma conversion has run, no column other than
# the first may still be stored as text. Fails loudly instead of silently repeating
# the conversion loop.
remaining_object_cols = [name for name in data.columns[1:] if data[name].dtype == 'object']
assert not remaining_object_cols, f"columns still stored as text: {remaining_object_cols}"

data.dtypes

In [None]:
# Read the semicolon-separated test set (file name indicates it carries no target).
test_data = pd.read_csv('./data/test_data_no_target.csv', sep=';')

In [None]:
# Convert every text-typed column after the first one to float,
# treating "," as the decimal mark.
for column in test_data.columns[1:]:
    if test_data[column].dtype == 'object':
        test_data[column] = test_data[column].str.replace(",", ".").astype(float)

# Show the dtypes of the frame that was just converted (the test frame,
# not the training frame) so the result of this cell can actually be verified.
test_data.dtypes

In [None]:
# Column roles by position: the first column is the categorical one; everything
# between it and the last two columns is treated as numerical.
categorical_col, numerical_col = data.columns[0], data.columns[1:-2]

In [None]:
# Columns removed before modelling, following the author's exploration notes:
#   * I21, I48, I50 and dI21, dI48, dI50 -- very high share of missing values
#     (the notes also mention 100% missing for group G1);
#   * I3, I8 -- highly correlated with I2;
#   * I34    -- highly correlated with I33.
data_dropped = data.drop(
    labels=[
        'I21', 'I48', 'I50',
        'dI21', 'dI48', 'dI50',
        'I3', 'I8', 'I34',
    ],
    axis='columns',
)

data_dropped.head()

In [None]:

# Impute missing feature values with the median of the row's group.
# Feature columns are taken by position: everything except the first column and
# the last two columns.
# Each group is copied before it is modified, so the fill never writes into a
# frame derived from `data_dropped` (avoids pandas' SettingWithCopyWarning and
# leaves `data_dropped` untouched).
# NOTE: the result is ordered group by group because the groups are concatenated
# one after another; the original index labels are kept, so positional order no
# longer matches `data`.
dfs = []

for _, group_df in data_dropped.groupby('Group'):
    group_df = group_df.copy()
    for col in group_df.columns[1:-2]:
        # A column that is entirely missing within a group has a NaN median and
        # therefore stays missing.
        group_df[col] = group_df[col].fillna(group_df[col].median())
    dfs.append(group_df)

data_imputed = pd.concat(dfs)
data_imputed.head()

# data_imputed now holds the group-median-imputed training data.

In [None]:
# Total number of cells still missing in the imputed training frame.
null_df = data_imputed.isna()
null_df.to_numpy().sum()

In [None]:
# Remove the same columns from the test set as from the training set.
# Author's stated reasons:
#   * I21, I48, I50 and dI21, dI48, dI50 -- very high share of missing values
#     (the notes also mention 100% missing for group G1);
#   * I3, I8 -- highly correlated with I2;
#   * I34    -- highly correlated with I33.
test_data_dropped = test_data.drop(
    labels=[
        'I21', 'I48', 'I50',
        'dI21', 'dI48', 'dI50',
        'I3', 'I8', 'I34',
    ],
    axis='columns',
)

test_data_dropped.head()

In [None]:
# Impute missing feature values in the test set with the median of the row's group.
# Each group is copied before it is modified, so the fill never writes into a
# frame derived from `test_data_dropped` (avoids pandas' SettingWithCopyWarning).
# NOTE(review): the column slice [1:-2] skips the last two columns. If the test
# file really has no target columns, the last two feature columns are never
# imputed -- confirm against the test file's schema.
# NOTE: the result is ordered group by group; original index labels are kept.
dfs = []

for _, group_df in test_data_dropped.groupby('Group'):
    group_df = group_df.copy()
    for col in group_df.columns[1:-2]:
        # A column that is entirely missing within a group has a NaN median and
        # therefore stays missing.
        group_df[col] = group_df[col].fillna(group_df[col].median())
    dfs.append(group_df)

test_data_imputed = pd.concat(dfs)
test_data_imputed.head()

# test_data_imputed now holds the group-median-imputed test data.

In [None]:
# Total number of cells still missing in the imputed test frame.
null_df = test_data_imputed.isna()
null_df.to_numpy().sum()

In [None]:
# Features are every column except the last two; the targets are 'Class' and 'Perform'.
X = data_imputed.iloc[:, :-2]
y = data_imputed.loc[:, ['Class', 'Perform']]

class_labels = [-1, 0, 1]  # Define the class labels present in your dataset

# Shift labels from {-1, 0, 1} to {0, 1, 2} so they can be counted with bincount
# and used as positions in the weight vector.
y_shifted = np.array(y['Class']) + 1

# Compute the class frequencies
class_counts = np.bincount(y_shifted)

# Compute the inverse class frequencies
class_weights = 1.0 / class_counts

# Normalize the weights
class_weights /= class_weights.sum()

# One weight per row of X, in X's row order.
# The weights must be looked up from the labels of `data_imputed` itself: its rows
# were regrouped during imputation, so filling the array through the index labels
# of the original `data` frame would attach weights to the wrong rows (sample
# weights are matched to X by position, not by index label).
class_weights_array = class_weights[y_shifted]

In [None]:
# Preprocessing done: removed highly correlated columns, removed NA, imputed group medians, computed class weights.
# CatBoost doesn't need one-hot encoding.

In [None]:
# Preview the first five rows of the imputed test frame.
test_data_imputed.head(5)

In [None]:
# Preview the first five rows of the imputed training frame.
data_imputed.head(5)

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
# Assuming X contains the features and y contains Perform and Class

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils import class_weight

# Custom prediction logic

def custom_predict_proba(probas):
    """Turn per-sample class probabilities into labels in {-1, 0, 1}.

    Parameters
    ----------
    probas : iterable of indexable rows
        Each row is read at position 0 (treated as the probability of class -1)
        and position 2 (treated as the probability of class 1).
        Assumes the column order is [-1, 0, 1] -- TODO confirm against the
        fitted model's class order.

    Returns
    -------
    numpy.ndarray
        One label per row: -1 if row[0] >= 0.5, otherwise 1 if row[2] >= 0.5,
        otherwise 0. The class -1 test takes precedence when both reach 0.5.
    """
    labels = [
        -1 if row[0] >= 0.5 else (1 if row[2] >= 0.5 else 0)
        for row in probas
    ]
    return np.array(labels)

# Custom scoring function
def custom_scorer(catboost_model, X, y):
    """Score a fitted classifier by its negated average misclassification cost.

    Parameters
    ----------
    catboost_model : object exposing ``predict_proba(X)``
        The fitted estimator to evaluate.
    X : features passed straight to ``predict_proba``.
    y : iterable of true labels in {-1, 0, 1}.

    Returns
    -------
    float
        ``-(total cost / len(y))`` where a prediction costs the absolute
        distance between true and predicted label (0, 1 or 2). The sign is
        flipped because scikit-learn's search utilities maximise the score.
    """
    predicted_labels = custom_predict_proba(catboost_model.predict_proba(X))

    # penalties[i, j]: cost of predicting label (j - 1) when the true label is (i - 1).
    penalties = np.array([[0, 1, 2], [1, 0, 1], [2, 1, 0]])

    per_sample_cost = [
        penalties[int(actual) + 1, int(guess) + 1]
        for actual, guess in zip(y, predicted_labels)
    ]
    mean_cost = np.sum(per_sample_cost) / len(y)
    return -mean_cost


# Base CatBoost classifier. 'Group' is declared as a categorical feature;
# od_type / od_wait configure CatBoost's overfitting detector.
catboost_model = CatBoostClassifier(
    cat_features=['Group'],
    one_hot_max_size=11,
    od_type='Iter',
    od_wait=20,
)

# Hyperparameter grid explored exhaustively by the search below
# (3 learning rates x 4 depths x 3 iteration counts).
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    depth=[4, 6, 8, 10],
    iterations=[100, 200, 300],
    # Add more hyperparameters as needed
)

# 5-fold cross-validated grid search scored with the custom cost-based scorer.
# error_score='raise' makes a failing fit abort the search instead of being
# recorded as a bad score.
grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    scoring=custom_scorer,
    cv=5,
    verbose=1,
    n_jobs=-1,
    error_score='raise',
)

# Run the search on the 'Class' target, forwarding the per-row class weights to fit.
grid_search.fit(X, y['Class'], sample_weight=class_weights_array)

# Keep the refitted winner and the parameter combination that produced it.
best_model, best_params = grid_search.best_estimator_, grid_search.best_params_
