In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from scipy.stats import f_oneway
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [2]:
# Read the semicolon-separated training set and preview its first rows.
data = pd.read_csv('./data/training_data.csv', sep=';')
data.head()

FileNotFoundError: [Errno 2] No such file or directory: './data/training_data.csv'

In [None]:
# Indicator code -> indicator name, read from the column-names dictionary file.
col_dict_df = pd.read_csv('data/column_names_dictionary.csv', sep=';')
col_dict = {
    code: indicator_name
    for code, indicator_name in zip(col_dict_df['CODE'], col_dict_df['INDICATOR NAME'])
}

# Lookup check for one code; it is not displayed because it is not the last
# expression of the cell, but it raises KeyError if the code is missing.
col_dict["I1"]

# Group code -> sector, read from the group dictionary file.
group_dict_df = pd.read_csv('data/group_dictionary.csv', sep=';')
group_dict = {
    code: sector
    for code, sector in zip(group_dict_df['CODE'], group_dict_df['SECTOR'])
}
group_dict["G1"]

In [None]:
# Every column after the first that pandas stored as text gets its ',' replaced
# by '.' and is then cast to float. Columns that are already numeric are skipped.
text_columns = [name for name in data.columns[1:] if data[name].dtype == 'object']
for name in text_columns:
    data[name] = data[name].str.replace(",", ".").astype(float)

data.dtypes

In [None]:
# This cell used to repeat the conversion loop of the previous cell verbatim.
# That loop already converts every text column after the first, so running it a
# second time does nothing; the cell now only verifies the result instead.
leftover_text_columns = [
    column for column in data.columns[1:] if data[column].dtype == 'object'
]
assert not leftover_text_columns, (
    f"columns still stored as text after conversion: {leftover_text_columns}"
)

data.dtypes

In [None]:
# Read the semicolon-separated test set (the file without targets, per its name).
test_data = pd.read_csv('./data/test_data_no_target.csv', sep=';')

In [None]:
# Replace ',' with '.' in every text column after the first and cast it to
# float, mirroring the conversion applied to the training frame.
for column in test_data.columns[1:]:
    if test_data[column].dtype == 'object':
        test_data[column] = test_data[column].str.replace(",", ".").astype(float)

# Show the dtypes of the frame that was just converted. The cell previously
# displayed `data.dtypes` (the training frame), a copy-paste slip that hid the
# result of this conversion.
test_data.dtypes

In [None]:
# Columns removed before modelling, following the author's notes:
#   * I21, I48, I50 and dI21, dI48, dI50 have a very high share of missing
#     values (the notes also mention 100% missing for group G1 -- TODO confirm
#     which groups were meant);
#   * I2 is highly correlated with I3 and I8, and I33 with I34, so I3, I8 and
#     I34 are dropped as well.
sparse_columns = ['I21', 'I48', 'I50', 'dI21', 'dI48', 'dI50']
correlated_columns = ['I3', 'I8', 'I34']

data_dropped = data.drop(columns=sparse_columns + correlated_columns)

data_dropped.head()

In [None]:

# Fill missing feature values with the median of the row's 'Group'.
#
# Feature columns are everything between the first column and the last two.
# NOTE(review): this relies on the first column being the group label and the
# last two being the targets ('Class', 'Perform') -- confirm the file layout.
feature_columns = list(data_dropped.columns[1:-2])

# transform('median') yields, for every row, the median of that row's group, so
# the result shares the index of data_dropped and the original row order is
# kept. The earlier split-by-group / pd.concat loop assigned into group slices
# (SettingWithCopyWarning) and returned the rows re-ordered by group, which
# breaks anything that is aligned positionally with `data`.
group_medians = data_dropped.groupby('Group')[feature_columns].transform('median')

data_imputed = data_dropped.copy()
data_imputed[feature_columns] = data_dropped[feature_columns].fillna(group_medians)
data_imputed.head()

# data_imputed now holds the imputed values. A column that is entirely missing
# within a group has no median and stays NaN. Rows whose 'Group' itself is
# missing are kept unfilled (the earlier loop dropped such rows silently).

In [None]:
# Total number of missing cells left in the imputed training frame.
null_df = data_imputed.isna()
missing_per_column = null_df.sum()
missing_per_column.sum()

In [None]:
# Remove the same columns from the test frame as from the training frame,
# following the author's notes:
#   * I21, I48, I50 and dI21, dI48, dI50 have a very high share of missing
#     values (the notes also mention 100% missing for group G1 -- TODO confirm
#     which groups were meant);
#   * I2 is highly correlated with I3 and I8, and I33 with I34, so I3, I8 and
#     I34 are dropped as well.
sparse_columns = ['I21', 'I48', 'I50', 'dI21', 'dI48', 'dI50']
correlated_columns = ['I3', 'I8', 'I34']

test_data_dropped = test_data.drop(columns=sparse_columns + correlated_columns)

test_data_dropped.head()

In [None]:
# Fill missing feature values of the test frame with the median of the row's
# 'Group' (medians are computed from the test frame itself).
#
# Every column after the first is treated as a feature.
# NOTE(review): this relies on the first column being the group label --
# confirm the file layout.
test_feature_columns = list(test_data_dropped.columns[1:])

# transform('median') yields, for every row, the median of that row's group, so
# the rows stay in the order of the test file. The earlier split-by-group /
# pd.concat loop assigned into group slices (SettingWithCopyWarning) and
# returned the rows re-ordered by group, so predictions made on the result no
# longer followed the order of the test file.
test_group_medians = (
    test_data_dropped.groupby('Group')[test_feature_columns].transform('median')
)

test_data_imputed = test_data_dropped.copy()
test_data_imputed[test_feature_columns] = (
    test_data_dropped[test_feature_columns].fillna(test_group_medians)
)
test_data_imputed.head()

# test_data_imputed now holds the imputed values. A column that is entirely
# missing within a group has no median and stays NaN. Rows whose 'Group' itself
# is missing are kept unfilled (the earlier loop dropped such rows silently).

In [None]:
# Total number of missing cells left in the imputed test frame.
null_df = test_data_imputed.isna()
missing_per_column = null_df.sum()
missing_per_column.sum()

In [None]:
# Targets: the 'Class' and 'Perform' columns, kept together in one frame.
target_columns = ['Class', 'Perform']
y = data_imputed[target_columns]

# Features: every column of the imputed frame except the last two.
X = data_imputed.iloc[:, :-2]

In [None]:
class_labels = [-1, 0, 1]  # Class labels present in the dataset

# Shift the labels to 0..2 so they can be used as positions into the weight
# vector. The cast to int is needed because np.bincount rejects float input.
y_shifted = y['Class'].to_numpy().astype(int) - min(class_labels)

# Number of samples per class; minlength keeps one slot per label even when a
# label does not occur in y.
class_counts = np.bincount(y_shifted, minlength=len(class_labels))

# Inverse-frequency weights, normalised to sum to 1. A class with no samples
# gets weight 0 instead of triggering a division by zero.
class_weights = np.zeros(len(class_counts), dtype=float)
present = class_counts > 0
class_weights[present] = 1.0 / class_counts[present]
class_weights /= class_weights.sum()

# One weight per training row, in the same order as y (and therefore X).
# The weights used to be written at the index labels of `data`; X and y are
# taken from data_imputed, which is not guaranteed to share the row order of
# `data`, so those weights could end up attached to the wrong rows.
class_weights_array = class_weights[y_shifted]

In [None]:

# Random forest on the class label.
# The keyword names used to carry a mangled 'classifier' prefix and `gini` /
# `log2` were bare, undefined names; RandomForestClassifier takes the plain
# parameter names and string values. NOTE(review): the old names look like
# pasted grid-search output for a pipeline step called 'classifier' -- confirm
# these are the intended hyper-parameters.
random_model = RandomForestClassifier(
    bootstrap=True,
    criterion='gini',
    max_depth=30,
    max_features='log2',
    min_samples_leaf=17,
    min_samples_split=2,
    n_estimators=166,
    random_state=42,  # fixed seed so a re-run reproduces the same forest
)

# NOTE(review): X is every column of data_imputed except the last two, so it
# still includes 'Group' (the CatBoost model uses it as a categorical feature).
# If that column holds non-numeric codes the forest needs it encoded or dropped
# first -- confirm.
random_model.fit(X, y['Class'])

random_class_train_pred = random_model.predict(X)
# `test_data2` is not defined anywhere in this notebook; test_data_imputed is
# the test frame prepared the same way as X. NOTE(review): confirm this is the
# frame that was meant.
random_class_test_pred = random_model.predict(test_data_imputed)

In [None]:
# Gradient-boosted regressor with 'Group' handled as a categorical feature.
catboost_model = CatBoostRegressor(
    od_type='Iter',
    od_wait=20,
    cat_features=['Group'],
    one_hot_max_size=11,
    depth=6,
    iterations=200,
    learning_rate=0.05,
)

# NOTE(review): the regressor is fitted on 'Class', while its outputs are named
# "perform" and are later cut at 0.04 / -0.015 -- confirm whether y['Perform']
# was the intended target.
catboost_model.fit(X, y['Class'], sample_weight=class_weights_array)
catboost_perform_train_pred = catboost_model.predict(X)
# Was `catboost_model.predit(...)`, which raises AttributeError.
catboost_perform_test_pred = catboost_model.predict(test_data_imputed)


In [None]:
def perform_to_class(predictions, upper=0.04, lower=-0.015):
    """Map regression outputs to class labels.

    Args:
        predictions: Iterable of numeric predictions.
        upper: Predictions strictly above this value become class 1.
        lower: Predictions strictly above this value (and not above ``upper``)
            become class 0.

    Returns:
        A list with one int per prediction: 1, 0 or -1. Everything that is not
        above ``lower`` (NaN included) maps to -1.
    """
    values = np.asarray(predictions, dtype=float)
    labels = np.where(values > upper, 1, np.where(values > lower, 0, -1))
    return labels.tolist()


# Train: `ytrain_Perform_pred` was read here without ever being assigned, so it
# is now computed from the fitted model.
ytrain_Perform_pred = catboost_model.predict(X)
catboost_class_train_pred = perform_to_class(ytrain_Perform_pred)

# Test: `test_data2` is not defined anywhere in this notebook; test_data_imputed
# is the test frame prepared the same way as X. NOTE(review): confirm this is
# the frame that was meant.
ytest_Perform_pred = catboost_model.predict(test_data_imputed)
catboost_class_test_pred = perform_to_class(ytest_Perform_pred)

In [None]:
# Stack the two base models: their class predictions become the meta-features
# (column 0: CatBoost-derived class, column 1: random-forest class).
# Ridge is imported in the notebook's first cell together with the other imports.
meta_features_train = np.column_stack((catboost_class_train_pred, random_class_train_pred))
meta_features_test = np.column_stack((catboost_class_test_pred, random_class_test_pred))

# Train a meta-model (Ridge regression in this case).
# NOTE(review): the training meta-features are predictions the base models made
# on the very rows they were fitted on, so the meta-model sees optimistic
# inputs; out-of-fold predictions would avoid that.
meta_model = Ridge()
meta_model.fit(meta_features_train, y['Class'])

# Final predictions on the test data. Ridge returns continuous scores.
y_test_final_pred = meta_model.predict(meta_features_test)

# Nearest valid class label (-1, 0 or 1) for each continuous score, kept in a
# separate variable so the raw scores stay available.
y_test_final_class = np.clip(np.rint(y_test_final_pred), -1, 1).astype(int)