In [1]:
import pandas as pd
pd.options.display.max_rows = None
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=100, tol=1e-3)

df_train = pd.read_csv('../data/iith_foml_2023_train.csv')
# df_train.drop(columns=['Feature 16', 'Feature 17'], inplace=True)
for column in df_train.columns:
    if df_train[column].isna().any():
        mean_value = df_train[column].mean()
        df_train[column].fillna(mean_value, inplace=True)
# df_imputed = df_train

In [3]:
# from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight

X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]
scaler = StandardScaler()
X = scaler.fit_transform(X)
# le = LabelEncoder()
# y = le.fit_transform(y)
# class_wts = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
# class_wts_dict = dict(zip(np.arange(len(class_wts)), class_wts))
# sample_wts = compute_sample_weight(class_weight=class_wts_dict, y=y)

In [4]:
class_counts = df_train['Target Variable (Discrete)'].value_counts()
# print(class_counts)


In [5]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

X_resampled, y_resampled = ros.fit_resample(X, y)


In [6]:
X_train, val_x, y_train, val_y = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [7]:
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

def report_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='macro', zero_division=0.0)

# Initialize models
extratrees_model = ExtraTreesClassifier()
randomforest_model = RandomForestClassifier()
gradientboost_model = GradientBoostingClassifier()
# knn_model = KNeighborsClassifier()

knn_model = VotingClassifier(
    estimators=[
        ('bagging_extratrees', extratrees_model),
        ('bagging_randomforest', randomforest_model),
        ('gradientboost_model', gradientboost_model),
#         ('knn_model', knn_model)
    ], voting='hard'
)

# Train the model on the training data
# cv_f1_score = cross_val_score(model, X_train, y_train, scoring=make_scorer(report_f1))
# print(cv_f1_score)
knn_model.fit(X_train, y_train)


In [8]:
y_val_pred = knn_model.predict(val_x)
macro_f1 = f1_score(val_y, y_val_pred, average='macro')

print(f'Macro F1 Score: {macro_f1}')

Macro F1 Score: 0.9960168519559569


In [9]:
cv_f1_score = cross_val_score(knn_model, X_train, y_train, scoring=make_scorer(report_f1))
print(cv_f1_score)

[0.99779708 0.99413499 0.997114   0.99636753 0.99491784]


In [10]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

dt_classification_rep = classification_report(val_y, y_val_pred, zero_division=0.0)
print("\nDecision Tree Classification Report:\n", dt_classification_rep)


Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       116
           1       1.00      0.94      0.97       102
           2       0.98      1.00      0.99       104
           3       0.99      1.00      0.99        87
           4       0.98      1.00      0.99        93
           5       1.00      1.00      1.00       102
           6       1.00      1.00      1.00       100
           7       0.99      1.00      0.99        72
           8       1.00      1.00      1.00       106
           9       1.00      1.00      1.00        83
          10       1.00      1.00      1.00       103
          11       1.00      1.00      1.00       114
          12       1.00      1.00      1.00        95
          13       1.00      1.00      1.00       119
          14       1.00      1.00      1.00        76
          15       1.00      1.00      1.00        90
          16       1.00      1.00      1.0