In [1]:
import pandas as pd
# Lift pandas' display truncation: None means "no limit" for both rows and columns.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Load the raw training table. The last column is the class label: the next
# cell splits it off with iloc[:, -1].
df_train = pd.read_csv('../data/iith_foml_2023_train.csv')

# Drop 'Feature 16' and 'Feature 17' before imputing. Reassigning (instead of
# inplace=True) keeps the step a plain expression on a freshly loaded frame.
df_train = df_train.drop(columns=['Feature 16', 'Feature 17'])

feature_cols = df_train.columns[:-1]
target_col = df_train.columns[-1]

# Fit the imputer on the feature columns only. Including the label column
# would let each row's label drive the values filled into its own features,
# which leaks the target into X and makes later cross-validation optimistic.
# It would also tie the fitted imputer to a column that unlabeled data lacks.
imputer = IterativeImputer(max_iter=100, tol=1e-3)
features_imputed = imputer.fit_transform(df_train[feature_cols])

# Rebuild a DataFrame with the same layout as before: imputed features first,
# label last, so positional slicing downstream keeps working.
# NOTE(review): the label column is no longer passed through the imputer, so
# this assumes it has no missing values -- confirm against the CSV.
df_imputed = pd.DataFrame(features_imputed, columns=feature_cols, index=df_train.index)
df_imputed[target_col] = df_train[target_col]

In [3]:
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight

# Positional split of the imputed frame: every column except the last is a
# feature, the last column is the label.
X_train = df_imputed.iloc[:, :-1]
y_train = df_imputed.iloc[:, -1]

# Standardise the features; the fitted scaler is kept for reuse.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# Map the raw labels onto consecutive integer codes 0..K-1.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)

# 'balanced' class weights, computed over the encoded labels.
class_wts = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)

# Key each weight by its encoded class index, then expand to one weight per row.
class_wts_dict = {cls_idx: wt for cls_idx, wt in zip(np.arange(len(class_wts)), class_wts)}
sample_wts = compute_sample_weight(class_weight=class_wts_dict, y=y_train)

In [4]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, classification_report, make_scorer
from xgboost import XGBClassifier

def report_f1(y_true, y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true`.

    Both arguments are forwarded unchanged to `f1_score`, which is called
    with `average='macro'` and `zero_division=0.0`.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0.0)
    return macro_f1

import inspect

# Base learners, each seeded with random_state=42.
extratrees_model = ExtraTreesClassifier(random_state=42)
bagging_xgboost_model = BaggingClassifier(estimator=XGBClassifier(random_state=42), random_state=42)
randomforest_model = RandomForestClassifier(random_state=42)

# Hard-voting ensemble over the three learners.
# NOTE(review): the labels 'bagging_extratrees' and 'bagging_gradboost' do not
# describe the wrapped models (a plain ExtraTrees and a RandomForest). They
# are left unchanged because other cells may look estimators up by name.
model = VotingClassifier(
    estimators=[
        ('bagging_extratrees', extratrees_model),
        ('bagging_xgboost', bagging_xgboost_model),
        ('bagging_gradboost', randomforest_model),
    ],
    voting='hard',
)

# Cross-validate the ensemble with per-sample weights and the macro-F1 scorer.
# This does not leave `model` fitted.
# cross_val_score takes the fit-time arguments as `params` in newer
# scikit-learn releases and as `fit_params` in older ones (`fit_params` was
# deprecated in 1.4 and later removed). Choose whichever keyword the installed
# version exposes so the cell runs on both.
cv_parameters = inspect.signature(cross_val_score).parameters
fit_kwarg = 'params' if 'params' in cv_parameters else 'fit_params'

cv_f1_score = cross_val_score(
    model,
    X_train,
    y_train,
    scoring=make_scorer(report_f1),
    **{fit_kwarg: {'sample_weight': sample_wts}},
)
print(cv_f1_score)



              precision    recall  f1-score   support

           0       0.88      1.00      0.93        50
           1       0.97      0.98      0.97        98
           2       0.76      0.73      0.74        22
           4       0.50      1.00      0.67         1
           5       1.00      0.62      0.77         8
           6       1.00      1.00      1.00        14
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          14       1.00      1.00      1.00         1
          15       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1

    accuracy                           0.92       199
   macro avg       0.51      0.53      0.51       199
weighted avg       0.90      0.92      0.91       199

              precision    recall  f1-score   support

           0       0.86      0.96      0.91        50
           1       0.96 