In [102]:
import pandas as pd
pd.options.display.max_rows = None
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [103]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Assuming df is your DataFrame and 'Feature17' has missing values
imputer = IterativeImputer(max_iter=100, tol=1e-3)

df_train = pd.read_csv('../data/iith_foml_2023_train.csv')
# Fit the imputer model on the dataset to learn the data patterns
df_train.drop(columns=['Feature 16', 'Feature 17'], inplace=True)
imputer.fit(df_train)

# Transform the dataset to replace missing values
df_imputed = imputer.transform(df_train)

# Convert back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df_train.columns)

In [104]:
X_train = df_imputed.iloc[:, :-1]
y_train = df_imputed.iloc[:, -1]
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [105]:
df_imputed['Target Variable (Discrete)'].value_counts()

Target Variable (Discrete)
1.0     488
0.0     249
2.0     109
6.0      70
5.0      41
8.0       7
14.0      5
7.0       5
15.0      4
4.0       3
13.0      3
3.0       3
9.0       2
12.0      1
17.0      1
11.0      1
10.0      1
16.0      1
Name: count, dtype: int64

In [106]:
X_train_pd = pd.DataFrame(X_train, columns=df_train.columns[:-1])
X_train_pd['Target Variable (Discrete)'] = y_train
X_train_pd.loc[X_train_pd['Target Variable (Discrete)'] == 12.0]

Unnamed: 0,Feature 1 (Discrete),Feature 2 (Discrete),Feature 3 (Discrete),Feature 4 (Discrete),Feature 5 (Discrete),Feature 6 (Discrete),Feature 7 (Discrete),Feature 8 (Discrete),Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 18,Feature 19 (Discrete),Feature 20 (Discrete),Feature 21 (Discrete),Feature 22 (Discrete),Feature 23 (Discrete),Feature 24,Target Variable (Discrete)
11,-0.353006,1.86527,-0.952298,0.419034,0.141934,0.560641,-0.408464,-0.419865,0.898656,1.286632,1.028634,-0.182683,-1.193573,1.193901,0.491325,-0.311962,-0.609359,0.970641,-1.001086,-0.93296,-0.354493,0.491242,12.0


In [107]:
X_train_pd.loc[X_train_pd['Target Variable (Discrete)'] == 1.0].head(2)

Unnamed: 0,Feature 1 (Discrete),Feature 2 (Discrete),Feature 3 (Discrete),Feature 4 (Discrete),Feature 5 (Discrete),Feature 6 (Discrete),Feature 7 (Discrete),Feature 8 (Discrete),Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 18,Feature 19 (Discrete),Feature 20 (Discrete),Feature 21 (Discrete),Feature 22 (Discrete),Feature 23 (Discrete),Feature 24,Target Variable (Discrete)
0,1.715422,0.807006,-0.89604,0.155038,0.141934,0.560641,-0.408464,-0.419865,1.921298,1.286202,1.028676,-0.181829,-1.191682,1.193631,1.349099,-0.311567,0.11074,0.407409,-0.581928,-0.378201,1.718423,0.502981,1.0
1,0.495074,-0.78039,0.707307,1.34302,-0.612517,0.560641,-0.408464,-0.419865,-0.860103,1.286456,1.02853,-0.182575,-1.195033,1.19372,0.773381,-0.311963,-0.819849,-0.625184,-1.001086,-0.905222,0.490392,-1.530524,1.0


In [108]:
from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print("Shape of train_x:", train_x.shape)
print("Shape of val_x:", val_x.shape)
print("Shape of train_y:", train_y.shape)
print("Shape of val_y:", val_y.shape)


Shape of train_x: (795, 22)
Shape of val_x: (199, 22)
Shape of train_y: (795,)
Shape of val_y: (199,)


In [109]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Initialize Adaboost model
model = BaggingClassifier(random_state=42)

# le = LabelEncoder()
# train_y = le.fit_transform(train_y)

# Train the model on the training data
model.fit(train_x, train_y)

# Make predictions on the validation data
predictions = model.predict(val_x)

# Calculate accuracy and macro F1 score
accuracy = accuracy_score(val_y, predictions)
macro_f1 = f1_score(val_y, predictions, average='macro', zero_division=0.0)

# Display results
print("Bagging Accuracy:", accuracy)
print("Bagging Macro F1 Score:", macro_f1)

# Classification report for more details
classification_rep = classification_report(val_y, predictions, zero_division=0.0)
print("\nBagging Classification Report:\n", classification_rep)

Bagging Accuracy: 0.864321608040201
Bagging Macro F1 Score: 0.46113991011643485

Bagging Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.94      0.91        50
         1.0       0.93      0.98      0.95        91
         2.0       0.65      0.77      0.70        26
         3.0       0.00      0.00      0.00         1
         4.0       1.00      1.00      1.00         1
         5.0       0.50      0.38      0.43         8
         6.0       1.00      1.00      1.00        11
         7.0       0.00      0.00      0.00         2
         8.0       0.00      0.00      0.00         5
        11.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         1
        14.0       1.00      1.00      1.00         1
        15.0       0.00      0.00      0.00         1

    accuracy                           0.86       199
   macro avg       0.46      0.47      0.46       199
weighted avg       0.