In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [15]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Assuming df is your DataFrame and 'Feature17' has missing values
imputer = IterativeImputer(max_iter=100, tol=1e-3)

df_train = pd.read_csv('../data/iith_foml_2023_train.csv')
# Fit the imputer model on the dataset to learn the data patterns
df_train.drop(columns=['Feature 16', 'Feature 17'], inplace=True)
imputer.fit(df_train)

# Transform the dataset to replace missing values
df_imputed = imputer.transform(df_train)

# Convert back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df_train.columns)

In [16]:
df_imputed

Unnamed: 0,Feature 1 (Discrete),Feature 2 (Discrete),Feature 3 (Discrete),Feature 4 (Discrete),Feature 5 (Discrete),Feature 6 (Discrete),Feature 7 (Discrete),Feature 8 (Discrete),Feature 9,Feature 10,...,Feature 14,Feature 15,Feature 18,Feature 19 (Discrete),Feature 20 (Discrete),Feature 21 (Discrete),Feature 22 (Discrete),Feature 23 (Discrete),Feature 24,Target Variable (Discrete)
0,1404.0,12.0,64.0,14.0,3.0,1.0,1.0,1.0,110.502,35775.2,...,1436.052,5000.50000,15.040000,104.0,12.0,2.0,32.0,1409.0,37677.1,1.0
1,909.0,0.0,235.0,32.0,1.0,1.0,1.0,1.0,-40.448,35779.4,...,1436.111,3720.50000,12.030000,20.0,1.0,0.0,13.0,909.0,25239.1,1.0
2,654.0,3.0,175.0,2.0,1.0,1.0,1.0,1.0,-27.445,35770.4,...,1436.103,4685.40000,13.010000,1.0,1.0,0.0,13.0,654.0,27683.5,1.0
3,1372.0,12.0,382.0,14.0,2.0,0.0,1.0,0.0,0.001,509.2,...,94.844,564.88968,3416.605776,313.0,12.0,10.0,54.0,1377.0,39363.2,0.0
4,786.0,3.0,199.0,2.0,1.0,0.0,1.0,0.0,0.001,612.1,...,97.823,4.10000,3456.445771,171.0,1.0,5.0,11.0,786.0,40044.4,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,1388.0,0.0,382.0,14.0,2.0,0.0,1.0,0.0,0.002,628.5,...,97.574,2700.40000,3562.677406,104.0,12.0,11.0,53.0,1393.0,41473.4,0.0
990,1339.0,0.0,344.0,14.0,3.0,0.0,1.0,0.0,0.002,520.3,...,95.235,1.30000,3527.256148,205.0,12.0,10.0,63.0,1344.0,40916.2,2.0
991,527.0,7.0,40.0,5.0,5.0,2.0,1.0,3.0,0.005,19130.1,...,676.384,1415.50000,7.020000,101.0,6.0,7.0,95.0,527.0,37938.4,6.0
992,1144.0,12.0,120.0,14.0,3.0,0.0,1.0,0.0,0.003,785.2,...,100.805,300.10000,2974.117578,166.0,12.0,11.0,53.0,1149.0,33433.5,0.0


In [17]:
X_train = df_imputed.iloc[:, :-1]
y_train = df_imputed.iloc[:, -1]
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [18]:
class_counts =  y_train.value_counts()
classes_to_remove = class_counts[class_counts < 10].index
filtered_indices = ~y_train.isin(classes_to_remove)
X_train_filtered = X_train[filtered_indices]
y_train_filtered = y_train[filtered_indices]

In [19]:
y_train_filtered.value_counts()

Target Variable (Discrete)
1.0    488
0.0    249
2.0    109
6.0     70
5.0     41
Name: count, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(X_train_filtered, y_train_filtered, test_size=0.2)

print("Shape of train_x:", train_x.shape)
print("Shape of val_x:", val_x.shape)
print("Shape of train_y:", train_y.shape)
print("Shape of val_y:", val_y.shape)


Shape of train_x: (765, 22)
Shape of val_x: (192, 22)
Shape of train_y: (765,)
Shape of val_y: (192,)


In [21]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Initialize Adaboost model
model = BaggingClassifier()

le = LabelEncoder()
train_y = le.fit_transform(train_y)

# Train the model on the training data
model.fit(train_x, train_y)

# Make predictions on the validation data
predictions = le.inverse_transform(model.predict(val_x))

# Calculate accuracy and macro F1 score
accuracy = accuracy_score(val_y, predictions)
macro_f1 = f1_score(val_y, predictions, average='macro', zero_division=0.0)

# Display results
print("Bagging Accuracy:", accuracy)
print("Bagging Macro F1 Score:", macro_f1)

# Classification report for more details
classification_rep = classification_report(val_y, predictions, zero_division=0.0)
print("\nBagging Classification Report:\n", classification_rep)


Bagging Accuracy: 0.90625
Bagging Macro F1 Score: 0.8344189838597421

Bagging Classification Report:
               precision    recall  f1-score   support

         0.0       0.92      0.96      0.94        49
         1.0       0.90      0.98      0.94       101
         2.0       0.93      0.58      0.72        24
         5.0       0.67      0.67      0.67         6
         6.0       1.00      0.83      0.91        12

    accuracy                           0.91       192
   macro avg       0.88      0.80      0.83       192
weighted avg       0.91      0.91      0.90       192

