In [21]:
import pandas as pd
pd.options.display.max_rows = None
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [22]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Assuming df is your DataFrame and 'Feature17' has missing values
imputer = IterativeImputer(max_iter=100, tol=1e-3)

df_train = pd.read_csv('../data/iith_foml_2023_train.csv')
# Fit the imputer model on the dataset to learn the data patterns
df_train.drop(columns=['Feature 16', 'Feature 17'], inplace=True)
imputer.fit(df_train)

# Transform the dataset to replace missing values
df_imputed = imputer.transform(df_train)

# Convert back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df_train.columns)

In [23]:
X_train = df_imputed.iloc[:, :-1]
y_train = df_imputed.iloc[:, -1]
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [24]:
from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(X_train, y_train, test_size=0.2)

print("Shape of train_x:", train_x.shape)
print("Shape of val_x:", val_x.shape)
print("Shape of train_y:", train_y.shape)
print("Shape of val_y:", val_y.shape)


Shape of train_x: (795, 22)
Shape of val_x: (199, 22)
Shape of train_y: (795,)
Shape of val_y: (199,)


In [25]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Initialize models
bagging_model = BaggingClassifier(random_state=42)
extratrees_model = ExtraTreesClassifier(random_state=42)
randomforest_model = RandomForestClassifier(random_state=42)

model = VotingClassifier(
    estimators=[
        ('rf', randomforest_model),
        ('bg', bagging_model),
        ('et', extratrees_model)
    ], voting='hard'
)

# Train the model on the training data
model.fit(train_x, train_y)

# Make predictions on the validation data
predictions = model.predict(val_x)

# Calculate accuracy and macro F1 score
accuracy = accuracy_score(val_y, predictions)
macro_f1 = f1_score(val_y, predictions, average='macro', zero_division=0.0)

# Display results
print("Bagging Accuracy:", accuracy)
print("Bagging Macro F1 Score:", macro_f1)

# Classification report for more details
classification_rep = classification_report(val_y, predictions, zero_division=0.0)
print("\nBagging Classification Report:\n", classification_rep)

Bagging Accuracy: 0.914572864321608
Bagging Macro F1 Score: 0.4942547484413513

Bagging Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      1.00      0.92        37
         1.0       0.96      0.98      0.97        98
         2.0       0.77      0.92      0.84        26
         4.0       0.00      0.00      0.00         2
         5.0       1.00      0.54      0.70        13
         6.0       1.00      1.00      1.00        17
         7.0       0.00      0.00      0.00         1
         8.0       0.00      0.00      0.00         2
        14.0       1.00      1.00      1.00         1
        15.0       0.00      0.00      0.00         1
        17.0       0.00      0.00      0.00         1

    accuracy                           0.91       199
   macro avg       0.51      0.49      0.49       199
weighted avg       0.89      0.91      0.90       199

