In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [36]:
# Load the training set: the last column is the target, every other column is a feature.
df = pd.read_csv('data/iith_foml_2023_train.csv')
y_train = df.iloc[:, -1]

# Remove the two excluded feature columns with a non-mutating drop. Calling
# drop(..., inplace=True) on a frame obtained by slicing `df` can trigger pandas'
# SettingWithCopyWarning; chaining builds a fresh frame and leaves `df` untouched.
X_train = df.iloc[:, :-1].drop(columns=['Feature 16', 'Feature 17'])

# Create the imputer and fit it on the training features only, so the same fitted
# object can be reused later to fill missing values in other data.
imputer = IterativeImputer(max_iter=100, tol=1e-3)
imputer.fit(X_train)

# Replace missing values and wrap the resulting array back into a DataFrame that
# carries the original feature names.
X_train_imputed = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns)

In [37]:
# Read the test set and discard the same two feature columns that were removed
# from the training features.
df_test = pd.read_csv('data/iith_foml_2023_test.csv').drop(
    columns=['Feature 16', 'Feature 17']
)

# Fill missing values with the imputer that was already fitted on the training
# data (no refit here), then restore the column labels.
test_imputed_values = imputer.transform(df_test)
X_test = pd.DataFrame(test_imputed_values, columns=df_test.columns)

In [38]:
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight

# Standardise the features: the scaler is fitted on the imputed training data only
# and then applied unchanged to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test)

# Encode the target labels as integers. The encoder is fitted on the raw target
# column of `df` rather than on `y_train` itself: `y_train = le.fit_transform(y_train)`
# is self-referential, so re-running this cell would refit the encoder on labels
# that are already encoded and `le.inverse_transform` would stop returning the
# original categories. Reading from `df` makes the cell idempotent.
le = LabelEncoder()
y_train = le.fit_transform(df.iloc[:, -1])

# 'balanced' class weights, one per encoded class, and the matching per-sample weights.
encoded_classes = np.unique(y_train)
class_wts = compute_class_weight(class_weight='balanced', classes=encoded_classes, y=y_train)
# Key the weights by the encoded class values themselves instead of assuming
# they are exactly 0..n-1.
class_wts_dict = dict(zip(encoded_classes, class_wts))
# NOTE(review): sample_wts is not consumed by any cell visible in this notebook
# section -- confirm whether it was meant to be passed to the model's fit call.
sample_wts = compute_sample_weight(class_weight=class_wts_dict, y=y_train)

In [39]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier

# Base learners for the ensemble; every estimator is seeded with random_state=42.
extratrees_model = ExtraTreesClassifier(random_state=42)
randomforest_model = RandomForestClassifier(random_state=42)
# Bagging ensemble whose base estimator is an XGBoost classifier.
bagging_xgboost_model = BaggingClassifier(
    estimator=XGBClassifier(random_state=42),
    random_state=42,
)

# Named members of the hard-voting ensemble, in the order they are registered.
voting_members = [
    ('extratrees', extratrees_model),
    ('bagging_xgboost', bagging_xgboost_model),
    ('randomforest', randomforest_model),
]
model = VotingClassifier(estimators=voting_members, voting='hard')

# Fit the ensemble on the scaled training features and the encoded labels.
# NOTE(review): the per-sample weights computed earlier (`sample_wts`) are not
# passed to fit here -- confirm whether that omission is intentional.
model.fit(X_train_scaled, y_train)

# Predict encoded labels for the test set, then map them back to the original
# label values with the fitted LabelEncoder.
encoded_predictions = model.predict(X_test_scaled)
predictions = le.inverse_transform(encoded_predictions)

In [40]:
# Build the submission table: one row per prediction, with a 1-based `id`
# column placed ahead of the predicted `Category`.
pred_df = pd.DataFrame({'Category': predictions})
pred_df.insert(0, 'id', pred_df.index + 1)

# Write the submission file without the DataFrame index.
pred_df.to_csv('output/voting_strong_again.csv', index=False)