In [1076]:
import pandas as pd
# pd.options.display.max_rows = None
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [1077]:
df_train = pd.read_csv('../data/iith_foml_2023_train.csv')
df_corr = df_train.corr()
drop_cols = ['Feature 3 (Discrete)', 'Feature 10', 'Feature 14', 'Feature 16', 'Feature 17', 'Feature 23 (Discrete)']
# display(df_corr.where(df_corr > 0.75))
df_train.drop(columns=drop_cols, inplace=True)
# for column in df_train.columns:
#     if df_train[column].isna().any():
#         mean_value = df_train[column].mode()[0]
#         df_train[column].fillna(mean_value, inplace=True)
df_train.fillna(1e9, inplace=True)


In [1078]:
class_counts = df_train["Target Variable (Discrete)"].value_counts()
print(class_counts)


Target Variable (Discrete)
1     488
0     249
2     109
6      70
5      41
8       7
14      5
7       5
15      4
4       3
13      3
3       3
9       2
12      1
17      1
11      1
10      1
16      1
Name: count, dtype: int64


In [1079]:
import numpy as np
import pandas as pd

# Assuming class_counts is defined somewhere in your code
minority_classes = class_counts[class_counts < 50].index

df_oversampled = pd.DataFrame()

for cls in minority_classes:
    minority_instances = df_train[df_train["Target Variable (Discrete)"] == cls]
    oversampled_instances = minority_instances.sample(n=40, replace=True, random_state=42)

    # Add Gaussian noise
    noise = np.random.normal(loc=0, scale=0.1, size=oversampled_instances.shape)
    oversampled_instances_with_noise = oversampled_instances + noise


    df_oversampled = pd.concat([df_oversampled, oversampled_instances_with_noise])
    
df_oversampled["Target Variable (Discrete)"] = df_oversampled["Target Variable (Discrete)"].round()
print(df_oversampled["Target Variable (Discrete)"].value_counts())


Target Variable (Discrete)
5.0     40
8.0     40
14.0    40
7.0     40
15.0    40
4.0     40
13.0    40
3.0     40
9.0     40
12.0    40
17.0    40
11.0    40
10.0    40
16.0    40
Name: count, dtype: int64


In [1080]:
# Assuming df_train is your original dataframe
from imblearn.over_sampling import SMOTE

# Separate majority and minority classes in df_train
majority_instances = df_train[df_train["Target Variable (Discrete)"].isin(minority_classes) == False]

majority_instances = df_train[df_train["Target Variable (Discrete)"].isin(minority_classes) == False]
df_train_updated = pd.concat([majority_instances, df_oversampled], ignore_index=True)


# Check the distribution of classes in the updated DataFrame
print(df_train_updated["Target Variable (Discrete)"].value_counts())


Target Variable (Discrete)
1.0     488
0.0     249
2.0     109
6.0      70
3.0      40
10.0     40
11.0     40
17.0     40
12.0     40
9.0      40
4.0      40
13.0     40
15.0     40
7.0      40
14.0     40
8.0      40
5.0      40
16.0     40
Name: count, dtype: int64


In [1081]:
X = df_train_updated.iloc[:, :-1]
y = df_train_updated.iloc[:, -1]
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [1082]:
X_train, val_x, y_train, val_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [1083]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [1084]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

def report_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='macro', zero_division=0.0)

# model = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, weights='distance')
model = GradientBoostingClassifier()

cv_f1_score = cross_val_score( model, X_train_resampled, y_train_resampled, scoring=make_scorer(report_f1),cv=2,)
print(cv_f1_score)

model.fit(X_train_resampled, y_train_resampled)

[0.9912716  0.99476784]


In [1085]:
y_val_pred = model.predict(val_x)
macro_f1 = f1_score(val_y, y_val_pred, average='macro')

print(f'Macro F1 Score: {macro_f1}')

Macro F1 Score: 0.9399045404641291
