In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [84]:
# Feature columns excluded from modelling; the same list is applied to the test set.
drop_cols = [
    'Feature 3 (Discrete)',
    'Feature 10',
    'Feature 14',
    'Feature 16',
    'Feature 17',
    'Feature 23 (Discrete)',
]

# Load the training data, remove the excluded columns and replace every
# missing value with the placeholder 1e9.
# NOTE(review): 1e9 is later fed through StandardScaler, where such a large
# placeholder will dominate a column's mean/variance -- confirm this is intended.
df_train = (
    pd.read_csv('data/iith_foml_2023_train.csv')
    .drop(columns=drop_cols)
    .fillna(1e9)
)

In [85]:
# Load the test data and apply exactly the same preprocessing as the training
# frame: same dropped columns, same 1e9 placeholder for missing values.
df_test = (
    pd.read_csv('data/iith_foml_2023_test.csv')
    .drop(columns=drop_cols)
    .fillna(1e9)
)

# X_test is an alias of df_test (same object, no copy).
X_test = df_test

In [86]:
# Number of training rows for each value of the target column.
target_values = df_train["Target Variable (Discrete)"]
class_counts = target_values.value_counts()

In [87]:
# Oversample the rare classes: every class with fewer than 50 training rows is
# resampled (with replacement) to exactly 40 rows, and the sampled rows are
# jittered with Gaussian noise (sigma = 0.1) so the duplicates are not identical.
target_col = "Target Variable (Discrete)"
minority_classes = class_counts[class_counts < 50].index

# Seeded generator so the jitter is the same on every Restart & Run All; the
# unseeded global np.random state produced different rows on each run.
noise_rng = np.random.default_rng(42)

# Position of the label column, used to keep the noise away from the labels.
target_pos = df_train.columns.get_loc(target_col)

# Collect the per-class frames and concatenate once instead of growing a
# DataFrame inside the loop.
oversampled_parts = []
for cls in minority_classes:
    minority_instances = df_train[df_train[target_col] == cls]
    oversampled_instances = minority_instances.sample(n=40, replace=True, random_state=42)

    # NOTE(review): the jitter is applied to every remaining feature column,
    # including the 1e9 missing-value placeholders and any discrete features
    # -- confirm this is intended.
    noise = noise_rng.normal(loc=0, scale=0.1, size=oversampled_instances.shape)
    # Never perturb the class label: zero noise there leaves it exact, so no
    # rounding step is needed to recover it afterwards. Adding 0.0 still turns
    # an integer label column into float, as the noisy version did.
    noise[:, target_pos] = 0.0

    oversampled_parts.append(oversampled_instances + noise)

if oversampled_parts:
    df_oversampled = pd.concat(oversampled_parts)
else:
    # No class is below the threshold: keep an empty frame with the right
    # columns so the following cells still work.
    df_oversampled = df_train.iloc[0:0].copy()

print(df_oversampled[target_col].value_counts())


Target Variable (Discrete)
5.0     40
8.0     40
14.0    40
7.0     40
15.0    40
4.0     40
13.0    40
3.0     40
9.0     40
12.0    40
17.0    40
11.0    40
10.0    40
16.0    40
Name: count, dtype: int64


In [88]:
# SMOTE is imported here and used by the resampling cell further down.
from imblearn.over_sampling import SMOTE

# Build the updated training frame: rows of the classes that were NOT
# oversampled are kept as they are, and the rare classes are represented only
# by their oversampled rows from df_oversampled.
is_minority = df_train["Target Variable (Discrete)"].isin(minority_classes)
majority_instances = df_train[~is_minority]
df_train_updated = pd.concat([majority_instances, df_oversampled], ignore_index=True)

# Check the distribution of classes in the updated DataFrame
print(df_train_updated["Target Variable (Discrete)"].value_counts())


Target Variable (Discrete)
1.0     488
0.0     249
2.0     109
6.0      70
3.0      40
10.0     40
11.0     40
17.0     40
12.0     40
9.0      40
4.0      40
13.0     40
15.0     40
7.0      40
14.0     40
8.0      40
5.0      40
16.0     40
Name: count, dtype: int64


In [89]:
# Split the updated training frame into features and target. The target is
# selected by name so the split does not depend on it being the last column.
target_col = "Target Variable (Discrete)"
X = df_train_updated.drop(columns=[target_col])
y = df_train_updated[target_col]

# Fit the scaler on the training features only, then apply it to the test set.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Transform from df_test rather than from X_test itself, so re-running this
# cell does not scale the already-scaled test array a second time.
X_test = scaler.transform(df_test)

In [90]:
# Resample the scaled training set with SMOTE and rebind X / y to the result.
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X, y)
X, y = X_resampled, y_resampled

In [91]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the model. The seed is fixed (42, matching the sampling and SMOTE
# steps) so repeated runs of the notebook fit the same model.
model = GradientBoostingClassifier(random_state=42)

# Train the model on the resampled training data
model.fit(X, y)

# Predict the class of every row of the scaled test set
predictions = model.predict(X_test)

In [94]:
# Build the submission table in one step: a 1-based row id next to the
# predicted class, both stored as integers, then write it out without the index.
pred_df = pd.DataFrame(
    {
        'id': range(1, len(predictions) + 1),
        'Category': predictions,
    }
).astype(int)
pred_df.to_csv('output/oversample_new_GradientBoostingClassifier.csv', index=False)