In [None]:
# !kaggle competitions download -c just-the-basics-the-after-party

In [None]:
# from zipfile import ZipFile

In [None]:
# with ZipFile('just-the-basics-the-after-party.zip', 'r') as zipObj:
#     zipObj.extractall()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sns.set_theme()

In [None]:
train_df = pd.read_csv('../input/just-the-basics-the-after-party/train.csv')
train_df

In [None]:
# Relabel every column by its position ("0", "1", ...) in a single assignment.
# Renaming label-by-label in a loop copies the frame once per column and can
# merge columns when an existing label already equals an earlier positional name.
train_df.columns = [str(position) for position in range(train_df.shape[1])]

In [None]:
# Attach the labels as the `target` column. The whole frame returned by read_csv
# is assigned to a single column, so this presumably relies on train_labels.csv
# holding exactly one column whose rows line up with train.csv — TODO confirm.
train_df['target'] = pd.read_csv('../input/just-the-basics-the-after-party/train_labels.csv')

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.corr()['target'].sort_values(ascending=False)

# EDA

In [None]:
# Draw the pairwise correlation matrix of all columns on a large square canvas.
heatmap_ax = plt.figure(figsize=(20, 20)).gca()
sns.heatmap(data=train_df.corr(), ax=heatmap_ax)

In [None]:
train_df.isna().sum().sum()

In [None]:
sns.countplot(data=train_df, x='target')

In [None]:
sns.histplot(data=train_df, x='0')

In [None]:
sns.histplot(data=train_df, x='1')

In [None]:
sns.pairplot(data=train_df.iloc[:, :10])

In [None]:
sns.pairplot(data=train_df.iloc[:, 10:20])

In [None]:
sns.pairplot(data=train_df.iloc[:, 20:30])

In [None]:
sns.pairplot(data=train_df.iloc[:, 30:40])

In [None]:
sns.pairplot(data=train_df.iloc[:, 40:50])

In [None]:
sns.pairplot(data=train_df.iloc[:, 50:60])

In [None]:
sns.pairplot(data=train_df.iloc[:, 60:70])

In [None]:
sns.pairplot(data=train_df.iloc[:, 70:80])

In [None]:
sns.pairplot(data=train_df.iloc[:, 80:90])

In [None]:
sns.pairplot(data=train_df.iloc[:, 90:100])

## EDA results:
- There is no high correlation between any feature and the target

# Data preprocessing

In [None]:
mean = {}

In [None]:
def clean_data(df, test=False):
    """Impute missing values column by column with stored column means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    test : bool, default False
        When False, the mean of every column of ``df`` is computed and stored
        in the module-level ``mean`` dict before imputing. When True, the means
        already stored in ``mean`` are reused, so ``df`` is imputed with
        statistics learned from an earlier call.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with NaNs replaced.

    Raises
    ------
    KeyError
        If ``test`` is True and ``df`` has a column with no stored mean.
    """
    global mean
    if not test:
        for col in df.columns:
            mean[col] = df[col].mean()
    for col in df.columns:
        # Fill only this column: a frame-wide fillna would replace every NaN in
        # the whole frame with the first column's mean on the first iteration.
        df[col] = df[col].fillna(mean[col])
    return df

In [None]:
train_df = clean_data(train_df)

In [None]:
train_df.isna().sum().sum()

In [None]:
test_df = pd.read_csv('../input/just-the-basics-the-after-party/test.csv')
test_df

In [None]:
test_df.isna().sum().sum()

In [None]:
# The training frame was relabelled positionally ("0", "1", ...) before its
# column means were stored, so give the test columns the same labels.
# NOTE(review): assumes test.csv lists the features in the same order as train.csv — confirm.
test_df.columns = [str(position) for position in range(test_df.shape[1])]
# test=True reuses the means stored by the training call instead of
# re-learning them from the test data and overwriting the stored values.
test_df = clean_data(test_df, test=True)

In [None]:
test_df.isna().sum().sum()

# Model defining

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier, XGBRFClassifier
from typing import Union

In [None]:
# Separate the label column from the feature matrix.
Y = train_df['target']
X = train_df.drop('target', axis=1)

In [None]:
X.shape, Y.shape

In [None]:
# Fixed seed for the randomised ensemble learners so that cross-validated
# scores (and therefore the model that looks best) are the same on every run.
RANDOM_STATE = 42

# Candidate estimators, keyed by the name used in `param_grid`, `models` and `scores`.
algorithms = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=RANDOM_STATE),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'XGBClassifier': XGBClassifier(random_state=RANDOM_STATE),
    'XGBRFClassifier': XGBRFClassifier(random_state=RANDOM_STATE)
}

# Filled by `train`: fitted search object (or scaler + search pipeline) per algorithm.
models = {}

# Hyper-parameter grid searched for each algorithm; keys must match `algorithms`.
param_grid = {
    'LogisticRegression': {
        'C': [1.0, 3.0, 5.0]
    },
    'SVC': {
        'C': [1.0, 3.0, 5.0],
        'kernel': ['poly', 'rbf', 'sigmoid']
    },
    'AdaBoostClassifier': {
        'n_estimators': [100, 300],
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 300],
        'max_depth': [3, 5, 7]
    },
    'RandomForestClassifier': {
        'n_estimators': [100, 300],
        'max_depth': [3, 5, 7]
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 6, 10]
    },
    'XGBClassifier': {
        'n_estimators': [100, 300],
        'max_depth': [3, 5, 7]
    },
    'XGBRFClassifier': {
        'n_estimators': [100, 300],
        'max_depth': [3, 5, 7]
    }
}

# One row per algorithm, holding the best mean cross-validated ROC AUC.
scores = pd.DataFrame(data={'AUC': []})

In [None]:
# def calculate_performance(model, scores, algorithm=""):
#     global X, Y
#     Y_pred = model.predict(X)
#     acc = accuracy_score(Y, Y_pred)
#     prec = precision_score(Y, Y_pred)
#     rec = recall_score(Y, Y_pred)
#     f1 = f1_score(Y, Y_pred)
#     scores.loc[algorithm, 'Accuracy'] = acc
#     scores.loc[algorithm, 'Precision'] = prec
#     scores.loc[algorithm, 'Recall'] = rec
#     scores.loc[algorithm, 'F1'] = f1

In [None]:
def train(scaler: str = 'none'):
    """Grid-search every algorithm in ``algorithms`` and record its best ROC AUC.

    For each entry a ``GridSearchCV`` (10-fold, ``scoring='roc_auc'``) is built
    from the matching ``param_grid`` entry, optionally placed behind a feature
    scaler, and fitted on the module-level ``X`` / ``Y``. The fitted object is
    stored in ``models[name]`` and the best cross-validated score is written to
    ``scores.loc[name, "AUC"]``.

    Parameters
    ----------
    scaler : {'none', 'Standard', 'MinMax'}, default 'none'
        'none' stores the bare ``GridSearchCV``; 'Standard' / 'MinMax' store a
        pipeline of ``StandardScaler`` / ``MinMaxScaler`` followed by the search
        (reachable as ``models[name]['gridsearchcv']``).

    Raises
    ------
    ValueError
        If ``scaler`` is not one of the supported names (previously such a
        value made the function return without training anything).
    """
    scaler_classes = {
        'none': None,
        'Standard': StandardScaler,
        'MinMax': MinMaxScaler,
    }
    if scaler not in scaler_classes:
        raise ValueError(
            f"Unknown scaler {scaler!r}; expected one of {list(scaler_classes)}"
        )
    scaler_class = scaler_classes[scaler]

    for index, model in algorithms.items():
        search = GridSearchCV(
            estimator=model,
            param_grid=param_grid[index],
            cv=10,
            scoring='roc_auc',
            refit=True,
        )
        if scaler_class is None:
            models[index] = search
        else:
            # NOTE: the scaler sits outside the search, so it is fitted on all of
            # X before the folds are split; CV scores are slightly optimistic.
            models[index] = make_pipeline(scaler_class(), search)
        models[index].fit(X, Y)
        # The pipeline fits its final step in place, so `search` is the fitted object.
        scores.loc[index, "AUC"] = search.best_score_
        print(index)

In [None]:
train('Standard')

In [None]:
scores.sort_values(by=['AUC'], ascending=False)

In [None]:
X_test = test_df

In [None]:
# Predict through the complete fitted object (scaler + refit best estimator) so
# the test features get the same scaling the model was trained on; calling
# `best_estimator_` directly would feed it unscaled data.
# The columns are relabelled to the training feature names because the fitted
# scaler checks the names it saw during fit.
# NOTE(review): assumes test.csv has the same features, in the same order, as X — confirm.
# NOTE(review): the search was scored with ROC AUC; if the competition is scored
# the same way, predict_proba may be the intended output — confirm.
Y_test = models['GradientBoostingClassifier'].predict(X_test.set_axis(X.columns, axis=1))

In [None]:
Y_test.shape

In [None]:
solution = pd.DataFrame({0: Y_test})

In [None]:
solution

In [None]:
solution.to_csv('submission.csv', sep=',', index=False)