In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

import os
# Show every file available under the Kaggle input mount.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Single seed reused by the train/validation split and the model.
seed = 47

/kaggle/input/tabular-playground-series-apr-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv
/kaggle/input/tabular-playground-series-apr-2022/train.csv
/kaggle/input/tabular-playground-series-apr-2022/test.csv


In [2]:
def evaluate_model(model, x, y):
    """Score a fitted classifier with ROC AUC.

    Args:
        model: Object exposing ``predict_proba(x)`` that returns a 2-D array;
            column 1 is used as the score.
        x: Feature matrix passed to ``model.predict_proba``.
        y: Ground-truth labels passed to ``roc_auc_score``.

    Returns:
        dict with a single key ``'auc_roc_curve'`` holding the ROC AUC value.
    """
    positive_scores = model.predict_proba(x)[:, 1]
    return {'auc_roc_curve': roc_auc_score(y, positive_scores)}

# Tabular Playground Series - Apr 2022 - XGBoost Classifier

In [3]:
# Raw training rows (one row per sequence/step) ...
train_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2022/train.csv'
)
# ... and the label table, which is later joined on its 'sequence' column.
labels_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv'
)

In [4]:
# Collapse every sequence to the mean of its remaining columns and attach the label.
# The result is built in a new frame and `train_df` is left untouched, so this cell
# can be re-run safely (an in-place drop raises KeyError on the second run because
# 'subject' and 'step' are already gone).
train_features = (
    train_df
    .drop(columns=['subject', 'step'])
    .groupby(['sequence'])
    .mean()
    .join(labels_df.set_index('sequence'), on='sequence')
)

# `train_features` is rebuilt on every run, so popping the target from it is safe.
y_train = train_features.pop('state').values
x_train = train_features.values

In [5]:
# Hold out the last 20% of rows for validation. With shuffle=False the original row
# order is kept, so random_state has no influence on the split.
# NOTE: this rebinds x_train / y_train to the 80% training part; re-running the cell
# without re-running the previous one shrinks the training set again.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=seed,
    shuffle=False,
)

In [6]:
# Booster hyper-parameters forwarded to XGBClassifier.
params = {'n_estimators': 100,
          'max_depth': 24,
          'subsample': 1.0,
          'eta': 0.2,
          'colsample_bytree': 1.0,
          'gamma': 0.0,
          'min_child_weight': 1,
          'reg_alpha': 1}

# No early stopping is configured: fit() below receives no eval_set, and a patience
# of 200 rounds could never trigger within n_estimators=100 anyway. Passing
# early_stopping_rounds without a validation set is silently ignored by older
# XGBoost releases and raises at fit time in newer ones.
# NOTE(review): tree_method='gpu_hist' / predictor='gpu_predictor' are deprecated in
# XGBoost 2.x in favour of device='cuda' - confirm the installed version before changing.
model = XGBClassifier(**params,
                      objective='binary:logistic',
                      random_state=seed,
                      tree_method='gpu_hist',
                      predictor='gpu_predictor',
                      use_label_encoder=False,
                      verbosity=0)
model.fit(x_train, y_train, verbose=True)

# Score on the held-out validation split.
score = evaluate_model(model, x_test, y_test)
print(score)

{'auc_roc_curve': 0.7764327555183542}


# Submission

In [7]:
# Build the test features the same way as the training ones: drop 'subject' and
# 'step', then average the remaining columns per sequence.
test_df = (
    pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/test.csv')
    .drop(columns=['subject', 'step'])
    .groupby(['sequence'])
    .mean()
)

# NOTE: from here on x_test holds the competition test features, no longer the
# validation split used for scoring above.
x_test = test_df.values

In [8]:
# Pair the sequence ids from the sample submission with column 1 of predict_proba,
# row by row, and preview the result indexed like test_df.
submission_df = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/sample_submission.csv')
sequence = submission_df['sequence'].to_numpy()
state = model.predict_proba(x_test)[:, 1]
submission_df = pd.DataFrame(
    data={'sequence': sequence, 'state': state},
    index=test_df.index,
)
submission_df.head()

Unnamed: 0_level_0,sequence,state
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1
25968,25968,0.889715
25969,25969,0.586699
25970,25970,0.640355
25971,25971,0.454876
25972,25972,0.953395


In [9]:
# Start from the sample-submission template and fill its 'state' column with the
# model's predictions. Without this assignment the frame written to disk would
# still contain the template's placeholder values.
submission_df = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/sample_submission.csv')

# Column 1 of predict_proba; rows of x_test follow test_df.index (x_test = test_df.values).
state = model.predict_proba(x_test)[:, 1]

# Align predictions to the template by sequence id rather than by row position.
submission_df['state'] = submission_df['sequence'].map(
    pd.Series(state, index=test_df.index)
)
assert submission_df['state'].notna().all(), "some sequences have no prediction"

In [10]:
# Write the submission file; the DataFrame index is not written as a column.
submission_df.to_csv(
    'submission.csv',
    index=False,
)