<a href="https://colab.research.google.com/github/eddielin0926/kaggle/blob/main/notebooks/spaceship-titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Competition](https://www.kaggle.com/competitions)


## Preparation

In [None]:
! pip install --upgrade pip
! pip install --upgrade kaggle flaml
! pip install git+https://github.com/eddielin0926/myutils.git

Set the environment variables for Kaggle authentication from Colab Secrets.

In [None]:
import os
from google.colab import userdata

# Copy each Kaggle credential from Colab Secrets into the process environment
# under the same name.
for secret_name in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
    os.environ[secret_name] = userdata.get(secret_name)

Set competition name.

In [None]:
# Competition identifier interpolated into the `kaggle competitions ...` shell
# commands (`-c {COMPETITION}`) and into the AutoML log file name.
# NOTE(review): "competition" looks like a template placeholder — replace it
# with the real competition slug before running.
COMPETITION = "competition"

Download competition dataset.

In [None]:
# Create the target directory (no error if it already exists).
! mkdir -p data
# Download the competition archive with the Kaggle CLI.
! kaggle competitions download -c {COMPETITION}
# Extract the archive into ./data, overwriting existing files without prompting.
! unzip -o {COMPETITION}.zip -d data

Import libraries.

In [None]:
import pandas as pd
import numpy as np

import myutils

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings("ignore")

### Datasets

Read the training and testing datasets.

In [None]:
# Load both splits extracted into ./data by the download step.
df_train, df_test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

### Parameters

In [None]:
# Column names used throughout the notebook:
#   ID     - identifier column copied from the test set into the submission
#   TARGET - label column to predict
ID, TARGET = "id", "target"

## Exploratory Data Analysis

### Train Dataset

In [None]:
# Show the (rows, columns) shape followed by the myutils overview of the training set.
display(df_train.shape, myutils.overview(df_train))

### Test Dataset

In [None]:
# Show the (rows, columns) shape followed by the myutils overview of the test set.
display(df_test.shape, myutils.overview(df_test))

## Data Preprocessing

### Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to label-encode. Empty by default, in which case the loop is a no-op.
encode_cols = []
le = LabelEncoder()
for col in encode_cols:
    # Learn the mapping from the training column only, then apply that same
    # mapping to both splits so the integer codes agree.
    le.fit(df_train[col])
    df_train[col] = le.transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
    print('Encoding:', col, le.classes_)

In [None]:
# One-hot encode the remaining non-numeric columns. The empty lists passed to
# `drop` are the place to list columns to exclude before encoding.
train = pd.get_dummies(df_train.drop([], axis=1))
test = pd.get_dummies(df_test.drop([], axis=1))

# get_dummies runs independently on each frame, so a category that occurs in
# only one split produces a column the other split lacks. Re-index test onto
# train's feature columns (everything except the label): indicators missing
# from test are added as all-zero, indicators unknown to train are discarded,
# and the column order matches the training frame.
feature_cols = train.columns.drop(TARGET, errors='ignore')
test = test.reindex(columns=feature_cols, fill_value=0)

In [None]:
# Minimum absolute correlation with the target for a column to be kept.
MIN_ABS_CORR = 0.2

# Compute the correlation matrix once and keep only the target's column.
target_corr = train.corr()[TARGET]

# Columns whose |correlation| is below the threshold. NaN correlations compare
# as False and are therefore kept; the target itself correlates 1.0 with
# itself and is never selected.
drop_cols = target_corr.index[target_corr.abs() < MIN_ABS_CORR].tolist()
print('Dropping cols:', drop_cols)

train = train.drop(columns=drop_cols)
# test may lack some of these columns, so skip the missing ones.
test = test.drop(columns=drop_cols, errors='ignore')

In [None]:
# Square canvas for the pairwise-correlation heatmap of the remaining columns,
# with the colour scale pinned to the full [-1, 1] correlation range.
fig, ax = plt.subplots(figsize=(10, 10))
heatmap_options = dict(annot=False, annot_kws={'size': 8}, vmin=-1, vmax=1, cmap='viridis')
sns.heatmap(train.corr(), ax=ax, **heatmap_options)

## Training

In [None]:
from flaml import AutoML
from sklearn.ensemble import GradientBoostingClassifier

automl = AutoML()
automl_settings = {
    "task": "classification",
    "time_budget": 120,  # search budget in seconds
    "log_file_name": f"{COMPETITION}.log",
    "estimator_list": ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree'],
    "metric": "ap",
    # Stack the searched learners. The task is classification, so the final
    # estimator must be a classifier as well.
    "ensemble": {
        "final_estimator": GradientBoostingClassifier(),
        "passthrough": True,
    },
}
# Use the shared TARGET constant so the label column matches the one used by
# the preprocessing and submission cells.
automl.fit(dataframe=train, label=TARGET, **automl_settings)

In [None]:
from pprint import pprint

# Split the training frame into features and label for scoring. The score
# below is therefore measured on the data the model was fitted on.
X_train, y_train = train.drop([TARGET], axis=1), train[TARGET]

print('Best ML leaner:', automl.best_estimator)
print("Best score:", automl.score(X_train, y_train))
print('Best hyperparmeter config:')
pprint(automl.best_config)

In [None]:
from flaml.automl.data import get_output_from_log

# Read the search history back from the AutoML log. The time budget is taken
# from the settings used for training so the two values cannot drift apart.
time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
    get_output_from_log(
        filename=automl_settings["log_file_name"],
        time_budget=automl_settings["time_budget"],
    )

plt.title("Learning Curve")
plt.xlabel("Wall Clock Time (s)")
# The plotted quantity is 1 - loss for the configured metric, so label the
# axis with that metric instead of a hard-coded "Accuracy".
plt.ylabel(f"Validation {automl_settings['metric']} (1 - loss)")
plt.step(time_history, 1 - np.array(best_valid_loss_history), where="post")
plt.show()

## Submission

In [None]:
# Predict on the preprocessed test features.
predicted = automl.predict(test)

# Pair each prediction with its identifier from the raw test set and write
# the result without the index column.
submission_columns = {ID: df_test[ID], TARGET: predicted}
submission = pd.DataFrame(submission_columns)
submission.to_csv('./submission.csv', index=False)
submission.head()

In [None]:
# Upload submission.csv to the competition with the message "First try".
! kaggle competitions submit -c {COMPETITION} -f submission.csv -m "First try"

In [None]:
# List the submissions made to this competition.
! kaggle competitions submissions {COMPETITION}

## Reference