<a href="https://colab.research.google.com/github/eddielin0926/kaggle/blob/main/notebooks/digit-recognizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Digit Recognizer](https://www.kaggle.com/competitions/digit-recognizer)
Learn computer vision fundamentals with the famous MNIST data


## Preparation

In [1]:
! pip install --upgrade pip
! pip install --upgrade kaggle flaml
! pip install git+https://github.com/eddielin0926/myutils.git

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0
Collecting kaggle
  Downloading kaggle-1.6.6.tar.gz (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.6/84.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting flaml
  Downloading FLAML-2.1.1-py3-none-any.whl.metadata (15 kB)
Downloading FLAML-2.1.1-py3-none-any.whl (295 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.2/295.2 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
 

Set the environment variables for Kaggle authentication from Colab Secrets.

In [2]:
import os
from google.colab import userdata

# Copy each Kaggle credential from Colab Secrets into the process environment
# under the same name.
for secret_name in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
    os.environ[secret_name] = userdata.get(secret_name)

Set competition name.

In [3]:
# Competition slug; interpolated into the kaggle CLI commands and the log file name.
COMPETITION = 'digit-recognizer'

Download competition dataset.

In [4]:
# Make sure the data/ directory exists, download the competition archive with the
# kaggle CLI (it lands in the current working directory), then unpack it into
# data/, overwriting any files left from a previous run (-o).
! mkdir -p data
! kaggle competitions download -c {COMPETITION}
! unzip -o {COMPETITION}.zip -d data

Downloading digit-recognizer.zip to /content
 65% 10.0M/15.3M [00:00<00:00, 39.3MB/s]
100% 15.3M/15.3M [00:00<00:00, 54.8MB/s]
Archive:  digit-recognizer.zip
  inflating: data/sample_submission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          


Import libraries.

In [5]:
# Data handling.
import pandas as pd
import numpy as np

# Helper package installed from GitHub in the first cell; used for `myutils.overview`.
import myutils

# Plotting; render matplotlib figures inline in the notebook.
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()  # apply seaborn's default plot styling

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

### Datasets

Read the training and test datasets.

In [6]:
# Load the two CSV files that the download step extracted into ./data.
train_path, test_path = './data/train.csv', './data/test.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

### Parameters

In [7]:
# Column names of the submission file: row identifier and predicted digit.
ID, TARGET = "ImageId", "Label"

## Exploratory Data Analysis

### Train Dataset

In [8]:
# Show the frame's dimensions, then the per-column summary table.
display(df_train.shape, myutils.overview(df_train))

(42000, 785)

Unnamed: 0,Column,Dtype,Null,Non-Null,Count,Unique,Mode,Mean,Std,Overview,Sample
0,label,int64,0,42000,42000,10,1,4.456643,2.887730,"[0.0, 2.0, 4.0, 7.0, 9.0]",1
1,pixel0,int64,0,42000,42000,1,0,0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
2,pixel1,int64,0,42000,42000,1,0,0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
3,pixel2,int64,0,42000,42000,1,0,0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
4,pixel3,int64,0,42000,42000,1,0,0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
...,...,...,...,...,...,...,...,...,...,...,...
780,pixel779,int64,0,42000,42000,3,0,0.002857,0.414264,"[0.0, 0.0, 0.0, 0.0, 62.0]",0
781,pixel780,int64,0,42000,42000,1,0,0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
782,pixel781,int64,0,42000,42000,1,0,0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
783,pixel782,int64,0,42000,42000,1,0,0.000000,0.000000,"[0.0, 0.0, 0.0, 0.0, 0.0]",0


### Test Dataset

In [9]:
# Show the frame's dimensions, then the per-column summary table.
display(df_test.shape, myutils.overview(df_test))

(28000, 784)

Unnamed: 0,Column,Dtype,Null,Non-Null,Count,Unique,Mode,Mean,Std,Overview,Sample
0,pixel0,int64,0,28000,28000,1,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
1,pixel1,int64,0,28000,28000,1,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
2,pixel2,int64,0,28000,28000,1,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
3,pixel3,int64,0,28000,28000,1,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
4,pixel4,int64,0,28000,28000,1,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
...,...,...,...,...,...,...,...,...,...,...,...
779,pixel779,int64,0,28000,28000,1,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
780,pixel780,int64,0,28000,28000,1,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
781,pixel781,int64,0,28000,28000,1,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0
782,pixel782,int64,0,28000,28000,1,0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0]",0


## Data Preprocessing

### Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

# Columns to label-encode. The list is empty here, so the loop body never runs.
encode_cols = []
le = LabelEncoder()
for name in encode_cols:
    # Fit the mapping on the training column, then apply that same mapping to test.
    df_train[name] = le.fit_transform(df_train[name])
    df_test[name] = le.transform(df_test[name])
    print('Encoding:', name, le.classes_)

In [None]:
# Columns to exclude before modelling (none at the moment).
exclude_cols = []

# train.csv calls its target column `label`, whereas every later cell addresses it
# as TARGET ("Label"). Rename it here so that `train[TARGET]` resolves; if the
# column is absent the rename is a silent no-op.
train = pd.get_dummies(
    df_train.drop(columns=exclude_cols).rename(columns={"label": TARGET})
)
test = pd.get_dummies(df_test.drop(columns=exclude_cols))

In [None]:
# Minimum absolute Pearson correlation with the target for a feature to be kept.
CORR_THRESHOLD = 0.2

# Correlate each column with the target once, rather than building the full
# column-by-column correlation matrix just to read a single column from it.
target_corr = train.corrwith(train[TARGET])

# Constant columns yield NaN, and `NaN < threshold` is False, so they are kept.
# The target correlates 1.0 with itself and is therefore never dropped.
drop_cols = target_corr.index[target_corr.abs() < CORR_THRESHOLD].tolist()
print('Dropping cols:', drop_cols)

train = train.drop(columns=drop_cols)
# Only drop from test what test actually contains.
test = test.drop(columns=[name for name in drop_cols if name in test.columns])

In [None]:
# Heatmap of the pairwise correlations between the remaining columns,
# on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    train.corr(),
    ax=ax,
    cmap='viridis',
    vmin=-1,
    vmax=1,
    annot=False,
    annot_kws={'size': 8},
)

## Training

In [None]:
from flaml import AutoML
from sklearn.ensemble import GradientBoostingClassifier

automl = AutoML()
automl_settings = {
    "task": "classification",
    "time_budget": 120,  # seconds of search
    "log_file_name": f"{COMPETITION}.log",
    "estimator_list": ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree'],
    # Ten-class problem: optimise accuracy. "ap" (average precision) is a
    # binary-classification metric.
    "metric": "accuracy",
    "ensemble": {
        # The stacker's final estimator must be a classifier for a classification task.
        "final_estimator": GradientBoostingClassifier(),
        "passthrough": True,
    },
}
# Use the TARGET constant so the label column matches the name the rest of the notebook uses.
automl.fit(dataframe=train, label=TARGET, **automl_settings)

In [None]:
from pprint import pprint

print('Best ML learner:', automl.best_estimator)
# This scores the fitted model on the same rows it was trained on, so it is a
# training score rather than a validation score.
print('Score on training data:', automl.score(train.drop(columns=[TARGET]), train[TARGET]))
print('Best hyperparameter config:')
pprint(automl.best_config)

In [None]:
from flaml.automl.data import get_output_from_log

# Replay the search log using the same budget the search was configured with,
# so the two values cannot drift apart.
time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
    get_output_from_log(
        filename=automl_settings["log_file_name"],
        time_budget=automl_settings["time_budget"],
    )

plt.title("Learning Curve")
plt.xlabel("Wall Clock Time (s)")
# The plotted value is 1 - loss, which is an accuracy only when the search metric is accuracy.
plt.ylabel("Validation Accuracy")
plt.step(time_history, 1 - np.array(best_valid_loss_history), where="post")
plt.show()

## Submission

In [None]:
predicted = automl.predict(test)

# test.csv contains only pixel columns, so there is no ID column to copy.
# Fall back to 1-based row numbers, which is the numbering sample_submission.csv
# is expected to use (verify against data/sample_submission.csv).
if ID in df_test.columns:
    image_ids = df_test[ID]
else:
    image_ids = np.arange(1, len(df_test) + 1)

submission = pd.DataFrame({ID: image_ids, TARGET: predicted})
submission.to_csv('./submission.csv', index=False)
submission.head()

In [None]:
# Upload submission.csv to the competition, with a short description message (-m).
! kaggle competitions submit -c {COMPETITION} -f submission.csv -m "First try"

In [None]:
# List the submissions made to the competition, to check the result of the upload.
! kaggle competitions submissions {COMPETITION}

## Reference