In [None]:
# Development convenience: pick up edits to imported modules without restarting the kernel
%reload_ext autoreload
%autoreload 2

In [None]:
#| default_exp deep_learning

# Deep learning

> Apply deep learning to tabular data, using fastai and PyTorch.

In [None]:
#|hide
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import mean_squared_error as mse
import pandas as pd

## Create `Learner`
A key part of fastai is creating a `Learner`. This involves several steps, including separating categorical and continuous variables, and splitting the DataFrame into training and validation sets.

In [None]:
#|export
def create_learner(df, # Dataframe to analyse
                   dep_var, # Name of target variable in Dataframe
                   task, # "classification"; any other value is handled as regression
                   max_card=9000, # Cardinality threshold forwarded to `cont_cat_split`
                   layers=(500,250), # Hidden layer sizes forwarded to `tabular_learner`
                   valid_pct=0.2, # Fraction of rows held out for validation
                   seed=None): # Seed for the random split; None leaves the split unseeded
    "Create a fastai Learner to train"
    # Sort the feature columns into continuous and categorical names; `dep_var` is
    # passed along so the target column is identified to `cont_cat_split`.
    cont, cat = cont_cat_split(df, max_card=max_card, dep_var=dep_var)
    procs = [Categorify, FillMissing, Normalize]
    # Only classification forces a categorical target block; for every other task
    # the choice of block is left to `TabularPandas` (y_block=None).
    y_block = CategoryBlock() if task=="classification" else None
    # Random train/validation split over row positions; pass `seed` to make it repeatable.
    splits = RandomSplitter(valid_pct=valid_pct, seed=seed)(range_of(df))
    to = TabularPandas(df, procs=procs, cat_names=cat, cont_names=cont,
                       y_names=dep_var, y_block=y_block, splits=splits)
    dls = to.dataloaders()
    # `list(...)` lets callers supply any sequence while the default stays immutable.
    return tabular_learner(dls, layers=list(layers))

## Create, train and validate model
We can do this in one line of code with `validate`, which returns ROC AUC and accuracy for classification tasks, or RMSE for regression tasks:

In [None]:
#|export
def validate(df, # Dataframe to analyse
             dep_var, # Name of target variable in Dataframe
             task, # "classification"; any other value is handled as regression
             epochs=3, # Number of epochs passed to `fit_one_cycle`
             lr=1e-2): # Learning rate passed to `fit_one_cycle`
    "Train a Learner, then return (ROC AUC, accuracy) for classification or RMSE for regression"
    learn = create_learner(df, dep_var, task)
    # Fit quietly: no progress bar and no per-epoch log output.
    with learn.no_bar(), learn.no_logging(): learn.fit_one_cycle(epochs, lr)
    preds,targs = learn.get_preds()
    # Convert once to NumPy. Targets are flattened so sklearn receives 1-D arrays
    # rather than (n, 1) column vectors.
    scores = preds.detach().cpu().numpy()
    y_true = targs.detach().cpu().numpy().ravel()
    if task == "classification":
        if scores.shape[1] == 2:
            # Binary case: score with the second column (index 1), as before.
            roc_auc = roc_auc_score(y_true, scores[:,1])
        else:
            # Multi-class case: sklearn needs the full score matrix and an explicit
            # averaging scheme; a single column would raise.
            roc_auc = roc_auc_score(y_true, scores, multi_class="ovr")
        # Vectorised argmax replaces the per-row Python loop; sklearn's argument
        # order is (y_true, y_pred).
        accuracy = accuracy_score(y_true, scores.argmax(axis=1))
        return roc_auc, accuracy
    # Regression: root of the mean squared error between targets and predictions.
    else: return np.sqrt(mse(y_true, scores.ravel()))

We can use these functions to easily get close to SOTA on the infamous Titanic dataset:

In [None]:
# Fetch the Titanic passengers table, shuffle the rows, and discard the
# identifier-like columns in a single chained expression.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = (
    pd.read_csv(url)
    .sample(frac=1)  # shuffle dataset
    .drop(columns=['PassengerId', 'Ticket', 'Name'])
)

Once we have a pandas DataFrame, we can train and evaluate a model in one line of code with `validate`:

In [None]:
# For task="classification", `validate` returns a (ROC AUC, accuracy) pair.
roc_auc, accuracy = validate(df, dep_var='Survived', task="classification")
# Accuracy is scaled by 100 here so it is reported as a percentage.
print(f"ROC AUC = {np.round(roc_auc, 4)}, Accuracy = {np.round(100*accuracy, 2)}%")

ROC AUC = 0.8198, Accuracy = 66.29%


We can do the same thing for a regression dataset:

In [None]:
# Load the Boston housing table and shuffle its rows in one step
url = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
boston_df = pd.read_csv(url).sample(frac=1)

# Train on `medv` as a regression target and report the resulting RMSE
rmse = validate(boston_df, dep_var="medv", task="regression")
print(f"RMSE = {np.round(rmse, 2)}")

RMSE = 21.59000015258789
