In [1]:
%load_ext lab_black

In [2]:
import os

import pandas as pd

from sklearn.base import clone

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

import joblib

In [3]:
ARTIFACTS_DIRECTORY = os.path.join("..", "artifacts")
MAX_DEPTH_GRID_SEARCH_FILE_NAME = "max_depth_grid_search.joblib"
CCP_ALPHA_GRID_SEARCH_FILE_NAME = "ccp_alpha_grid_search.joblib"

In [4]:
MAX_DEPTH_GRID_SEARCH_FILE_PATH = os.path.join(
    ARTIFACTS_DIRECTORY, MAX_DEPTH_GRID_SEARCH_FILE_NAME
)
CCP_ALPHA_GRID_SEARCH_FILE_NAME = os.path.join(
    ARTIFACTS_DIRECTORY, CCP_ALPHA_GRID_SEARCH_FILE_NAME
)

In [5]:
os.makedirs(ARTIFACTS_DIRECTORY, exist_ok=True)

In [6]:
def clean_features_df(df):
    return (
        df.rename(columns=lambda col_name: col_name.lower())
        .assign(
            male=lambda df: df["gender"] == "Male",
            driving_license=lambda df: df["driving_license"] == 1,
            region_code=lambda df: df["region_code"].astype("int").astype("str"),
            previously_insured=lambda df: df["previously_insured"] == 1,
            vehicle_damage=lambda df: df["vehicle_damage"] == "Yes",
            policy_sales_channel=lambda df: df["policy_sales_channel"]
            .astype("int")
            .astype("str"),
            vehicle_age=lambda df: df["vehicle_age"].replace(
                {"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2}
            ),
        )
        .drop("gender", axis=1)
    )

In [7]:
def test_model(trained_model, X_test, y_test):

    y_pred = trained_model.predict(X_test)
    y_pred_proba = trained_model.predict_proba(X_test)

    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "auc": roc_auc_score(y_test, y_pred_proba[:, 1]),
    }

In [8]:
df = pd.read_csv("../data/train.csv").set_index("id")

In [9]:
features_df = clean_features_df(df.drop("Response", axis=1))

In [10]:
response_series = df["Response"]
response = response_series.values

In [11]:
X_train, X_test, y_train, y_test = train_test_split(features_df, response)

In [12]:
policy_sales_channel_value_counts = X_train["policy_sales_channel"].value_counts()

In [13]:
relevant_sales_policies = policy_sales_channel_value_counts[
    policy_sales_channel_value_counts > 1000
].unique()

In [14]:
region_code = ["region_code"]
policy_sales_channel = ["policy_sales_channel"]
other_categorical_features = [
    "driving_license",
    "previously_insured",
    "vehicle_damage",
    "male",
]


numerical_features = ["age", "vehicle_age", "annual_premium", "vintage"]

In [15]:
ct = ColumnTransformer(
    [
        (
            "cat",
            OneHotEncoder(),
            other_categorical_features,
        ),
        (
            "region_code",
            OneHotEncoder(handle_unknown="ignore"),
            region_code,
        ),
        (
            "policy_sales_channel",
            OneHotEncoder(
                categories=[relevant_sales_policies],
                handle_unknown="ignore",
            ),
            policy_sales_channel,
        ),
        ("num", "passthrough", numerical_features),
    ]
)

In [33]:
# tree_pipeline = make_pipeline(ct, DecisionTreeClassifier(class_weight="balanced"))
tree_pipeline = make_pipeline(ct, DecisionTreeClassifier())

In [34]:
%%time
full_tree_pipeline = clone(tree_pipeline).fit(X_train, y_train)

CPU times: user 20.5 s, sys: 15.7 ms, total: 20.5 s
Wall time: 20.5 s


In [35]:
test_model(full_tree_pipeline, X_test, y_test)

{'accuracy': 0.8224039127605534,
 'precision': 0.28486622753517116,
 'recall': 0.3012037833190026,
 'f1': 0.29280728883687884,
 'auc': 0.5983672479887056}

In [36]:
max_depth = full_tree_pipeline[-1].get_depth()

In [37]:
%%time
ccp_alphas = full_tree_pipeline[-1].cost_complexity_pruning_path(X_train, y_train)["ccp_alphas"]

CPU times: user 6.2 s, sys: 3.93 ms, total: 6.21 s
Wall time: 6.2 s


In [38]:
max_depth

83

In [39]:
ccp_alphas.shape

(13170,)

In [40]:
full_tree_pipeline.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  ['driving_license',
                                                   'previously_insured',
                                                   'vehicle_damage', 'male']),
                                                 ('region_code',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['region_code']),
                                                 ('policy_sales_channel',
                                                  OneHotEncoder(categories=[array([101004,  59852,  55498,  16344,   8039,   7386,   5021,   4530,
         2908,   2148,   1406,   1398,   1174,   1134,   1083])],
                                                                handle_unknown='ignore'),
                             

In [41]:
grid_search_cv_params = {
    "estimator": full_tree_pipeline,
    "scoring": make_scorer(roc_auc_score),
    "n_jobs": -1,
    "cv": 2,
}

In [42]:
max_depth_grid_search = GridSearchCV(
    **grid_search_cv_params,
    param_grid=ParameterGrid(
        {
            "decisiontreeclassifier__max_depth": [
                [max_depth] for max_depth in range(1, max_depth)
            ]
        }
    )
)

In [43]:
%%time
max_depth_grid_search.fit(X_train, y_train);

CPU times: user 1min, sys: 229 ms, total: 1min
Wall time: 6min 33s


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(),
                                                                         ['driving_license',
                                                                          'previously_insured',
                                                                          'vehicle_damage',
                                                                          'male']),
                                                                        ('region_code',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['region_code']),
                                                                        ('policy_sales_channe

In [44]:
max_depth_grid_search.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  ['driving_license',
                                                   'previously_insured',
                                                   'vehicle_damage', 'male']),
                                                 ('region_code',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['region_code']),
                                                 ('policy_sales_channel',
                                                  OneHotEncoder(categories=[array([101004,  59852,  55498,  16344,   8039,   7386,   5021,   4530,
         2908,   2148,   1406,   1398,   1174,   1134,   1083])],
                                                                handle_unknown='ignore'),
                                                  ['policy_sales_c

In [45]:
test_model(max_depth_grid_search.best_estimator_, X_test, y_test)

{'accuracy': 0.8218476458363946,
 'precision': 0.2809476963436629,
 'recall': 0.2946689595872743,
 'f1': 0.2876447876447876,
 'auc': 0.5963016802466183}

In [29]:
%%time
accuracy_score(y_true=y_train, y_pred=tree_model.predict(X_train))

NameError: name 'tree_model' is not defined

In [30]:
(13117 * 2 / 0.712) / 60 / 60 / 4

2.558715667915106