In [1]:
%load_ext lab_black

In [28]:
import pandas as pd

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

In [None]:
def cross_validate_model(model, X, y):
    result = cross_validate(tree_model, X=features_df, y=y, scoring={
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        "auc": make_scorer(roc_auc_score),
    }, cv=3, n_jobs=-1)
    
    return {score_name: score.mean() for score_name, score in result.items()}

In [3]:
df = pd.read_csv("../data/train.csv").set_index("id")

In [4]:
features_df = (
    df.drop("Response", axis=1)
    .rename(columns=lambda col_name: col_name.lower())
    .assign(male=lambda df: df["gender"] == "Male")
    .drop("gender", axis=1)
    .assign(
        vehicle_damage=lambda df: df["vehicle_damage"] == "Yes",
        region_code=lambda df: df["region_code"].astype("int").astype("category"),
        policy_sales_channel=lambda df: df["policy_sales_channel"]
        .astype("int")
        .astype("category"),
        vehicle_age=lambda df: (
            df["vehicle_age"]
            .replace({"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2})
            .astype("int")
            .astype("category")
        ),
    )
)
y = df["Response"].values

In [5]:
features_df["region_code"].value_counts()

28    106415
8      33877
46     19749
41     18263
15     13308
30     12191
29     11065
50     10243
3       9251
11      9232
36      8797
33      7654
47      7436
35      6942
6       6280
45      5605
37      5501
18      5153
48      4681
14      4678
39      4644
10      4374
21      4266
2       4038
13      4036
7       3279
12      3198
9       3101
27      2823
32      2787
43      2639
17      2617
26      2587
25      2503
24      2415
38      2026
0       2021
16      2007
31      1960
23      1960
20      1935
49      1832
4       1801
34      1664
19      1535
22      1309
40      1295
5       1279
1       1008
44       808
42       591
52       267
51       183
Name: region_code, dtype: int64

In [6]:
# we could remove some features by binning all the sales channels that have less than n observations in the training set.
features_df["policy_sales_channel"].value_counts()

152    134784
26      79700
124     73995
160     21779
156     10661
        ...  
149         1
84          1
123         1
144         1
143         1
Name: policy_sales_channel, Length: 155, dtype: int64

In [7]:
features_df.head()

Unnamed: 0_level_0,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage,male
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,44,1,28,0,2,True,40454.0,26,217,True
2,76,1,3,0,1,False,33536.0,26,183,True
3,47,1,28,0,2,True,38294.0,26,27,True
4,21,1,11,1,0,False,28619.0,152,203,True
5,29,1,41,1,0,False,27496.0,152,39,False


In [8]:
categorical_features = [
    "driving_license",
    "region_code",
    "previously_insured",
    "vehicle_damage",
    "policy_sales_channel",
    "male",
]

numerical_features = ["age", "vehicle_age", "annual_premium", "vintage"]

In [30]:
ct = ColumnTransformer(
    [
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            categorical_features,
        ),
        ("num", "passthrough", numerical_features),
    ]
)

In [31]:
y

array([1, 0, 1, ..., 0, 0, 0])

In [38]:
tree_model = make_pipeline(
    ct,
    DecisionTreeClassifier(class_weight="balanced"),
)

In [33]:
tree_model.fit(features_df, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['driving_license',
                                                   'region_code',
                                                   'previously_insured',
                                                   'vehicle_damage',
                                                   'policy_sales_channel',
                                                   'male']),
                                                 ('num', 'passthrough',
                                                  ['age', 'vehicle_age',
                                                   'annual_premium',
                                                   'vintage'])])),
                ('decisiontreeclassifier', DecisionTreeClassifier())])

In [34]:
y_hat = tree_model.predict(features_df)

In [39]:
%%time
cross_validate_model(tree_model, features_df, y)

CPU times: user 98.4 ms, sys: 32.1 ms, total: 131 ms
Wall time: 38.5 s


{'fit_time': 33.70823184649149,
 'score_time': 0.3386910756429036,
 'test_accuracy': 0.8235570418100356,
 'test_precision': 0.29074928346970885,
 'test_recall': 0.3054592164418754,
 'test_f1': 0.29789669553833503,
 'test_auc': 0.6006929720819842}