In [1]:
%load_ext lab_black

In [2]:
import pandas as pd

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

In [3]:
def clean_features_df(df):
    return (
        df.rename(columns=lambda col_name: col_name.lower())
        .assign(
            male=lambda df: df["gender"] == "Male",
            driving_license=lambda df: df["driving_license"] == 1,
            region_code=lambda df: df["region_code"].astype("int").astype("str"),
            previously_insured=lambda df: df["previously_insured"] == 1,
            vehicle_damage=lambda df: df["vehicle_damage"] == "Yes",
            policy_sales_channel=lambda df: df["policy_sales_channel"]
            .astype("int")
            .astype("str"),
            vehicle_age=lambda df: df["vehicle_age"].replace(
                {"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2}
            ),
        )
        .drop("gender", axis=1)
    )

In [4]:
def cross_validate_model(model, X, y):
    result = cross_validate(
        tree_model,
        X=features_df,
        y=y,
        scoring={
            "accuracy": make_scorer(accuracy_score),
            "precision": make_scorer(precision_score),
            "recall": make_scorer(recall_score),
            "f1": make_scorer(f1_score),
            "auc": make_scorer(roc_auc_score),
        },
        cv=3,
        n_jobs=-1,
    )

    return {score_name: score.mean() for score_name, score in result.items()}

In [5]:
df = pd.read_csv("../data/train.csv").set_index("id")

In [6]:
features_df = clean_features_df(df.drop("Response", axis=1))

In [7]:
response_series = df["Response"]
response = response_series.values

In [8]:
response_series.value_counts()

0    334399
1     46710
Name: Response, dtype: int64

In [9]:
46710 / (334399 + 46710)

0.12256336113815208

In [10]:
features_df.dtypes

age                       int64
driving_license            bool
region_code              object
previously_insured         bool
vehicle_age               int64
vehicle_damage             bool
annual_premium          float64
policy_sales_channel     object
vintage                   int64
male                       bool
dtype: object

In [11]:
features_df["driving_license"].value_counts()

True     380297
False       812
Name: driving_license, dtype: int64

In [12]:
features_df["previously_insured"].value_counts()

False    206481
True     174628
Name: previously_insured, dtype: int64

In [13]:
features_df["vehicle_damage"].value_counts()

True     192413
False    188696
Name: vehicle_damage, dtype: int64

In [14]:
features_df["male"].value_counts()

True     206089
False    175020
Name: male, dtype: int64

In [15]:
features_df["region_code"].unique().shape

(53,)

In [16]:
features_df["policy_sales_channel"].unique().shape

(155,)

In [17]:
features_df["region_code"].value_counts()

28    106415
8      33877
46     19749
41     18263
15     13308
30     12191
29     11065
50     10243
3       9251
11      9232
36      8797
33      7654
47      7436
35      6942
6       6280
45      5605
37      5501
18      5153
48      4681
14      4678
39      4644
10      4374
21      4266
2       4038
13      4036
7       3279
12      3198
9       3101
27      2823
32      2787
43      2639
17      2617
26      2587
25      2503
24      2415
38      2026
0       2021
16      2007
23      1960
31      1960
20      1935
49      1832
4       1801
34      1664
19      1535
22      1309
40      1295
5       1279
1       1008
44       808
42       591
52       267
51       183
Name: region_code, dtype: int64

In [18]:
policy_sales_channel_value_counts = features_df["policy_sales_channel"].value_counts()

In [19]:
relevant_sales_policies = policy_sales_channel_value_counts[
    policy_sales_channel_value_counts > 1000
].unique()

In [20]:
features_df.head()

Unnamed: 0_level_0,age,driving_license,region_code,previously_insured,vehicle_age,vehicle_damage,annual_premium,policy_sales_channel,vintage,male
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,44,True,28,False,2,True,40454.0,26,217,True
2,76,True,3,False,1,False,33536.0,26,183,True
3,47,True,28,False,2,True,38294.0,26,27,True
4,21,True,11,True,0,False,28619.0,152,203,True
5,29,True,41,True,0,False,27496.0,152,39,False


In [21]:
region_code = ["region_code"]
policy_sales_channel = ["policy_sales_channel"]
other_categorical_features = [
    "driving_license",
    "previously_insured",
    "vehicle_damage",
    "male",
]


numerical_features = ["age", "vehicle_age", "annual_premium", "vintage"]

In [22]:
relevant_sales_policies

array([134784,  79700,  73995,  21779,  10661,   9930,   6684,   5993,
         3885,   2893,   1865,   1848,   1598,   1515,   1410,   1264,
         1234,   1203,   1074,   1055,   1026])

In [23]:
ct = ColumnTransformer(
    [
        (
            "cat",
            OneHotEncoder(),
            other_categorical_features,
        ),
        (
            "region_code",
            OneHotEncoder(handle_unknown="ignore"),
            region_code,
        ),
        (
            "policy_sales_channel",
            OneHotEncoder(
                categories=[relevant_sales_policies],
                handle_unknown="ignore",
            ),
            policy_sales_channel,
        ),
        ("num", "passthrough", numerical_features),
    ]
)

In [24]:
tree_model = make_pipeline(ct, DecisionTreeClassifier())

In [25]:
%%time
cross_validate_model(tree_model, features_df, response)

CPU times: user 940 ms, sys: 174 ms, total: 1.11 s
Wall time: 27.4 s


{'fit_time': 23.42039163907369,
 'score_time': 0.3933785756429036,
 'test_accuracy': 0.8229850272963325,
 'test_precision': 0.2854492158728215,
 'test_recall': 0.295546992078784,
 'test_f1': 0.2904055421701895,
 'test_auc': 0.5961031869451677}