# Modeling



In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import feature_extraction
from sklearn import preprocessing
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier # <- boosting model
from sklearn.pipeline import Pipeline

from src.utils import precission_recall_vs_thr

# Show scikit-learn estimators as diagrams when they are displayed in a cell.
set_config(display="diagram")

# Target column: dropped from the features and used as `y` when loading data.
LABEL_COLUMN = "churned"

# Load Dataset

In [None]:
# Read the pre-split train and test sets from disk.
train = pd.read_csv("../data/users_train.csv")
test = pd.read_csv("../data/users_test.csv")

# Separate the feature columns from the target column for each split.
x_train = train.drop(columns=[LABEL_COLUMN])
y_train = train[LABEL_COLUMN]
x_test = test.drop(columns=[LABEL_COLUMN])
y_test = test[LABEL_COLUMN]

# Boosting models

## Numerical features only
### Transformer

In [None]:
# Event-count features fed to the model in the numeric-only experiment.
NUMERICAL_COLUMNS = [
    "cnt_user_engagement",
    "cnt_level_start_quickplay",
    "cnt_level_end_quickplay",
    "cnt_level_complete_quickplay",
    "cnt_level_reset_quickplay",
    "cnt_post_score",
    "cnt_spend_virtual_currency",
    "cnt_ad_reward",
    "cnt_challenge_a_friend",
    "cnt_completed_5_levels",
    "cnt_use_extra_steps",
]
# Columns explicitly excluded from this experiment. The categorical columns
# are listed here because only numeric features are used at this stage.
# Each column appears exactly once.
IGNORE_COLUMNS = [
    "user_first_engagement",
    "user_pseudo_id",
    "is_enable",
    "bounced",
    "country_name",
    "device_os",
    "device_lang",
]

# Numeric preprocessing: fill missing values with 0, then standardize.
transformer_numeric = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("scaler", preprocessing.StandardScaler()),
    ]
)
# Route each group of columns to its preprocessing step; "drop" discards the
# ignored columns so they never reach the classifier.
transformer_pipeline = ColumnTransformer(
    transformers=[
        ("numeric_features", transformer_numeric, NUMERICAL_COLUMNS),
        ("ignore_features", "drop", IGNORE_COLUMNS),
    ]
)


### Gradient boosting model

In [None]:
# Numeric-only experiment: preprocessing followed by a gradient boosting
# classifier. The seed is fixed so repeated runs give the same model.
gb_model = Pipeline(
    [
        ("preprocessor", transformer_pipeline),
        ("gb_model", GradientBoostingClassifier(random_state=42)),
    ]
)

# Fit on the training split; leaving the call as the last expression makes the
# notebook display the fitted pipeline.
gb_model.fit(x_train, y_train)

### Evaluation

In [None]:
# Select the data split and the model that the evaluation cells below use.
x, y = x_test, y_test
model = gb_model

# Hard class predictions on the selected split, summarized per class.
y_pred = model.predict(x)
print(metrics.classification_report(y_true=y, y_pred=y_pred))

In [None]:
# Confusion matrix of the selected model on the selected split.
# NOTE(review): `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
# removed in 1.2 (replacement: `ConfusionMatrixDisplay.from_estimator`); this
# cell requires an older scikit-learn.
metrics.plot_confusion_matrix(estimator=model, X=x, y_true=y)

In [None]:
# ROC curve of the selected model on the selected split, drawn on a new figure.
# The return value of `plot_roc_curve` is a display object, not an Axes, so it
# is not assigned back to `ax` (this also matches the ROC cell of the full
# model further down).
# NOTE(review): `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed
# in 1.2 (replacement: `RocCurveDisplay.from_estimator`).
f, ax = plt.subplots()
metrics.plot_roc_curve(model, x, y, ax=ax)

In [None]:
# Draw the project helper's plot for the selected model and split on a new figure.
# NOTE(review): `precission_recall_vs_thr` is imported from src.utils; presumably
# it plots precision and recall as a function of the decision threshold —
# confirm against its definition.
f, ax = plt.subplots()
precission_recall_vs_thr(model, x, y, ax=ax)

## Numerical and Categorical features

### Transformer
Building the preprocessing as a pipeline lets us quickly experiment with the effect of the features we use, as well as with the transformations we apply to them.

In [None]:
# Categorical features that are one-hot encoded in this experiment.
CATEGORICAL_COLUMNS = [
    "country_name",
    "device_os",
    "device_lang",
]
# Event-count features (same list as in the numeric-only experiment).
NUMERICAL_COLUMNS = [
    "cnt_user_engagement",
    "cnt_level_start_quickplay",
    "cnt_level_end_quickplay",
    "cnt_level_complete_quickplay",
    "cnt_level_reset_quickplay",
    "cnt_post_score",
    "cnt_spend_virtual_currency",
    "cnt_ad_reward",
    "cnt_challenge_a_friend",
    "cnt_completed_5_levels",
    "cnt_use_extra_steps",
]
# Columns explicitly excluded from the model. "device_lang" is intentionally
# NOT listed here: it is used as a categorical feature above, and a column
# should not be declared both as a feature and as ignored.
IGNORE_COLUMNS = [
    "user_first_engagement",
    "user_pseudo_id",
    "is_enable",
    "bounced",
]

# Numeric preprocessing: fill missing values with 0, then standardize.
transformer_numeric = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("scaler", preprocessing.StandardScaler()),
    ]
)
# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode. handle_unknown="ignore" keeps categories that
# were not seen during fit from raising at prediction time.
# NOTE(review): `sparse=False` was renamed to `sparse_output=False` in
# scikit-learn 1.2 and the old name was removed in 1.4; it is kept here because
# the rest of this notebook targets an older scikit-learn.
transformer_categorical = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("one-hot", preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)
# Route each group of columns to its preprocessing step; "drop" discards the
# ignored columns so they never reach the classifier.
transformer_pipeline = ColumnTransformer(
    transformers=[
        ("numeric_features", transformer_numeric, NUMERICAL_COLUMNS),
        ("categorical_features", transformer_categorical, CATEGORICAL_COLUMNS),
        ("ignore_features", "drop", IGNORE_COLUMNS),
    ]
)

### Modeling

In [None]:
# Full experiment: numeric + categorical preprocessing followed by a gradient
# boosting classifier. The seed is fixed so repeated runs give the same model.
gb_model_full = Pipeline(
    [
        ("preprocessor", transformer_pipeline),
        ("gb_model", GradientBoostingClassifier(random_state=42)),
    ]
)

# Fit on the training split; leaving the call as the last expression makes the
# notebook display the fitted pipeline.
gb_model_full.fit(x_train, y_train)

### Evaluation

In [None]:
# Select the data split and the model that the evaluation cells below use.
x, y = x_test, y_test
model = gb_model_full

# Hard class predictions on the selected split, summarized per class.
y_pred = model.predict(x)
print(metrics.classification_report(y_true=y, y_pred=y_pred))

In [None]:
# Confusion matrix of the selected model on the selected split.
# NOTE(review): `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
# removed in 1.2 (replacement: `ConfusionMatrixDisplay.from_estimator`); this
# cell requires an older scikit-learn.
metrics.plot_confusion_matrix(estimator=model, X=x, y_true=y)

In [None]:
# ROC curve of the selected model on the selected split, drawn on a new figure.
# NOTE(review): `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed
# in 1.2 (replacement: `RocCurveDisplay.from_estimator`).
f, ax = plt.subplots()
metrics.plot_roc_curve(estimator=model, X=x, y=y, ax=ax)

In [None]:
# Draw the project helper's plot for the selected model and split on a new figure.
# NOTE(review): `precission_recall_vs_thr` is imported from src.utils; presumably
# it plots precision and recall as a function of the decision threshold —
# confirm against its definition.
f, ax = plt.subplots()
precission_recall_vs_thr(model, x, y, ax=ax)

## Save model 

In [None]:
# Persist the fitted full pipeline (preprocessing + classifier) with joblib.
# The directory is relative to the current working directory.
model_path = 'models'
# exist_ok=True creates the directory only when it is missing, without the
# race-prone "check, then create" sequence.
os.makedirs(model_path, exist_ok=True)

joblib.dump(gb_model_full, os.path.join(model_path, 'gb_model_full.joblib'))