# Grid Search

In [None]:
import warnings
import numpy as np 
import pandas as pd

# Uncomment the next line to suppress all Python warnings for this session.
# warnings.filterwarnings('ignore')
# Seed value for the notebook; it only takes effect where it is passed on
# explicitly (e.g. as a `random_state=` argument).
random_state = 42

## Datensatz laden

Quelle: [https://www.kaggle.com/c/porto-seguro-safe-driver-prediction](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction)

In [None]:
# Read the raw training data (path is relative to the notebook's directory).
df = pd.read_csv(filepath_or_buffer='../datasets/safe-driver-prediction.csv')

In [None]:
# Preview the first five rows.
df.head(n=5)

## Metadaten extrahieren

In [None]:
def _infer_level(column, dtype):
    """Infer the measurement level of a column from its name and dtype.

    Name-based rules take precedence over dtype-based ones:

    * name contains 'bin', or the column is the target -> 'binary'
    * name contains 'cat', or the column is the id      -> 'nominal'
    * any floating-point dtype                            -> 'interval'
    * any (signed or unsigned) integer dtype              -> 'ordinal'

    Returns None when no rule applies, so that a column can never silently
    inherit the level that was computed for the previous column.
    """
    if 'bin' in column or column == 'target':
        return 'binary'
    if 'cat' in column or column == 'id':
        return 'nominal'
    if dtype.kind == 'f':
        return 'interval'
    if dtype.kind in ('i', 'u'):
        return 'ordinal'
    return None


# One metadata record per column of df.
data = []
for column in df.columns:
    dtype = df[column].dtype
    data.append({
        'column_name': column,
        # 'target' and 'id' are their own roles; everything else is a model input.
        'role': column if column in ('target', 'id') else 'input',
        'level': _infer_level(column, dtype),
        # Every variable is kept except the id.
        'keep': column != 'id',
        'dtype': dtype,
    })

# Metadata table indexed by column name.
df_meta = pd.DataFrame(
    data, columns=['column_name', 'role', 'level', 'keep', 'dtype']
).set_index('column_name')

## Pipeline definieren

In [None]:
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.pipeline import FeatureUnion # , Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler, Normalizer, StandardScaler, LabelBinarizer, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.under_sampling import RandomUnderSampler

In [None]:
def _cols_by_level(level, exclude=None):
    """Return the names of all kept columns of the given measurement level.

    Parameters
    ----------
    level : str
        Value to match against the 'level' column of ``df_meta``.
    exclude : str, optional
        A single column name to leave out even if it matches.

    Returns
    -------
    numpy.ndarray
        Column names, in the order in which they appear in ``df_meta``.
    """
    mask = (df_meta['level'] == level) & df_meta['keep']
    if exclude is not None:
        mask = mask & (df_meta.index != exclude)
    # The metadata index already holds the column labels, so there is no need
    # to slice the (large) data frame just to read the names back.
    return df_meta.loc[mask].index.values


def binary_cols():
    """Names of the kept binary input columns (the target itself is excluded)."""
    return _cols_by_level('binary', exclude='target')


def nominal_cols():
    """Names of the kept nominal input columns (the id is excluded)."""
    return _cols_by_level('nominal', exclude='id')


def interval_cols():
    """Names of the kept interval-scaled columns."""
    return _cols_by_level('interval')


def ordinal_cols():
    """Names of the kept ordinal columns."""
    return _cols_by_level('ordinal')

In [None]:
def _impute_mapper(columns, strategy):
    """Build a DataFrameMapper that imputes `columns` jointly.

    The value -1 is treated as the missing-value marker and is replaced
    column-wise (axis=0) according to `strategy`.
    """
    imputer = Imputer(missing_values=-1, strategy=strategy, axis=0)
    return DataFrameMapper([(columns, imputer)], input_df=True)


# Binary features: impute with the most frequent value only.
binary_branch = Pipeline([
    ('impute', _impute_mapper(binary_cols(), 'most_frequent')),
])

# Nominal features: one LabelBinarizer per column.
nominal_branch = Pipeline([
    ('label_binarize', DataFrameMapper(
        [(name, LabelBinarizer()) for name in nominal_cols()],
        input_df=True,
    )),
])

# Interval features: mean imputation followed by standardisation.
interval_branch = Pipeline([
    ('impute', _impute_mapper(interval_cols(), 'mean')),
    ('scaler', StandardScaler()),
])

# Ordinal features: most-frequent imputation followed by scaling to [0, 1].
ordinal_branch = Pipeline([
    ('impute', _impute_mapper(ordinal_cols(), 'most_frequent')),
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
])

# The outputs of the four branches are concatenated by the FeatureUnion.
feature_union = FeatureUnion([
    ('binary', binary_branch),
    ('nominal', nominal_branch),
    ('interval', interval_branch),
    ('ordinal', ordinal_branch),
])

# Full pipeline: preprocessing followed by the 'classify' step.
pipe = Pipeline([
    ('union', feature_union),
    ('classify', DecisionTreeClassifier()),
])

In [None]:
# Share of the positive class (target == 1) wanted after undersampling.
desired_apriori = 0.30

# Class counts before resampling.
nb_0 = len(df.loc[df.target == 0].index)
nb_1 = len(df.loc[df.target == 1].index)

# Fraction of the negative class to keep so that
#   nb_1 / (nb_1 + kept_negatives) == desired_apriori.
undersampling_rate = ((1 - desired_apriori) * nb_1) / (nb_0 * desired_apriori)
# Cap at the available count so the request never exceeds the number of
# negatives that are actually present.
undersampled_nb_0 = min(nb_0, int(undersampling_rate * nb_0))

df_X = df.drop('target', axis=1)
df_y = df['target']

# Seed the sampler so the same rows are drawn on every run.
cc = RandomUnderSampler(ratio={0: undersampled_nb_0}, random_state=random_state)
X_cc, y_cc = cc.fit_sample(df_X, df_y.values)

# Re-wrap the sampler output as labelled frames.
df_X = pd.DataFrame(X_cc, columns=df_X.columns)
df_y = pd.DataFrame(y_cc, columns=['target'])

# NOTE(review): `df` is replaced by the undersampled data, and the grid search
# below is fitted on df_X / df_y, so its cross-validation scores are measured
# on the rebalanced class distribution rather than the original one.
df = df_X.join(df_y)

## Suche nach den besten Parametern

### Grid definieren

In [None]:
# Each dict is one sub-grid: the 'classify' entry replaces the pipeline's
# final step, and the 'classify__*' entries are hyper-parameters of that step.
# The estimators are seeded so that repeated searches give the same ranking.
param_grid = [
    {
        'classify': [DecisionTreeClassifier(criterion='gini', class_weight=None,
                                            random_state=random_state)],
        'classify__criterion': ['gini', 'entropy'],
        'classify__class_weight': [None, 'balanced']
    },
    {
        'classify': [RandomForestClassifier(n_estimators=10, criterion='gini', class_weight=None,
                                            n_jobs=-1, random_state=random_state)],
        'classify__n_estimators': [10, 50, 100],
        'classify__criterion': ['gini', 'entropy'],
        'classify__class_weight': [None, 'balanced'],
        # `warm_start` is intentionally not searched: each candidate is fitted
        # from scratch, so toggling it would only double the number of forest
        # candidates without changing what is learned.
    },
]

### Grid Search ausführen

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score

# Metrics recorded for every candidate. `average` could also be 'weighted' or
# None; 'binary' scores the positive class only.
scoring = {
    'precision_score': make_scorer(precision_score, average='binary'),
    'recall_score': make_scorer(recall_score, average='binary'),
    'f1_score': make_scorer(f1_score, average='binary'),
    'accuracy_score': make_scorer(accuracy_score)
}

# Metric that decides which candidate is refitted as the best estimator.
refit_score = 'f1_score'
skf = StratifiedKFold(n_splits=2)

grid = GridSearchCV(pipe, cv=skf, param_grid=param_grid, scoring=scoring, refit=refit_score,
                    return_train_score=True, n_jobs=-1)
# df_y may be a one-column frame; estimators expect a 1-D target, so flatten it
# instead of handing over a column vector.
grid.fit(df_X, np.ravel(df_y));

[Liste der Scoring-Parameter](http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)

### Auswertung

In [None]:
# Best cross-validation score for the metric selected via `refit`.
grid.best_score_ 

In [None]:
# Parameter combination of the best candidate.
grid.best_params_ 

In [None]:
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import classification_report

# Out-of-fold predictions of the best pipeline using two stratified folds.
# The target is flattened to 1-D rather than passed as a one-column frame.
# NOTE(review): this uses the same data the grid search selected its
# parameters on, so the resulting report is likely optimistic.
y_pred = cross_val_predict(grid.best_estimator_, df_X, np.ravel(df_y),
                           cv=StratifiedKFold(n_splits=2), n_jobs=-1)

In [None]:
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports prediction counts as support.
print(classification_report(np.ravel(df_y), y_pred, target_names=['target = 0', 'target = 1']))

In [None]:
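# Recorded output of an earlier run (model selected with f1_score).
# NOTE(review): the supports below (49952 / 22361) do not match the 70 / 30
# class split produced by the undersampling cell, which is consistent with the
# report having been generated by classification_report(y_pred, df_y, ...),
# i.e. with y_true and y_pred swapped. In that case the precision and recall
# columns are interchanged - confirm before quoting these numbers.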

#              precision    recall  f1-score   support

#  target = 0       0.71      0.71      0.71     49952
#  target = 1       0.34      0.33      0.34     22361

# avg / total       0.59      0.60      0.60     72313

In [None]:
import cloudpickle as pickle

# Destination file for the serialised best pipeline.
model_pkl_path = 'model.pkl'

# `pickle` is the alias under which this cell imports cloudpickle.
with open(model_pkl_path, mode='wb') as model_file:
    pickle.dump(grid.best_estimator_, model_file)
    print('Pickled model to "%s"' % (model_pkl_path,))