# Pipeline

In [1]:
import warnings
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Fixed seed value; presumably meant for the stochastic steps further down --
# TODO confirm it is actually passed to them.
random_state = 42

## Datensatz laden

Quelle: [https://www.kaggle.com/c/porto-seguro-safe-driver-prediction](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction)

In [2]:
# Load the raw dataset from the CSV file in the sibling datasets directory.
dataset_path = '../datasets/safe-driver-prediction.csv'
df = pd.read_csv(dataset_path)

In [3]:
# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


## Metadaten extrahieren

In [4]:
def _infer_level(column, dtype):
    """Return the measurement level ('binary', 'nominal', 'interval', 'ordinal') of a column.

    The column name takes precedence: names containing 'bin' (and the target)
    are binary, names containing 'cat' (and the id) are nominal. Otherwise the
    dtype decides: any float dtype is interval, any integer dtype is ordinal.
    Every remaining dtype (e.g. object) falls back to 'nominal' so that the
    level is always defined instead of leaking the previous column's value.
    """
    if 'bin' in column or column == 'target':
        return 'binary'
    if 'cat' in column or column == 'id':
        return 'nominal'
    if pd.api.types.is_float_dtype(dtype):
        return 'interval'
    if pd.api.types.is_integer_dtype(dtype):
        return 'ordinal'
    # Fallback for dtypes that are neither float nor integer.
    return 'nominal'


# Build one metadata record per column of df.
data = []
for column in df.columns:
    # Defining the data type
    dtype = df[column].dtype

    # Defining the role
    if column == 'target':
        role = 'target'
    elif column == 'id':
        role = 'id'
    else:
        role = 'input'

    # Defining the level
    level = _infer_level(column, dtype)

    # Keep every variable except the id
    keep = column != 'id'

    # Creating a Dict that contains all the metadata for the variable
    column_dict = {
        'column_name': column,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype
    }
    data.append(column_dict)

# One row per column, indexed by the column name.
df_meta = pd.DataFrame(
    data, columns=['column_name', 'role', 'level', 'keep', 'dtype']
).set_index('column_name')

## Pipeline definieren

In [5]:
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.pipeline import FeatureUnion # , Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler, Normalizer, StandardScaler, LabelBinarizer, FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.under_sampling import RandomUnderSampler

  from numpy.core.umath_tests import inner1d


In [6]:
def binary_cols():
    """Return the names of the kept binary columns, excluding the target.

    Reads the notebook-level ``df_meta`` and ``df`` frames and returns a
    NumPy array of column names.
    """
    is_selected = (
        (df_meta['level'] == 'binary')
        & df_meta['keep']
        & (df_meta.index != 'target')
    )
    selected_names = df_meta.loc[is_selected].index
    return df[selected_names].columns.values

def nominal_cols():
    """Return the names of the kept nominal columns, excluding the id.

    Reads the notebook-level ``df_meta`` and ``df`` frames and returns a
    NumPy array of column names.
    """
    is_selected = (
        (df_meta['level'] == 'nominal')
        & df_meta['keep']
        & (df_meta.index != 'id')
    )
    selected_names = df_meta.loc[is_selected].index
    return df[selected_names].columns.values

def interval_cols():
    """Return the names of the kept interval-level columns.

    Reads the notebook-level ``df_meta`` and ``df`` frames and returns a
    NumPy array of column names.
    """
    is_selected = (df_meta['level'] == 'interval') & df_meta['keep']
    selected_names = df_meta.loc[is_selected].index
    return df[selected_names].columns.values

def ordinal_cols():
    """Return the names of the kept ordinal-level columns.

    Reads the notebook-level ``df_meta`` and ``df`` frames and returns a
    NumPy array of column names.
    """
    is_selected = (df_meta['level'] == 'ordinal') & df_meta['keep']
    selected_names = df_meta.loc[is_selected].index
    return df[selected_names].columns.values

In [7]:
# Preprocess each measurement level in its own branch, concatenate the
# resulting feature blocks with a FeatureUnion and fit a decision tree on top.
# All imputers treat -1 as the missing-value marker.
pipe = Pipeline([
    ('union',  FeatureUnion([
        # Binary columns: fill missing entries with the most frequent value.
        ('binary', Pipeline([
            ('impute', DataFrameMapper([
                (binary_cols(), Imputer(missing_values=-1, strategy='most_frequent', axis=0))
            ], input_df=True))
        ])),

        # Nominal columns: binarize the labels of every column separately.
        ('nominal', Pipeline([
            ('label_binarize', DataFrameMapper(
                [(c, LabelBinarizer()) for c in nominal_cols()]
            , input_df=True))
        ])),

        # Interval columns: mean imputation followed by standardization.
        ('interval', Pipeline([
            ('impute', DataFrameMapper([
                (interval_cols(), Imputer(missing_values=-1, strategy='mean', axis=0))
            ], input_df=True)),
            ('scaler', StandardScaler())
        ])),

        # Ordinal columns: most-frequent imputation, then rescale to [0, 1].
        ('ordinal', Pipeline([
            ('impute', DataFrameMapper([
                (ordinal_cols(), Imputer(missing_values=-1, strategy='most_frequent', axis=0))
            ], input_df=True)),
            ('scaler', MinMaxScaler(feature_range=(0, 1)))
        ])),
    ])),
    # Seed the tree with the notebook-wide random_state so that repeated runs
    # give the same model and the same predictions.
    ('classify', DecisionTreeClassifier(random_state=random_state))
])

## Pipeline testen

In [8]:
# Target share of the positive class (target == 1) after undersampling.
desired_apriori = 0.30

# Class counts before resampling.
nb_0 = len(df.loc[df.target == 0].index)
nb_1 = len(df.loc[df.target == 1].index)

# Fraction of class-0 rows to keep so that class 1 makes up desired_apriori
# of the resampled data.
undersampling_rate = ((1 - desired_apriori) * nb_1) / (nb_0 * desired_apriori)
# Never request more class-0 rows than exist: an undersampler cannot create
# samples, so the count is capped at nb_0.
undersampled_nb_0 = min(nb_0, int(undersampling_rate * nb_0))

df_X = df.drop('target', axis=1)
df_y = df['target']

# Seeded with the notebook-wide random_state so the drawn subset is reproducible.
cc = RandomUnderSampler(ratio={0: undersampled_nb_0}, random_state=random_state)
X_cc, y_cc = cc.fit_sample(df_X, df_y)

# Rebuild labelled frames from the resampled data and restore the original
# column dtypes (an array round-trip would collapse them to a common dtype).
df_X = pd.DataFrame(X_cc, columns=df_X.columns).astype(df_X.dtypes.to_dict())
df_y = pd.DataFrame(y_cc, columns=['target'])

# Replace df with the undersampled data; later cells read it through df.
df = df_X.join(df_y)

In [9]:
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import classification_report

# Out-of-fold predictions from a two-fold stratified cross-validation.
# The target is flattened to 1-D because scikit-learn expects y of shape
# (n_samples,); np.ravel accepts a Series as well as a single-column frame.
y_pred = cross_val_predict(pipe, df_X, np.ravel(df_y), cv=StratifiedKFold(2), n_jobs=-1)

In [10]:
# Per-class precision, recall and F1 of the cross-validated predictions.
class_labels = ['target = 0', 'target = 1']
report = classification_report(df_y, y_pred, target_names=class_labels)
print(report)

             precision    recall  f1-score   support

 target = 0       0.71      0.70      0.71     50619
 target = 1       0.33      0.35      0.34     21694

avg / total       0.60      0.59      0.60     72313

