# Workflow

In [1]:
import warnings
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide settings: one fixed seed value for later cells, and all
# warnings silenced so they do not clutter the cell outputs.
random_state = 42
warnings.simplefilter('ignore')

## Datensatz laden

Quelle: [https://www.kaggle.com/c/porto-seguro-safe-driver-prediction](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction)

In [2]:
# Read the safe-driver data set from the local datasets folder.
dataset_path = '../datasets/safe-driver-prediction.csv'
df = pd.read_csv(dataset_path)

# Übersicht

In [3]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


# Metadaten extrahieren

In [4]:
def _column_role(column):
    """Return the modelling role of a column: 'target', 'id' or 'input'."""
    if column in ('target', 'id'):
        return column
    return 'input'


def _column_level(column, dtype):
    """Return the measurement level of a column.

    Name-based rules are checked first ('bin' / target -> binary,
    'cat' / id -> nominal); otherwise the dtype decides
    (float -> interval, integer -> ordinal).

    Raises:
        ValueError: if neither the name nor the dtype determines a level.
            Without this, such a column would silently reuse the level of
            the column processed before it.
    """
    if 'bin' in column or column == 'target':
        return 'binary'
    if 'cat' in column or column == 'id':
        return 'nominal'
    if pd.api.types.is_float_dtype(dtype):
        return 'interval'
    if pd.api.types.is_integer_dtype(dtype):
        return 'ordinal'
    raise ValueError(
        'Cannot derive a level for column {!r} with dtype {}'.format(column, dtype)
    )


# One metadata record per column; every column except the id is kept initially.
data = [
    {
        'column_name': column,
        'role': _column_role(column),
        'level': _column_level(column, df[column].dtype),
        'keep': column != 'id',
        'dtype': df[column].dtype,
    }
    for column in df.columns
]

# Metadata table indexed by column name; later cells toggle 'keep' on it.
df_meta = pd.DataFrame(data, columns=['column_name', 'role', 'level', 'keep', 'dtype'])
df_meta.set_index('column_name', inplace=True)

# Vorverarbeitung (Pre-Processing)

## Fehlende Werte

* Kategorische Attribute
 * **ps_car_03_cat** & **ps_car_05_cat** enthalten mehr als 50% fehlende Werte ==> entfernen
 * Bei den anderen Attributen kann -1 als eigene Kategorie gewertet werden
* **ps_reg_03** (continuous): Werte werden mit "mean" ersetzt
* **ps_car_11** (ordinal): Werte werden mit "most_frequent" ersetzt
* **ps_car_12** (continuous): Werte werden mit "mean" ersetzt
* **ps_car_14** (continuous): Werte werden mit "mean" ersetzt

In [5]:
# The two categorical columns listed above as mostly missing are flagged as
# not kept in the metadata and removed from the data frame.
drop_list = ['ps_car_03_cat', 'ps_car_05_cat']
df_meta.loc[drop_list, 'keep'] = False
df.drop(columns=drop_list, inplace=True)

In [6]:
from sklearn.preprocessing import Imputer

# Both imputers treat -1 as the missing-value marker and work column-wise.
mean_imp = Imputer(missing_values=-1, strategy='mean', axis=0)
mode_imp = Imputer(missing_values=-1, strategy='most_frequent', axis=0)

# Continuous columns: replace -1 by the column mean (refit per column).
for column_name in ('ps_reg_03', 'ps_car_12', 'ps_car_14'):
    df[column_name] = mean_imp.fit_transform(df[[column_name]])

# Ordinal column: replace -1 by the most frequent value.
df['ps_car_11'] = mode_imp.fit_transform(df[['ps_car_11']])

In [7]:
# Total number of NaN cells in the frame (values encoded as -1 are not NaN
# and therefore not counted here).
df.isnull().values.sum()

0

## Resampling

[Resampling Strategies](https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets)

Wie in der Übersicht gezeigt, ist der Anteil der Datensätze mit target = 1 weit geringer als der Anteil mit target = 0. Dies kann zu einem Modell führen, das eine hohe Genauigkeit aufweist, aber in der Praxis keine guten Resultate liefert. Zwei mögliche Strategien, um mit diesem Problem umzugehen, sind:

* Oversampling der Datensätze mit target = 1
* Undersampling der Datensätze mit target = 0

Da wir ein größeres Trainingsset haben, können wir uns für Undersampling entscheiden.

In [8]:
from imblearn.under_sampling import RandomUnderSampler

# Share of target = 1 rows the data set should have after undersampling.
desired_apriori = 0.30

# Class counts before resampling.
nb_0 = int((df.target == 0).sum())
nb_1 = int((df.target == 1).sum())

# Fraction of target = 0 rows to keep so that target = 1 reaches the desired
# share, and the resulting absolute number of target = 0 rows.
undersampling_rate = ((1 - desired_apriori) * nb_1) / (nb_0 * desired_apriori)
undersampled_nb_0 = int(undersampling_rate * nb_0)

df_X = df.drop('target', axis=1)
df_y = df['target']

# Seed the sampler with the notebook-wide random_state; without it a different
# subset of target = 0 rows is drawn on every run and nothing downstream is
# reproducible.
cc = RandomUnderSampler(ratio={0: undersampled_nb_0}, random_state=random_state)
X_cc, y_cc = cc.fit_sample(df_X, df_y)

# Rebuild labelled data frames from the resampler output and merge them back
# into a single frame.
df_X = pd.DataFrame(X_cc, columns=df_X.columns)
df_y = pd.DataFrame(y_cc, columns=['target'])

df = df_X.join(df_y)

print('Datensätze mit target = 0 vor dem Undersampling: {}'.format(nb_0))
print('Datensätze mit target = 0 nach dem Undersampling: {}'.format(len(df.loc[df.target == 0].index)))

  from numpy.core.umath_tests import inner1d


Datensätze mit target = 0 vor dem Undersampling: 573518
Datensätze mit target = 0 nach dem Undersampling: 50619


## Feature Extraction

### Dummy-Attribute erstellen

In [9]:
from sklearn.preprocessing import LabelBinarizer

# All nominal columns that are still marked as kept.
query = df_meta[(df_meta.level == 'nominal') & (df_meta.keep)].index

lb = LabelBinarizer()

# Collect one indicator frame per nominal column and attach them all at once.
dummy_frames = []
for column in query.values:
    # Columns with at most two distinct values get no dummy columns.
    if len(df[column].unique()) <= 2:
        continue
    encoded = lb.fit_transform(df[column].values)
    dummy_names = ['{}_{}'.format(column, c) for c in lb.classes_]
    df_bin = pd.DataFrame(encoded, columns=dummy_names)
    dummy_frames.append(df_bin)

df = pd.concat([df] + dummy_frames, axis=1)

In [10]:
# Register the generated dummy columns in the metadata and retire the nominal
# columns they replace.
data = []
for org_column in query.values:
    dummy_columns = df.columns[df.columns.str.startswith(org_column + '_')]
    if len(dummy_columns) == 0:
        # No dummy columns exist for this attribute (the dummy step skips
        # columns with at most two distinct values), so the original column
        # must stay in the feature set instead of being dropped silently.
        continue
    for lb_column in dummy_columns:
        data.append({
            'column_name': lb_column,
            'role': 'input',
            'level': 'binary',
            'keep': True,
            'dtype': df[lb_column].dtype
        })
    # The original nominal column is now represented by its dummies.
    df_meta.loc[org_column, 'keep'] = False

df_meta_tmp = pd.DataFrame(data, columns=['column_name', 'role', 'level', 'keep', 'dtype'])
df_meta_tmp.set_index('column_name', inplace=True)

# pd.concat instead of DataFrame.append, which newer pandas releases no longer
# provide; the resulting frame is the same.
df_meta = pd.concat([df_meta, df_meta_tmp])

### "Interaction"-Attribute erstellen

In [11]:
from sklearn.preprocessing import PolynomialFeatures

# Interval-scaled columns that are still marked as kept.
query = df_meta[(df_meta.level == 'interval') & (df_meta.keep)].index

# Degree-2 expansion without bias term: originals, squares and pairwise products.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

poly_values = poly.fit_transform(df[query])
poly_names = poly.get_feature_names(query)

# Keep only the newly generated terms; the original columns are already in df.
interactions = pd.DataFrame(poly_values, columns=poly_names).drop(query, axis=1)

df = df.join(interactions)

In [12]:
# One metadata record per generated interaction column.
data = [
    {
        'column_name': column,
        'role': 'input',
        'level': 'interval',
        'keep': True,
        'dtype': interactions[column].dtype
    }
    for column in interactions.columns
]

df_meta_tmp = pd.DataFrame(data, columns=['column_name', 'role', 'level', 'keep', 'dtype'])
df_meta_tmp.set_index('column_name', inplace=True)

# pd.concat instead of DataFrame.append, which newer pandas releases no longer
# provide; the resulting frame is the same.
df_meta = pd.concat([df_meta, df_meta_tmp])

## Feature Selection

### Entfernen von Attributen mit geringer oder keiner Varianz

In [13]:
from sklearn.feature_selection import VarianceThreshold

# Fit the variance filter on all columns except the id and the target.
candidate_features = df.drop(['id', 'target'], axis=1)

selector = VarianceThreshold(threshold=.01)
selector.fit(candidate_features)

# Invert the support mask to obtain the columns below the threshold.
f = np.logical_not

v = candidate_features.columns[f(selector.get_support())]
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))

### Skalierung

In [15]:
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

# Interval and ordinal columns that are still marked as kept.
is_numeric_level = df_meta.level.isin(['interval', 'ordinal'])
query = df_meta[is_numeric_level & (df_meta.keep)].index

scaler = StandardScaler()

# Standardise a copy of the selected columns ...
unscaled = df[query].copy()
df_tmp = pd.DataFrame(scaler.fit_transform(unscaled), columns=unscaled.columns)

# ... and swap the scaled versions in for the originals.
df = df.drop(df_tmp.columns, axis=1).join(df_tmp)

### Attribute auswählen mit Hilfe eines Random Forest 

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Candidate features: every column still marked as kept, minus the target.
query = df_meta[(df_meta.keep)].index

df_X = df[query].drop(['target'], axis=1)
df_y = df['target']

# Seed the forest with the notebook-wide random_state so the set of selected
# features is the same on every run.
clf = RandomForestClassifier(random_state=random_state)
clf = clf.fit(df_X, df_y)

model = SelectFromModel(clf, prefit=True)
selected = model.get_support()

# Record the rejected features in the metadata as well. Otherwise later cells
# that select columns via df_meta.keep ask df for columns that were just removed.
df_meta.loc[df_X.columns[~selected], 'keep'] = False

# Reduce df to the selected features plus id and target.
df = pd.concat([df_X.loc[:, selected], df.loc[:, ['id', 'target']]], axis=1, sort=False)

# Aufteilung in Trainings- und Testdaten (Sampling)

In [17]:
from sklearn.model_selection import train_test_split

query = df_meta[(df_meta.keep)].index

# Use only kept columns that actually exist in df: the feature selection step
# shrinks df, and indexing df with labels it no longer has raises a KeyError.
feature_columns = query[query.isin(df.columns)].drop('target')

df_X = df[feature_columns]
df_y = df['target']

# Hold out a third of the rows; seeded for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.33, random_state=random_state)

# Trainieren des Modells

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold

# Seed the tree with the notebook-wide random_state; an unseeded tree breaks
# ties between equally good splits randomly, so the score would vary per run.
clf = DecisionTreeClassifier(random_state=random_state)

# Out-of-fold predictions from a 2-fold stratified cross-validation over all
# rows of df_X / df_y, using all available cores.
y_pred = cross_val_predict(clf, df_X, df_y, cv=StratifiedKFold(2), n_jobs=-1)

print('Accuracy: {:.2f}'.format(accuracy_score(df_y, y_pred)))

Accuracy: 0.60


In [19]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the cross-validated predictions.
class_labels = ['target = 0', 'target = 1']
report = classification_report(df_y, y_pred, target_names=class_labels)
print(report)

             precision    recall  f1-score   support

 target = 0       0.71      0.71      0.71     50619
 target = 1       0.33      0.34      0.33     21694

avg / total       0.60      0.60      0.60     72313

