In [10]:
import trace 
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import inspect


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier


In [35]:
# Column names applied to the header-less german.data file, in file order.
# The first ten entries form split 1 and the rest form split 2 further down.
col_header = [
    'status_of_existing_account',
    'duration_in_month',
    'credit_his',
    'purpose',
    'credit_amt',
    'saving_account',
    'preset_emp',
    'installment_rate',
    'personal_status_and_sex',
    'guarantors',
    'present_residence',
    'property',
    'age',
    'other_installment_plans',
    'housing',
    'No_existing_accounts',
    'job',
    'No_liable_people',
    'telephone',
    'foreign_worker',
    'label',
]


In [41]:
# Load the raw space-separated file; column names are taken from col_header
# rather than from the file itself.
german = pd.read_csv(
    '../data/german.data',
    sep=' ',
    names=col_header,
)

In [39]:
# Persist the frame with its header row. The row index is written too
# (pandas' default, spelled out here), so the file has a leading unnamed column.
german.to_csv('../data/german_titled.csv', index=True)

In [42]:
# Copy the row index into an explicit 'identifier' column (mutates `german` in
# place). Both split frames below carry this column and german_pipeline_normal
# merges on it.
german['identifier'] = german.index

In [49]:
# Vertical split of the table: each half keeps 'identifier' plus either the
# first ten entries of col_header or the remaining ones.
german_split_1 = german[['identifier', *col_header[:10]]]
german_split_2 = german[['identifier', *col_header[10:]]]

In [52]:
# Write both halves to disk. The row index is written as the first column
# (pandas' default, spelled out here); german_pipeline_normal reads these files
# back with index_col=0.
german_split_1.to_csv('../data/german_titled_split_1.csv', index=True)
german_split_2.to_csv('../data/german_titled_split_2.csv', index=True)

In [73]:
def german_pipeline_easy(f_path = '../data/german_titled.csv'):
    '''
    Single-file variant of the German credit pipeline.

    Reads one CSV, keeps a fixed subset of its columns, retains only rows with
    credit_amt >= 4000, and then assembles a scikit-learn pipeline.

    Parameters
    ----------
    f_path : str
        Path of the CSV file to read.

    Returns
    -------
    Pipeline
        An unfitted pipeline made of a 'features' ColumnTransformer followed by
        a 'learner' RandomForestClassifier. The loaded frame is neither
        returned nor passed to the pipeline inside this function.
    '''
    # Column groups, named by the preprocessing each group receives.
    onehot_cols = ['status_of_existing_account', 'credit_his','purpose', 'saving_account', 'preset_emp']
    impute_onehot_cols = ['personal_status_and_sex', 'guarantors', 'property']
    scaled_cols = ['duration_in_month', 'credit_amt', 'present_residence', 'age']
    projected_cols = [
        'status_of_existing_account', 'duration_in_month', 'credit_his', 'purpose',
        'credit_amt', 'saving_account', 'preset_emp', 'installment_rate',
        'personal_status_and_sex', 'guarantors', 'present_residence',
        'property', 'age', 'label',
    ]

    # Load, project, then filter.
    frame = pd.read_csv(f_path)
    frame = frame[projected_cols]
    frame = frame[frame['credit_amt'] >= 4000]

    # Sub-pipeline for categorical columns that are imputed before encoding.
    impute_then_encode = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder()),
    ])

    column_steps = [
        ('onehot', OneHotEncoder(), onehot_cols),
        ('impute_onehot', impute_then_encode, impute_onehot_cols),
        ('std_scaler', StandardScaler(), scaled_cols),
    ]

    return Pipeline([
        ('features', ColumnTransformer(transformers=column_steps)),
        ('learner', RandomForestClassifier()),
    ])
    

In [113]:
def german_pipeline_normal(f_path = '../data/german_titled.csv',
                           f_path_1 = '../data/german_titled_split_1.csv',
                           f_path_2 = '../data/german_titled_split_2.csv'):
    '''
    Two-file variant of the German credit pipeline.

    Reads the two vertical splits of the table, joins them on 'identifier',
    keeps a fixed subset of columns, retains only rows with
    credit_amt >= 4000, and then assembles a scikit-learn pipeline.

    Parameters
    ----------
    f_path : str
        Not read by this function. Kept only so that existing calls which pass
        it (positionally or by keyword) continue to work.
    f_path_1 : str
        CSV holding 'identifier' and the first group of columns. Its first
        column is consumed as the row index.
    f_path_2 : str
        CSV holding 'identifier' and the remaining columns. Its first column
        is consumed as the row index.

    Returns
    -------
    Pipeline
        An unfitted pipeline made of a 'features' ColumnTransformer followed by
        a 'learner' RandomForestClassifier. The joined frame is neither
        returned nor passed to the pipeline inside this function.
    '''
    # load data (the first CSV column is the index that to_csv wrote out)
    data_split_1 = pd.read_csv(f_path_1, index_col = 0)
    data_split_2 = pd.read_csv(f_path_2, index_col = 0)

    # join
    data = pd.merge(data_split_1, data_split_2, on='identifier')

    # The join key is not a feature: drop it by name rather than by position,
    # and without mutating the merged frame in place.
    data = data.drop(columns=['identifier'])

    # projection
    data = data[['status_of_existing_account', 'duration_in_month', 'credit_his', 'purpose', 'credit_amt', 'saving_account', 'preset_emp', 'installment_rate', 'personal_status_and_sex', 'guarantors', 'present_residence', 
                 'property', 'age','label']]
    # filtering
    data = data[data.credit_amt>=4000]

    #start sklearn pipeline
    # Categorical columns that are imputed (most frequent value) before encoding.
    one_hot_and_impute = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder())
    ])

    featurizer = ColumnTransformer(transformers=[
        ('onehot', OneHotEncoder(), ['status_of_existing_account', 'credit_his','purpose', 'saving_account', 'preset_emp']),
        ('impute_onehot', one_hot_and_impute, ['personal_status_and_sex', 'guarantors', 'property']),
        ('std_scaler', StandardScaler(), ['duration_in_month', 'credit_amt', 'present_residence', 'age'])
    ])
    pipeline = Pipeline([
        ('features', featurizer),
        ('learner', RandomForestClassifier())
    ])
    return pipeline
