In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

In [7]:
# Notebook-wide configuration.

# Name of the label column; it is split off from the predictors and used as the
# quantity averaged per category when the ordinal mappings are built.
TARGET = "Loan_Status"
# CSV read into `train` below.
TRAIN_DATA_LOCATION = "data/train.csv" 
# NOTE(review): not read in the cells visible here - presumably the hold-out CSV.
TEST_DATA_LOCATION = "data/test.csv"
# NOTE(review): TEST_SIZE / RANDOM_STATE are not used in the cells visible here;
# presumably they parameterise a later train/validation split - confirm.
TEST_SIZE = 0.3
RANDOM_STATE = 42
# NOTE(review): presumably a sentinel substituted for missing values - confirm against usage.
REPLACE_MISSING_VALUE = -99999
# NOTE(review): presumably columns to remove before modelling - not applied in the cells visible here.
DROP_FEATURES = ['Loan_ID']

In [8]:
def create_datasets(train, target):
    """Separate the label column from the remaining columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding the label column together with every other column.
    target : column label
        Label of the column to split off.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is ``train[target]`` and ``X`` is a copy of
        ``train`` without that column. ``train`` itself is not modified.
    """
    labels = train[target]
    predictors = train.drop(columns=target)
    return predictors, labels

def calculate_performance(y_true, y_score):
    """Return ``2 * AUC - 1`` for the given labels and scores.

    The ROC AUC is computed with ``roc_auc_score(y_true, y_score)`` and then
    rescaled, so an AUC of 0.5 maps to 0 and an AUC of 1 maps to 1. This
    quantity is commonly referred to as the Gini coefficient of the scores.

    Parameters
    ----------
    y_true
        Ground-truth labels, passed through to ``roc_auc_score``.
    y_score
        Predicted scores, passed through to ``roc_auc_score``.
    """
    auc = roc_auc_score(y_true, y_score)
    return 2 * auc - 1

# Load datasets and split
# `train` keeps the full frame (label column included); it is reused further
# down to compute per-category target means.
train = pd.read_csv(TRAIN_DATA_LOCATION)
# X holds every column except TARGET, y holds the TARGET column.
X, y = create_datasets(train, TARGET)

In [32]:
# Development of ordinal encoder
#
# Builds `mappings`: {feature name -> {category value -> integer rank}}.
# Within each non-numeric feature, categories are ranked by the mean of TARGET
# over the rows of that category, in ascending order (rank 0 = lowest mean).

# Features with more distinct values than this are left unencoded
# (high-cardinality columns would get one rank per handful of rows).
MAX_CATEGORIES = 100

mappings = {}
non_numeric_features = train.select_dtypes(exclude=np.number)
for feature in non_numeric_features.columns:
    # The target cannot be ranked against itself; this only matters when the
    # target column is itself non-numeric (e.g. boolean).
    if feature == TARGET:
        continue
    # nunique() ignores missing values, like the value_counts() it replaces.
    if train[feature].nunique() > MAX_CATEGORIES:
        continue
    # groupby drops rows whose feature value is missing, so missing values
    # receive no rank and will map to NaN when the mapping is applied.
    target_mean_by_category = train.groupby(feature)[TARGET].mean().sort_values()
    mappings[feature] = {
        category: rank
        for rank, category in enumerate(target_mean_by_category.index)
    }
                
        

In [33]:
# Display every learned mapping: feature -> {category: rank}.
mappings

{'Gender': {'Female': 0, 'Male': 1},
 'Married': {'No': 0, 'Yes': 1},
 'Dependents': {'1': 0, '3+': 1, '0': 2, '2': 3},
 'Education': {'Not Graduate': 0, 'Graduate': 1},
 'Self_Employed': {'Yes': 0, 'No': 1},
 'Property_Area': {'Rural': 0, 'Urban': 1, 'Semiurban': 2}}

In [36]:
# Look at the mapping learned for a single feature.
mappings['Education']

{'Not Graduate': 0, 'Graduate': 1}

In [39]:
# Try the mapping on one column; .map returns a new Series (X is left as is)
# and any value absent from the mapping becomes NaN.
X['Education'].map(mappings['Education'])

0      1
1      1
2      1
3      0
4      1
      ..
609    1
610    1
611    1
612    1
613    1
Name: Education, Length: 614, dtype: int64