In [None]:
# Peek at the first 10 lines of the raw installments CSV via the shell.
!head -n 10 data/installments_payments.csv

In [None]:
import pandas as pd
# Load the application training table from CSV and preview its first 20 rows.
filename = 'data/application_train.csv'
df = pd.read_csv(filename)
df.head(20) 

In [None]:
# List the dtype pandas assigned to each column of the loaded frame.
print(df.dtypes)

In [None]:
# Number of non-null SK_ID_CURR entries for each TARGET value.
df.groupby('TARGET')[['SK_ID_CURR']].count()

In [None]:
# Per-TARGET counts divided by the total number of rows in df.
df.groupby('TARGET')[['SK_ID_CURR']].count().div(df.shape[0])

In [None]:
# Load the column-description table (decoded as ISO-8859-1) and show the rows
# whose 'Table' value is 'application_{train|test}.csv'.
filename = 'data/HomeCredit_columns_description.csv'
desc_df = pd.read_csv(filename, encoding = "ISO-8859-1")
desc_df[desc_df['Table'] == 'application_{train|test}.csv']

In [None]:
def get_target_dist(df):
    """Summarise how the rows of ``df`` are spread over the TARGET values.

    Args:
        df: DataFrame with at least the columns 'SK_ID_CURR' and 'TARGET'.

    Returns:
        DataFrame indexed by TARGET with two columns: 'SK_ID_CURR' (count of
        non-null ids in that group) and 'PERCENT' (that count as a percentage
        of the total number of rows in ``df``).
    """
    total = df.shape[0]
    per_target = df.groupby('TARGET')[['SK_ID_CURR']].count()
    return per_target.assign(PERCENT=per_target['SK_ID_CURR'] * 100 / total)

In [None]:
# Read the training CSV into train_df, then print its row count and the
# per-TARGET counts/percentages returned by get_target_dist.
filename = 'data/application_train.csv'
train_df = pd.read_csv(filename)
rows = train_df.shape[0]
print(f'total rows: {rows}')
print(get_target_dist(train_df))

In [None]:
# Re-read the file named by `filename`, stopping after the first 10000 rows
# (nrows), and rebind train_df to that subset.
train_df = pd.read_csv(filename, nrows=10000)
rows = train_df.shape[0]
print(f'total rows: {rows}')

In [None]:
# TARGET counts/percentages for whatever train_df currently holds.
print(get_target_dist(train_df))

In [None]:
# Draw a 10000-row random sample from the full CSV and report its size and
# TARGET breakdown.
train_df = pd.read_csv(filename)
# A fixed random_state makes the sample identical on every Restart & Run All;
# without it every downstream count, split and metric changes between runs.
train_df = train_df.sample(n=10000, random_state=42)
rows = train_df.shape[0]
print(f'total rows: {rows}')
print(get_target_dist(train_df))

In [None]:
# Count missing values per column, restricted to columns that have at least one.
null_mask = train_df.isnull()
nan_cols = train_df.columns[null_mask.any()]
nan_cnt = null_mask[nan_cols].sum()
print(nan_cnt)

In [None]:
import numpy as np

# Split the frame into a label array `y` and a list `data` holding one
# {column: value} dict per row with the target column left out.
target_col = 'TARGET'
features = [col for col in train_df.columns if col != target_col]

records = train_df.to_dict('records')
data = [{name: rec[name] for name in features} for rec in records]
y = np.array([rec[target_col] for rec in records])

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and hold out the rest for validation,
# stratifying on the label. random_state pins the shuffle so the split (and
# every metric computed from it) is the same after a kernel restart.
data_train, data_val, y_train, y_val = train_test_split(
    data, y, train_size=0.8, stratify=y, random_state=42)
print(f'data_train cnt: {len(data_train)}')
print(f'data_val cnt: {len(data_val)}')

In [None]:
from collections import defaultdict

def get_y_dist(y):
    """Print a tab-separated frequency table for the values in ``y``.

    Writes a header line, then one line per distinct value in ascending
    order containing the value, how often it occurs, and that count divided
    by ``len(y)``. Nothing is returned.
    """
    tally = {}
    for label in y:
        tally[label] = tally.get(label, 0) + 1

    print('target\tcnt\tratio')
    for label in sorted(tally):
        n_seen = tally[label]
        ratio = n_seen / len(y)
        print(f'{label}\t{n_seen}\t{ratio}')

In [None]:
# Print the label breakdown of each split: training first, then validation.
for heading, labels in (
        ('target distribution in training data', y_train),
        ('\ntarget distribution in validation data', y_val)):
    print(heading)
    get_y_dist(labels)

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Fit a DictVectorizer on the training dicts and turn them into a feature
# matrix; the printed shape is that of the vectorized training data.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(data_train)
print(f'after vectorization: {X_train.shape}')

In [None]:
# Print every entry of vectorizer.feature_names_ next to its position in that list.
for i, feature in enumerate(vectorizer.feature_names_):
    print(f'{i}\t{feature}')

In [None]:
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22; its
# replacement is sklearn.impute.SimpleImputer (available since 0.20). Prefer
# the new class and fall back to the legacy one only on old installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MaxAbsScaler

# Replace missing entries with the per-column mean learned from the training matrix.
imputer = Imputer(strategy='mean')
X_train = imputer.fit_transform(X_train)

# Scale the features with MaxAbsScaler; the matrix is converted to a dense
# array first, so X_train is dense from here on.
scaler = MaxAbsScaler()
X_train = scaler.fit_transform(X_train.toarray())
print(f'X_train data type: {type(X_train)}')
# The original message ended with a stray ')' after the shape.
print(f'X_train: {X_train.shape}')

In [None]:
# Push the validation dicts through the transformers that were fitted on the
# training data (transform only; nothing is re-fitted here).
X_val = vectorizer.transform(data_val)
X_val = imputer.transform(X_val)
# Densify before scaling, as is done for X_train, so both matrices reach the
# model as the same array type.
X_val = scaler.transform(X_val.toarray())
print(f'X_val data type: {type(X_val)}')
# The original message ended with a stray ')' after the shape.
print(f'X_val: {X_val.shape}')

In [None]:
# Model Training
# Fit a logistic regression with class_weight='balanced' on the training
# matrix and report how long the fit took (wall-clock, via time.time()).
from sklearn.linear_model import LogisticRegression
import time

model = LogisticRegression(class_weight='balanced')

# Timer starts before the progress message, so the reported duration includes it.
start = time.time()
print(f'Fitting model on {X_train.shape[0]} samples...')
model.fit(X_train, y_train)

end = time.time()
print('Finished model training in %.3f seconds.' % (end-start))

In [None]:
def get_sample_weights(y, pos_weight=10, neg_weight=1):
    """Build a per-sample weight array from binary labels.

    Args:
        y: Iterable of labels; each element is tested for truthiness.
        pos_weight: Weight assigned to truthy labels. Defaults to 10, the
            value that was previously hard-coded.
        neg_weight: Weight assigned to falsy labels. Defaults to 1, the
            value that was previously hard-coded.

    Returns:
        numpy.ndarray with one weight per element of ``y``, in input order.
    """
    return np.array([pos_weight if yi else neg_weight for yi in y])

# Train again with a default LogisticRegression, this time passing explicit
# per-sample weights from get_sample_weights. This rebinds `model`, so later
# cells use this classifier rather than the class_weight='balanced' one.
from sklearn.linear_model import LogisticRegression
import time

model = LogisticRegression()

# Timer starts before the progress message, so the reported duration includes it.
start = time.time()
print(f'Fitting model on {X_train.shape[0]} samples...')
model.fit(X_train, y_train, sample_weight=get_sample_weights(y_train))

end = time.time()
print('Finished model training in %.3f seconds.' % (end-start))

In [None]:
# Predicted labels for the validation matrix from the current `model`.
y_preds = model.predict(X_val)

In [None]:
# Print each validation prediction next to its true label, one line per sample.
for i, (y_pred, y_true) in enumerate(zip(y_preds, y_val)):
    # `{i}` must be interpolated: the original format string contained a bare
    # `i`, so every line started with the letter 'i' instead of the row index.
    print(f'{i}\ty_pred: {y_pred}, y_true: {y_true}')

In [None]:
from sklearn.metrics import classification_report

# Print scikit-learn's classification report for the validation predictions,
# with label 0 shown as 'NO' and label 1 shown as 'YES'.
print(classification_report(y_true=y_val, y_pred=y_preds, labels=[0, 1], target_names=['NO', 'YES']))

In [None]:
def evaluate(X_val, y_val, clf=None):
    """Compute the ROC AUC and ROC curve points for a fitted classifier.

    Args:
        X_val: Feature matrix to score.
        y_val: True labels aligned with the rows of ``X_val``.
        clf: Fitted classifier exposing ``classes_`` and ``predict_proba``.
            Defaults to the notebook-level ``model``, so existing
            two-argument calls behave exactly as before.

    Returns:
        Tuple ``(roc_auc, fpr, tpr)``: the value from ``roc_auc_score`` and
        the false/true positive rate arrays from ``roc_curve``.

    Raises:
        ValueError: if label 1 is not among ``clf.classes_``.
    """
    from sklearn.metrics import roc_curve, roc_auc_score

    if clf is None:
        clf = model  # notebook-level classifier fitted in an earlier cell

    # Column of the predict_proba output that belongs to label 1.
    pos_idx = list(clf.classes_).index(1)

    # Score once and reuse the result; the original called predict_proba twice.
    proba = clf.predict_proba(X_val)
    print(X_val.shape, proba.shape, pos_idx)
    y_score = proba[:, pos_idx]
    print(y_score.shape)

    fpr, tpr, _ = roc_curve(y_val, y_score, pos_label=1)
    roc_auc = roc_auc_score(y_val, y_score)

    return roc_auc, fpr, tpr

In [None]:
# Show the shapes of the validation inputs, then score them. `evaluate` reads
# the notebook-level `model`, so the result reflects whichever model was
# fitted last.
print(X_val.shape)
print(y_val.shape)
roc_auc, fpr, tpr = evaluate(X_val, y_val)

In [None]:
import matplotlib.pyplot as plt
# Plot the ROC curve (fpr vs. tpr) together with a dashed diagonal reference
# line, using an explicit figure/axes pair.
lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange',
        lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")

In [None]:
# Load the previous-applications table and preview its first 20 rows.
prev_app_filename = 'data/previous_application.csv'
prev_app_df = pd.read_csv(prev_app_filename)
prev_app_df.head(20)

In [None]:
# Reload the column-description table (decoded as ISO-8859-1) and show the
# rows whose 'Table' value is 'previous_application.csv'.
filename = 'data/HomeCredit_columns_description.csv'
desc_df = pd.read_csv(filename, encoding = "ISO-8859-1")
desc_df[desc_df['Table'] == 'previous_application.csv']

In [None]:
# Collapse previous applications to one row per SK_ID_CURR: the count of
# SK_ID_PREV entries and the sum of AMT_ANNUITY, renamed to PREV_APPS and
# PREV_AMT_ANNUITY respectively.
prev_agg = prev_app_df.groupby('SK_ID_CURR')
prev_df = prev_agg.agg({'SK_ID_PREV': 'count', 'AMT_ANNUITY': 'sum'}).rename(columns={
    'SK_ID_PREV': 'PREV_APPS', 'AMT_ANNUITY': 'PREV_AMT_ANNUITY'})

In [None]:
# Preview the first 20 rows of the per-SK_ID_CURR aggregates.
prev_df.head(20)

In [None]:
# Fill gaps in the application frame with per-column means, then left-join the
# previous-application aggregates on SK_ID_CURR.
# numeric_only=True is needed for portability: if the frame holds any
# non-numeric columns, pandas >= 2.0 raises TypeError when asked to average
# them, whereas older releases silently skipped them. Passing the flag gives
# the old result on every version.
curr_prev_df = train_df.fillna(value=train_df.mean(numeric_only=True)).join(
    prev_df, on='SK_ID_CURR', how='left')
# Rows without a match in prev_df come back as NaN from the left join; store 0 instead.
curr_prev_df[['PREV_APPS', 'PREV_AMT_ANNUITY']] = curr_prev_df[['PREV_APPS', 'PREV_AMT_ANNUITY']].fillna(value=0)
print(curr_prev_df.shape[0])
curr_prev_df

In [None]:
# Load the bureau table and keep only the rows whose CREDIT_ACTIVE equals 'Active'.
filename = 'data/bureau.csv'
bureau_df = pd.read_csv(filename)
is_active = bureau_df['CREDIT_ACTIVE'].eq('Active')
active_bureau_df = bureau_df.loc[is_active]
active_bureau_df

In [None]:
# For each SK_ID_CURR, sum AMT_CREDIT_SUM_DEBT and CNT_CREDIT_PROLONG over the
# active bureau rows.
active_bureau_agg = active_bureau_df.groupby('SK_ID_CURR')
summed_cols = ['AMT_CREDIT_SUM_DEBT', 'CNT_CREDIT_PROLONG']
active_bureau_agg_df = active_bureau_agg[summed_cols].sum()
active_bureau_agg_df

In [None]:
# Left-join the active-bureau sums onto the applicant frame by SK_ID_CURR, then
# replace the NaNs in the two joined columns with 0.
bureau_cols = ['AMT_CREDIT_SUM_DEBT', 'CNT_CREDIT_PROLONG']
df = curr_prev_df.join(active_bureau_agg_df, on='SK_ID_CURR', how='left')
df = df.fillna({col: 0 for col in bureau_cols})
print(df.shape[0])
df.shape

In [None]:
# Display the joined frame currently bound to `df`.
df

In [None]:
import pickle
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone joblib package is the supported import. Fall back to the
# vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist every fitted object needed to score new data. The imputer sits
# between the vectorizer and the scaler in the preprocessing chain, so it is
# saved as well; without it the chain cannot be rebuilt from disk.
joblib.dump(model, 'model')
joblib.dump(scaler, 'scaler')
joblib.dump(vectorizer, 'vectorizer')
joblib.dump(imputer, 'imputer')

# Reload from disk to confirm the artifacts round-trip.
# NOTE: joblib.load unpickles its input, so only load files you wrote yourself.
model = joblib.load('model')
scaler = joblib.load('scaler')
vectorizer = joblib.load('vectorizer')
imputer = joblib.load('imputer')