## All missingness checks and their relationships are run here

Overall missingness 

Missingness relationships 

Hypotheses for missingness

1. [Libraries and Imports](#1)
2. [Statistics](#2)
3. [aa](#3)
4. [bb](#4)
5. [cc](#5)

In [None]:
### Libraries and Imports

In [None]:
# base 
import os 
import sys
from src import helpers, config, plotting, evaluation
import random 
import json 
import numpy as np
import pandas as pd 
import pandas.api.types as types
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import missingno as msno
import pickle
from pathlib import Path
from typing import List, Set, Dict, Tuple
from typing import Union, Any, Optional, Iterable, Hashable, Type

# base
from sklearn.base import TransformerMixin

# ml preprocessing 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler

# models
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# validation 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_curve, roc_auc_score, precision_score, recall_score, plot_confusion_matrix

# pipelines 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer

# ignore warnings 
# NOTE(review): filterwarnings('ignore') silences every warning category for the
# rest of the session, including deprecation notices from the libraries imported
# above - consider narrowing it to specific categories.
import warnings 
warnings.filterwarnings('ignore')

# Set scikit-learn's global estimator display option to 'diagram'.
from sklearn import set_config
set_config(display='diagram')

# yet to arrange
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer 
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Sanity check: the paths declared in src.config must match the layout expected
# relative to the parent of the current working directory.
project_root = Path.cwd().parent
assert config.FIN_FILE_PATH == project_root / "data" / "final"
assert config.REPORTS_PATH == project_root / "reports" / "figures"
assert config.RAW_FILE_PATH == project_root / "data" / "raw"

In [None]:
# set the target column
%matplotlib inline 

sns.set_style('white')
sns.set_palette('deep')
mpl.rcParams['figure.figsize'] = config.DEFAULT_FIGSIZE
mpl.rcParams['lines.linewidth'] = config.DEFAULT_PLOT_LINEWIDTH
mpl.rcParams['lines.linestyle'] = config.DEFAULT_PLOT_LINESTYLE
mpl.rcParams['font.size'] = config.DEFAULT_AXIS_FONT_SIZE

df = pd.read_parquet(config.INT_FILE_PATH / config.INT_FILE_NAME)

train = pd.read_parquet(config.FIN_FILE_PATH / 'train.parquet')

In [None]:
# Run the project's missingness checks (src.helpers) on the training set loaded
# from config.FIN_FILE_PATH; the cell displays whatever the helper returns.
helpers.missingness_checks(train)

### Missingness

In [None]:
# Same checks on the dataset loaded from config.INT_FILE_PATH, with rows sorted by
# 'status' first - presumably so missingness can be compared across 'status'
# values; confirm against helpers.missingness_checks.
helpers.missingness_checks(df.sort_values(by='status'))

### Statistics

In [None]:
# Quick exploratory summary of df via the project helper in src.helpers.
helpers.quick_eda(df)

### Overview of Data

In [None]:
# Overview plot of 'status' and a selection of columns via plotting.quick_plot,
# coloured by the target column.
overview_cols = ['status', 'loan_amount', 'rate_of_interest', 'upfront_charges', 'term',
                 'property_value', 'income', 'credit_score', 'ltv', 'dtir1']

# Plot a subsample to keep rendering fast. The size is capped at the number of
# rows available (DataFrame.sample raises when n exceeds the frame length without
# replacement) and the draw is seeded so the figure is the same on every run.
overview_sample = df[overview_cols].sample(n=min(10000, len(df)),
                                           random_state=config.RANDOM_STATE)

plotting.quick_plot(overview_sample, hue=config.TARGET)
plt.show()

### Hypotheses 

1. To drop 
    - id -> UID
    - year -> No variation 
    - interest_rate_spread -> Not able to discriminate

1. Demographics
    - Gender -> Joint less likely to default
        - Engineer whether loan was joint or not
        - Sex not provided

2. Loan types 
    - loan_type -> what is a type 2 loan? 
    - loan_limit -> missing values -> ncf tends to default more 
    - lump_sum_payment -> 
    - approv_in_adv -> NA 908
    - term -> most likely not going to be discriminating

3. Purpose 
    - loan_purpose
    - credit_worthiness
    - open_credit -> imbalanced; might not be worth computing
    - business_or_commercial -> higher chance of default if biz/comm
    - 

4. Deterministic 
    - construction_type
    

Missing-value counts per column:

```text
loan_limit                    3344
approv_in_adv                  908
loan_purpose                   134
rate_of_interest             36439
upfront_charges              39642
term                            41
neg_ammortization              121
interest_only                    0
lump_sum_payment                 0
property_value               15098
occupancy_type                   0
secured_by                       0
```

### Train test split

In [None]:
# Separate the target from the predictors; the double brackets keep y as a
# one-column DataFrame rather than a Series.
y = df[[config.TARGET]]
X = df.drop(columns=config.TARGET)

# 80/20 split, seeded from the project config
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=config.RANDOM_STATE,
)

In [None]:
# Write each split to config.INT_FILE_PATH as '<name>.parquet'
splits_to_save = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_frame in splits_to_save.items():
    split_frame.to_parquet(config.INT_FILE_PATH / f'{split_name}.parquet')

## KNN imputation

## Binning

In [None]:
# Reload the saved training predictors and print their column/dtype summary
x_train_file = config.INT_FILE_PATH / 'X_train.parquet'
X_train = pd.read_parquet(x_train_file)
X_train.info()

In [None]:
### property

# Distribution of property_value before binning
sns.histplot(X_train['property_value'])
plt.show()

# Quintile bins labelled '1'..'5'; rows where property_value is missing fall
# outside every bin and are given their own 'missing' category.
quintile_labels = ['1', '2', '3', '4', '5']
property_value_bins = pd.qcut(X_train['property_value'], q=5, labels=quintile_labels)
X_train['property_value_binned'] = property_value_bins.astype('object').fillna('missing')
X_train['property_value_binned'].value_counts()

In [None]:
# Distribution of credit_score before binning
sns.histplot(X_train['credit_score'])
plt.show()

# Quintile bins labelled '1'..'5'
X_train['credit_score_binned'] = pd.qcut(X_train['credit_score'], q=5,
                                         labels=['1', '2', '3', '4', '5'])

# Convert the *binned* column to object and give unbinned rows a 'missing'
# category. The source here must be 'credit_score_binned': reading from the raw
# 'credit_score' column would overwrite the bins with the raw scores.
X_train['credit_score_binned'] = X_train['credit_score_binned'].astype('object').fillna('missing')
X_train['credit_score_binned'].value_counts()

In [None]:
# Distribution of ltv before binning
sns.histplot(X_train['ltv'])
plt.show()

# Quintile bins labelled '1'..'5'
ltv_quintiles = pd.qcut(X_train['ltv'], q=5, labels=list('12345'))

# Store as plain objects, with unbinned (missing) rows tagged 'missing'
X_train['ltv_binned'] = ltv_quintiles.astype('object').fillna('missing')

X_train['ltv_binned'].value_counts()

In [None]:
# Distribution of dtir1 before binning
dtir1_values = X_train['dtir1']
sns.histplot(dtir1_values)
plt.show()

# Quintile bins labelled '1'..'5', stored as objects with a separate 'missing'
# category for rows that could not be binned
dtir1_labels = [str(bin_no) for bin_no in range(1, 6)]
X_train['dtir1_binned'] = (
    pd.qcut(dtir1_values, q=5, labels=dtir1_labels)
    .astype('object')
    .fillna('missing')
)

X_train['dtir1_binned'].value_counts()

In [None]:
# Distribution of income. plt.show() renders the figure the same way as the other
# histogram cells instead of echoing the Axes object as the cell output.
sns.histplot(X_train['income'])
plt.show()



In [None]:
# Display the current column index of X_train.
X_train.columns

In [None]:
# Column order applied to X_train by the following reindex. Any column not listed
# here is dropped by that reindex, so every engineered *_binned column -
# including 'dtir1_binned' - must be listed to survive.
seq = [ 'dtir1', 'income', 'credit_score', 'property_value', 'ltv','rate_of_interest', 'upfront_charges', 'loan_limit', 'gender', 'approv_in_adv', 'loan_type', 'loan_purpose',
       'credit_worthiness', 'open_credit', 'business_or_commercial',
       'loan_amount', 'term', 'neg_ammortization', 'interest_only', 'lump_sum_payment',
       'occupancy_type', 'total_units', 
       'credit_type', 'co_applicant_credit_type', 'age',
       'submission_of_application', 'region',
       'property_value_binned', 'credit_score_binned', 'ltv_binned', 'dtir1_binned']

In [None]:
# Keep only the columns named in seq, in that order. reindex fills any name that
# is absent from X_train with an all-NaN column rather than raising.
X_train = X_train.reindex(seq, axis='columns')

In [None]:
# Visualise where values are missing in X_train with missingno's matrix plot.
msno.matrix(X_train)

In [None]:
# Show the rows whose credit-score bin is the 'missing' category
missing_credit_mask = X_train['credit_score_binned'] == 'missing'
X_train[missing_credit_mask]

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# missingno heatmap for X_train (nullity correlation between columns).
msno.heatmap(X_train)

## Baseline 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: one-hot encode every column, then fit a random forest.
# The forest is seeded from the project config so the baseline is reproducible.
rfc = RandomForestClassifier(random_state=config.RANDOM_STATE)

# handle_unknown='ignore' encodes categories that were not seen during fit as
# all-zeros instead of raising when the pipeline is later applied to new data.
# NOTE(review): the encoder is applied to all columns, including the continuous
# ones - confirm that is intended for this baseline.
pp = Pipeline([
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
        ('rfc', rfc)
     ])

# y_train comes from df[[config.TARGET]] and is a one-column DataFrame; flatten it
# to the 1-D shape that classifiers expect for the target.
pp.fit(X_train, y_train.values.ravel())
