### What is the purpose of this file? 

Show distributions of continuous and categorical variables

In [None]:
# base 
from src import helpers, config, plotting, evaluation
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import missingno as msno
import logging
from urllib.request import urlretrieve 

# base
from sklearn.base import TransformerMixin
from sklearn import set_config

# ignore warnings 
import warnings 
warnings.filterwarnings('ignore')



# yet to arrange
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer 
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
%matplotlib inline 

set_config(display='diagram')
sns.set_style('white')
sns.set_palette('deep')
mpl.rcParams['figure.figsize'] = config.DEFAULT_FIGSIZE
mpl.rcParams['lines.linewidth'] = config.DEFAULT_PLOT_LINEWIDTH
mpl.rcParams['lines.linestyle'] = config.DEFAULT_PLOT_LINESTYLE
mpl.rcParams['font.size'] = config.DEFAULT_AXIS_FONT_SIZE

# logging settings 
# logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')
logger = logging.getLogger()
logger.disabled = False

pd.set_option('display.max_columns', 30)
TARGET = 'status'

### What does the overall class distribution look like? 

### What do the overall correlations look like? 

### Example header 

### Example header 

### Example header 

### Example header 

### Hypotheses 
 
1. To drop 
    - unnamed:_0
    - id -> UID
    - year -> No variation 

2. Deterministic 
    - construction_type -> to drop 
    - secured_by -> drop 
    - security_type -> drop
    - open_credit -> consider dropping (imbalanced)

    Unsure of these
    - interest_rate_spread -> Not able to discriminate
    - rate_of_interest -> Not able to discriminate
    - upfront_charges -> Not able to discriminate

3. Applicant characteristics
    - gender -> Joint less likely to default -> Change to categorical
        - Engineer whether loan was joint or not??
        - Sex not provided?
    - age
    - dtir1 - debt to income ratio - missing 24121 values -> bin values?
    - region
    - income
    - credit_worthiness
    - credit_score
    - credit_type
    - co_applicant_credit_type
    - submission_of_application

4. Loan details 
    - loan_type -> what is a type 2 loan? 
    - loan_limit -> missing values -> ncf tends to default more 
    - loan amount -> 
    - loan_purpose -> missing 134
    - lump_sum_payment -> remap 
    - approv_in_adv -> NA 908
    - term -> most likely not going to be discriminating
    - interest_only 
    - neg_ammortization -> seems important

5. Property
    - total_units (number of units bought?)
    - business_or_commercial -> higher chance of default if biz/comm
    - property_value
    - occupancy_type
    - ltv

In [None]:
# Kernel density estimate of the `ltv` column.
# NOTE(review): `df` is not created in any cell visible here - confirm it is
# loaded before this cell runs on a fresh kernel.
ltv_values = df['ltv']
sns.kdeplot(ltv_values)

In [None]:
# Columns noted in the hypotheses as not able to discriminate:
#   - interest_rate_spread
#   - rate_of_interest
#   - upfront_charges
#
# Number of non-null upfront_charges values per target class.
upfront_by_target = df.groupby(TARGET)['upfront_charges']
upfront_by_target.count()

In [None]:
# Relative frequency of each open_credit value within every target class.
open_credit_by_target = df.groupby(TARGET)['open_credit']
open_credit_by_target.value_counts(normalize=True)

### Dropping columns that are not useful

In [None]:
# df.groupby(TARGET)['secured_by'].value_counts()
# Histogram of secured_by, drawn once per target class.
secured_by_groups = df.groupby(TARGET)['secured_by']
secured_by_groups.hist()

In [None]:
# Columns listed under "To drop" / "Not able to discriminate" in the hypotheses.
uninformative_cols = ['unnamed:_0', 'id', 'year', 'interest_rate_spread']
# Columns listed under "Deterministic" in the hypotheses.
deterministic_cols = ['secured_by', 'construction_type', 'security_type']

# NOTE(review): later cells in this notebook still reference construction_type,
# interest_rate_spread and security_type - verify they tolerate this drop.
df = df.drop(uninformative_cols, axis=1)
df = df.drop(deterministic_cols, axis=1)

### Applicant Characteristics

- gender -> Joint less likely to default -> Change to categorical
    - Engineer whether loan was joint or not??
    - Sex not provided?
- age
- dtir1 - debt to income ratio - missing 24121 values -> bin values?
- region
- income
- credit_worthiness
- credit_score
- credit_type
- co_applicant_credit_type
- submission_of_application

In [None]:
# Type conversions and quick checks for the applicant-related columns.
# Inspection results are wrapped in display() because a notebook cell only
# renders its last bare expression; anything earlier is silently discarded.

##  gender
## ============================================
df['gender'] = helpers.convert_to_dtype(df['gender'], type='categorical')
# df['gender'].nunique()
# df['gender'].value_counts()
# df['joint_loan'] = df['gender'].apply(lambda x: x == 'Joint')


## age 
## ============================================
# Raw labels as they appear before the ordered-categorical conversion.
display(df['age'].unique())
age_bins = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
age_cat = pd.CategoricalDtype(categories=age_bins, ordered=True)

# Any label that is not listed in age_bins becomes NaN under this astype.
df['age'] = df['age'].astype(age_cat)


## region 
## ============================================
# standardize to lowercase
df['region'] = df['region'].str.lower().astype('category')


## credit_worthiness 
## ============================================
df['credit_worthiness'] = df['credit_worthiness'].astype('category')


## credit_score 
## ============================================
# no changes so far; only the distribution is plotted
fig_score, ax_score = plt.subplots()
df['credit_score'].hist(ax=ax_score)
ax_score.set(title='credit_score distribution', xlabel='credit_score', ylabel='count')


## credit_type / co_applicant_credit_type
## ============================================
display(df['credit_type'].value_counts())
df['credit_type'] = df['credit_type'].astype('category')

df['co_applicant_credit_type'] = df['co_applicant_credit_type'].astype('category')
# Number of missing co_applicant_credit_type values.
display(df['co_applicant_credit_type'].isna().sum())


## submission_of_application
## ============================================
df['submission_of_application'] = df['submission_of_application'].astype('category')

In [None]:
# Sanity checks on income and the debt-to-income column (dtir1).
# Every inspection is passed to display(): only the last bare expression of a
# cell is rendered, so the earlier ones would otherwise produce no output.

## income
## ============================================
display(df['income'].describe())
# Row(s) holding the maximum income.
display(df[df['income'] == df['income'].max()])

# large incomes
display(df['income'].nlargest(10))

# check if any negative incomes
assert df['income'].min() >= 0, 'income contains negative values'

## debt to income ratio 
## ============================================
# Number of missing dtir1 values (author's note: 24121 missing).
display(df['dtir1'].isna().sum())

df['dtir1'].dtype
# df['dtir1'].hist()

### Loan details

- loan_type -> what is a type 2 loan? 
- loan_limit -> missing values -> ncf tends to default more 
- loan amount -> 
- loan_purpose -> missing 134
- lump_sum_payment -> remap 
- approv_in_adv -> NA 908
- term -> most likely not going to be discriminating
- interest_only 
- neg_ammortization -> seems important

In [None]:
def remove_outliers():
    """Placeholder for an outlier-removal helper; does nothing and returns None."""
    return None

In [None]:
# Rows whose loan_amount is not strictly inside mean +/- 3 standard deviations.
loan_amounts = df['loan_amount']
mean = loan_amounts.mean()
std = loan_amounts.std()

# Half-width of the accepted band.
cut_off = std * 3
lower = mean - cut_off
upper = mean + cut_off

# The mask is negated rather than rewritten with >= / <= so that rows with a
# missing loan_amount are still selected, exactly as a failed comparison implies.
within_band = loan_amounts.lt(upper) & loan_amounts.gt(lower)
outlier_df = df[~within_band]

# Largest outliers first.
outlier_df['loan_amount'].sort_values(ascending=False)

In [None]:
# Type conversions and quick checks for the loan-related columns.
# Inspection results are wrapped in display() because only the last bare
# expression of a cell is rendered.

## loan_type
## ============================================
df['loan_type'] = helpers.convert_to_dtype(df['loan_type'], 'categorical')


## loan_limit 3344 missing values
# - CF = conforming loan 
# - NCF = non-conforming loan
## ============================================
df['loan_limit'] = df['loan_limit'].astype('category')
display(df['loan_limit'].isna().sum())


## loan_amount
## ============================================
# Dedicated figure/axes: without an explicit `ax`, this boxplot and the `term`
# histogram below would both draw onto the current axes and overlap.
fig_amount, ax_amount = plt.subplots()
df[['loan_amount']].boxplot(ax=ax_amount)
ax_amount.set_title('loan_amount')


## loan_purpose
## ============================================
df['loan_purpose'] = helpers.convert_to_dtype(df['loan_purpose'], 'categorical')


## lump_sum_payment
## ============================================
display(df['lump_sum_payment'].value_counts())

lump_sum_mapping = {'not_lpsm': False, 'lpsm': True}
# Series.map turns every value that is not a mapping key into NaN, so mapping
# an already-converted column would blank it. Convert only while raw labels
# are still present, which makes the cell safe to re-run.
if df['lump_sum_payment'].isin(list(lump_sum_mapping)).any():
    df['lump_sum_payment'] = df['lump_sum_payment'].map(lump_sum_mapping).astype('category')
# df.groupby('status')['lump_sum_payment'].value_counts(normalize=True)


## approv_in_adv -> missing 908
## ============================================
df['approv_in_adv'] = df['approv_in_adv'].astype('category')

## term bin values? 
## ============================================
fig_term, ax_term = plt.subplots()
df['term'].hist(ax=ax_term)
ax_term.set(title='term', xlabel='term', ylabel='count')


## interest_only 
## ============================================
# What Is an Interest-Only Mortgage? An interest-only mortgage is a type of
# mortgage in which the mortgagor (the borrower) is required to pay only the
# interest on the loan for a certain period. The principal is repaid either
# in a lump sum at a specified date, or in subsequent payments.
interest_only_mapping = {'not_int': False, 'int_only': True}
# Same re-run guard as for lump_sum_payment.
if df['interest_only'].isin(list(interest_only_mapping)).any():
    df['interest_only'] = df['interest_only'].map(interest_only_mapping).astype('category')
display(df['interest_only'].value_counts())

## neg_ammortization 
## ============================================
df['neg_ammortization'] = df['neg_ammortization'].astype('category')

df.groupby(TARGET)['neg_ammortization'].value_counts()

In [None]:
# df['loan_amount'].head(10)
# Full rows for the 50 smallest loan amounts.
smallest_loan_index = df['loan_amount'].nsmallest(50).index
df.loc[smallest_loan_index, :]

### Property related

- total_units (number of units bought?)
- business_or_commercial -> higher chance of default if biz/comm
- property_value
- occupancy_type
- ltv

In [None]:
# Type conversions and quick checks for the property-related columns.

## total_units 
## ============================================
total_units_cat = pd.CategoricalDtype(categories=['1U', '2U', '3U', '4U'], ordered=True)
df['total_units'] = df['total_units'].astype(total_units_cat)
# df.groupby(TARGET)['total_units'].value_counts(normalize=True)


## business_or_commercial 
## ============================================
df['business_or_commercial'] = df['business_or_commercial'].astype('category')
# df.groupby(TARGET)['business_or_commercial'].value_counts(normalize=True)


## property_value 
## ============================================
# Summary statistics instead of dumping the whole column.
display(df['property_value'].describe())


## occupancy_type 
## ============================================
occupancy_type_map = {'pr': 'primary residence', 'sr': 'secondary residence', 'ir': 'investment residence'}
# Series.map turns every value that is not a mapping key into NaN, so mapping
# an already-converted column would blank it. Convert only while the raw
# codes are still present, which makes the cell safe to re-run.
if df['occupancy_type'].isin(list(occupancy_type_map)).any():
    df['occupancy_type'] = df['occupancy_type'].map(occupancy_type_map).astype('category')
display(df['occupancy_type'].value_counts(normalize=True))

## ltv 
## ============================================
df['ltv'].describe()
# df.groupby(TARGET)['total_units'].value_counts(normalize=True)
# df.groupby(TARGET)['total_units'].value_counts(normalize=True)

In [None]:
# Column dtypes and non-null counts after the conversions in the cells above.
df.info()

In [None]:
# Rows where property_value is present.
property_value_nm = df[df['property_value'].notna()]

# Share of rows with a missing property_value; displayed explicitly because a
# cell only renders its last bare expression.
display(df['property_value'].isna().mean())

# Target distribution among the rows that do have a property_value.
property_value_nm[TARGET].value_counts(normalize=True)

### upfront_charges, construction_type, interest_rate_spread
- Values are missing for all rows whose status is a loan default
- ??? 

In [None]:
# Mean upfront_charges per target class (displayed explicitly: only the last
# bare expression of a cell is rendered).
display(df.groupby(TARGET)['upfront_charges'].mean())

# df.pivot_table(values=loan, index=TARGET)

# construction_type and interest_rate_spread are removed from `df` by the
# earlier column-dropping cell, so indexing them unconditionally raises
# KeyError on a top-to-bottom run. Inspect them only while still present.
if 'construction_type' in df.columns:
    display(df['construction_type'].isna().sum())
else:
    print("'construction_type' is no longer in df (dropped earlier); skipping")

if 'interest_rate_spread' in df.columns:
    sns.histplot(data=df, x='interest_rate_spread', hue=TARGET)
    plt.show()
else:
    print("'interest_rate_spread' is no longer in df (dropped earlier); skipping")

### Loan purpose in depth

In [None]:
# Breakdown of every categorical column by loan_purpose.
display(df['loan_purpose'].value_counts())

# assumes get_categorical_columns returns an iterable of column names - confirm
cat_features = helpers.get_categorical_columns(df)

for col in cat_features:
    # Counting loan_purpose within loan_purpose groups yields two index levels
    # with the same name, which reset_index cannot turn into columns.
    if col == 'loan_purpose':
        continue
    df_bar = df.groupby('loan_purpose')[col].value_counts().to_frame('counts').reset_index()
    # x must be the iterated column: df_bar only holds loan_purpose, `col`
    # and counts, so a fixed column name fails for every other feature.
    sns.catplot(data=df_bar, x=col, y='counts', row='loan_purpose', kind='bar')

In [None]:
# security_type is removed from `df` by the earlier column-dropping cell, so an
# unconditional lookup raises KeyError on a top-to-bottom run.
if 'security_type' in df.columns:
    display(df.groupby(TARGET)['security_type'].value_counts())
else:
    print("'security_type' is no longer in df (dropped earlier); skipping")

### Missing loan limits in depth

In [None]:
# Compare rows with a missing loan_limit against the rest.
# NOTE(review): the section heading and variable names refer to loan_limit, so
# that column is used for the split; change missing_col if approv_in_adv was
# the intended column.
missing_col = 'loan_limit'
is_missing = df[missing_col].isna()

# The split column is entirely NaN in the missing subset, so it is dropped there.
missing_loan_limits = df.loc[is_missing].drop(columns=missing_col)
non_missing_loan_limits = df.loc[~is_missing]

display(missing_loan_limits.describe().T)
display(non_missing_loan_limits.describe().T)

sns.pairplot(missing_loan_limits)
plotting.quick_plot(missing_loan_limits)