# 3.4.2-4 Modelling Dataset Versions
#### normalization, balancing and train/test splits

## 1) Load Required Libraries

In [82]:
library("dplyr")
library("caret")
library("mltools")

## 2) Load Data

In [2]:
# Load the transformed loan dataset from CSV and preview its columns.
# stringsAsFactors is set explicitly: since R 4.0.0 read.csv() reads text
# columns as character by default, but the factor relabelling and the one-hot
# encoding later in this notebook need the categorical columns to be factors.
transformed2.df <- read.csv(
  file = "TransfromedData2.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
glimpse(transformed2.df)

Observations: 884,570
Variables: 27
$ X                                      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,…
$ loan_amnt                              <int> 5600, 7500, 12200, 11200, 2500…
$ term                                   <fct>  36 months,  36 months,  36 mo…
$ int_rate                               <dbl> 18.04, 13.87, 12.29, 11.34, 15…
$ installment                            <dbl> 202.57, 255.86, 406.91, 368.48…
$ grade                                  <fct> G, D, C, C, E, C, C, A, B, C, …
$ home_ownership                         <fct> OWN, MORTGAGE, OWN, RENT, MORT…
$ annual_inc                             <dbl> 0.6677529, 0.7174611, 0.661276…
$ verification_status                    <fct> Not Verified, Not Verified, No…
$ pymnt_plan                             <fct> n, n, n, n, n, n, n, n, n, n, …
$ purpose                                <fct> home_improvement, Others, home…
$ dti                                    <dbl> 25.84, 20.95, 9.31, 6.92, 6.82…
$ open_acc      

## 3) Transform Data for Modelling

### 3.1) Basic Transforms

Convert all factor levels to uppercase and replace spaces with underscores to make them more readable

In [3]:
# Relabel the categorical columns with upper-case, underscore-separated names.
# `labels` is applied positionally to each column's existing levels, so the
# order of every label vector below is significant.
transformed2.df <- transformed2.df %>%
  mutate(
    term = factor(term, labels = c("36MO", "60MO")),
    verification_status = factor(
      verification_status,
      labels = c("NOT_VERIFIED", "SOURCE_VERIFIED", "VERIFIED")
    ),
    purpose = factor(
      purpose,
      labels = c(
        "CREDIT_CARD", "DEBT_CONSOLIDATION", "HOME_IMPROVEMENT",
        "MAJOR_PURCHASE", "OTHERS", "SMALL_BUSINESS"
      )
    ),
    initial_list_status = factor(initial_list_status, labels = c("F", "W")),
    regions = factor(
      regions,
      labels = c("MIDWEST", "NORTHEAST", "SOUTH", "WEST")
    ),
    pymnt_plan = factor(pymnt_plan, labels = c("N", "Y"))
  )

Rename external source columns for readability and ease of programmatic reference

In [4]:
# Give the externally sourced columns snake_case names (new_name = old_name).
transformed2.df <- transformed2.df %>%
  rename(
    compensation_of_employees = Compensation.of.employees,
    gross_operating_surplus = Gross.operating.surplus,
    per_capita_real_gdp_by_state = Per.capita.real.GDP.by.state,
    quantity_indexes_for_real_gdp_by_state =
      Quantity.indexes.for.real.GDP.by.state,
    real_gdp_by_state = Real.GDP.by.state,
    subsidies = Subsidies,
    taxes_on_production_and_imports = Taxes.on.production.and.imports
  )

Add a `returns` column holding the amount expected to be gained from the loan from interest alone

In [5]:
# returns = (monthly installment x number of payments) - principal.
# The number of payments is 36 for '36MO' loans and 60 otherwise.
transformed2.df <- transformed2.df %>%
  mutate(
    returns = installment * ifelse(term == "36MO", 36, 60) - loan_amnt
  )

Glimpse newly transformed dataframe

In [6]:
# Preview the relabelled, renamed data frame including the new `returns` column.
transformed2.df %>% glimpse()

Observations: 884,570
Variables: 28
$ X                                      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,…
$ loan_amnt                              <int> 5600, 7500, 12200, 11200, 2500…
$ term                                   <fct> 36MO, 36MO, 36MO, 36MO, 36MO, …
$ int_rate                               <dbl> 18.04, 13.87, 12.29, 11.34, 15…
$ installment                            <dbl> 202.57, 255.86, 406.91, 368.48…
$ grade                                  <fct> G, D, C, C, E, C, C, A, B, C, …
$ home_ownership                         <fct> OWN, MORTGAGE, OWN, RENT, MORT…
$ annual_inc                             <dbl> 0.6677529, 0.7174611, 0.661276…
$ verification_status                    <fct> NOT_VERIFIED, NOT_VERIFIED, NO…
$ pymnt_plan                             <fct> N, N, N, N, N, N, N, N, N, N, …
$ purpose                                <fct> HOME_IMPROVEMENT, OTHERS, HOME…
$ dti                                    <dbl> 25.84, 20.95, 9.31, 6.92, 6.82…
$ open_acc      

### 3.2) One-hot encoded version (for monotonic constraints)

In [7]:
# Build the one-hot encoded version of the dataset and preview it.
# one_hot() takes a data.table, so the data frame is converted first.
# as.data.table() is namespace-qualified because library("mltools") does not
# attach data.table, and the bare call fails in a fresh session.
transformed2.1hot.df <- one_hot(data.table::as.data.table(transformed2.df))
glimpse(transformed2.1hot.df)

Observations: 884,570
Variables: 50
$ X                                      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,…
$ loan_amnt                              <int> 5600, 7500, 12200, 11200, 2500…
$ term_36MO                              <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ term_60MO                              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ int_rate                               <dbl> 18.04, 13.87, 12.29, 11.34, 15…
$ installment                            <dbl> 202.57, 255.86, 406.91, 368.48…
$ grade_A                                <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
$ grade_B                                <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
$ grade_C                                <int> 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, …
$ grade_D                                <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ grade_E                                <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
$ grade_F                                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ grade_G       

### 3.3) Train / Test split both versions

Initialize Seed

In [8]:
# Fix the RNG seed so the random partitions below are reproducible.
seedNum <- 19
set.seed(seedNum)

Create partitions

In [9]:
# Pick 85% of the rows for training. createDataPartition() samples within
# groups of `default`, so both splits keep a similar outcome distribution.
train.indx <- createDataPartition(
  y = transformed2.df$default,
  p = 0.85,
  list = FALSE
)

# Apply the same row indices to both dataset versions so they stay aligned.
# The first column (`X`, the row counter read from the CSV) is dropped.
train.df <- transformed2.df[train.indx, -1]
test.df <- transformed2.df[-train.indx, -1]
train.1hot.df <- transformed2.1hot.df[train.indx, -1]
test.1hot.df <- transformed2.1hot.df[-train.indx, -1]

### 3.4) Create sampled versions of both training datasets (for cross validation)

Draw a 15% subsample of the training dataset (sampled on `default`) for cross validation

In [10]:
# Draw a 15% subsample of the training rows, sampled on `default`.
train.samp.indx <- createDataPartition(
  y = train.df$default,
  p = 0.15,
  list = FALSE
)

# The indices are positions within the training split, so they must be applied
# to train.df / train.1hot.df. Applying them to the full dataset would pull
# held-out test rows into the cross-validation sample. The plain sample comes
# from train.df and the one-hot sample from train.1hot.df.
# No column is dropped here: `X` was already removed when the training frames
# were built. `[, 1]` turns the one-column index matrix into a plain vector.
train.samp.df <- train.df[train.samp.indx[, 1], ]
train.samp.1hot.df <- train.1hot.df[train.samp.indx[, 1], ]

### 3.5) Save Datasets

In [12]:
# Persist every dataset version to CSV in the working directory.
# Each entry maps the output file name to the object written to it.
# write.csv() keeps its defaults, so row names are written as the first column.
datasets.to.save <- list(
  "transformed2.df.csv" = transformed2.df,
  "transformed2.1hot.df.csv" = transformed2.1hot.df,
  "train.df.csv" = train.df,
  "train.samp.df.csv" = train.samp.df,
  "train.1hot.df.csv" = train.1hot.df,
  "train.samp.1hot.df.csv" = train.samp.1hot.df,
  "test.df.csv" = test.df,
  "test.1hot.df.csv" = test.1hot.df
)
for (csv.path in names(datasets.to.save)) {
  write.csv(datasets.to.save[[csv.path]], file = csv.path)
}