## Applying Logistic Regression Model to 2017 Data ##

In [1]:
# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from scipy.stats import iqr
from sklearn import preprocessing
import pickle
from sklearn.preprocessing import MinMaxScaler



In [2]:
# Read the 2017 records from CSV; the first CSV column becomes the row index
data = pd.read_csv(
    "2017_Data.csv",
    header=0,
    index_col=0,
)
data.head()

Unnamed: 0,Year,Assigned_ID,Bank,MortDate,LTV_M,Purpose_M,Coop_M,Product_M,Occup_M,Back_M,BoCreditScor_M,PropType_M
0,2017,2092102,Atlanta,2016,0.13,1.61,1.0,0.879,1.0,1.0,0.743,1.0
1,2017,2092103,Atlanta,2016,0.13,1.61,1.0,0.879,1.0,1.0,0.743,1.0
2,2017,2092104,Atlanta,2016,0.13,1.0,1.0,0.879,1.0,1.431,0.743,1.0
3,2017,2092105,Atlanta,2016,0.13,1.61,1.0,0.879,1.0,1.431,1.76,1.0
4,2017,2092106,Atlanta,2016,0.13,1.61,1.0,0.879,1.0,2.02,2.85,1.0


In [4]:
# Set the Year, Assigned_ID, Bank and MortDate columns aside as separate
# Series, then remove them from `data` so only the "_M" attribute columns
# remain for the checks that follow.
data_year = data['Year']
data_assigned = data['Assigned_ID']
data_bank = data['Bank']
data_mortdate = data['MortDate']
data = data.drop(columns=['Year', 'Assigned_ID', 'Bank', 'MortDate'])

In [5]:
# Preview the first five rows of the remaining columns
data.head(n=5)

Unnamed: 0,LTV_M,Purpose_M,Coop_M,Product_M,Occup_M,Back_M,BoCreditScor_M,PropType_M
0,0.13,1.61,1.0,0.879,1.0,1.0,0.743,1.0
1,0.13,1.61,1.0,0.879,1.0,1.0,0.743,1.0
2,0.13,1.0,1.0,0.879,1.0,1.431,0.743,1.0
3,0.13,1.61,1.0,0.879,1.0,1.431,1.76,1.0
4,0.13,1.61,1.0,0.879,1.0,2.02,2.85,1.0


#### Outliers ####

__No outlier treatment is needed for this data, because every attribute is binned.__

#### Normalization ####

In [6]:
# Column-wise mean of each remaining attribute
data.mean(axis=0)

LTV_M             0.129780
Purpose_M         1.250602
Coop_M            1.000000
Product_M         0.879000
Occup_M           1.006970
Back_M            1.470387
BoCreditScor_M    1.469742
PropType_M        1.011208
dtype: float64

#### Skewness ####

In [7]:
# Column-wise skewness of each remaining attribute
data.skew(axis=0)

LTV_M             -3.271996
Purpose_M          0.362529
Coop_M             0.000000
Product_M          0.000000
Occup_M            6.551674
Back_M             0.353205
BoCreditScor_M     2.091029
PropType_M        10.975720
dtype: float64

__No skewness correction is applied: the attributes with large skew values are binned categorical variables, so the skew statistic is not meaningful for them.__

In [8]:
# Concatenating the dataframe back together
#
# Re-attach the Year, Bank, MortDate and Assigned_ID Series in one column-wise
# concat instead of four successive ones (each call copies the whole frame).
# The order of the list fixes the resulting column order:
# attribute columns first, then Year, Bank, MortDate, Assigned_ID.
data = pd.concat(
    [data, data_year, data_bank, data_mortdate, data_assigned],
    axis=1,
    join='inner',
)
data.head()

Unnamed: 0,LTV_M,Purpose_M,Coop_M,Product_M,Occup_M,Back_M,BoCreditScor_M,PropType_M,Year,Bank,MortDate,Assigned_ID
0,0.13,1.61,1.0,0.879,1.0,1.0,0.743,1.0,2017,Atlanta,2016,2092102
1,0.13,1.61,1.0,0.879,1.0,1.0,0.743,1.0,2017,Atlanta,2016,2092103
2,0.13,1.0,1.0,0.879,1.0,1.431,0.743,1.0,2017,Atlanta,2016,2092104
3,0.13,1.61,1.0,0.879,1.0,1.431,1.76,1.0,2017,Atlanta,2016,2092105
4,0.13,1.61,1.0,0.879,1.0,2.02,2.85,1.0,2017,Atlanta,2016,2092106


### Loading the Previously Built Logistic Regression Model ###

In [9]:
# Loading logistic regression model
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# 'logreg_model.p' must come from a trusted source.
# The context manager closes the file handle as soon as the model is read,
# instead of leaving it open for the lifetime of the kernel.
with open('logreg_model.p', 'rb') as model_file:
    logreg = pickle.load(model_file)

In [10]:
# Score the 2017 records with the loaded logistic regression model

# Use every column except the re-attached metadata columns and the prediction
# output column as a model input. Selecting by name (rather than slicing off
# the last four columns) keeps this cell correct if it is re-run after
# 'Portfolio_Worthy' has been added to `data`; column order is unchanged.
non_feature_cols = {'Year', 'Bank', 'MortDate', 'Assigned_ID', 'Portfolio_Worthy'}
pred_cols = [col for col in data.columns if col not in non_feature_cols]

# NOTE(review): the scaler is fitted on the 2017 data itself. If the model was
# trained on features scaled by a different MinMaxScaler, these inputs will not
# be on the same scale as in training -- confirm, and load the training scaler
# instead if one was saved.
scaler = MinMaxScaler()
X_pred = scaler.fit_transform(data[pred_cols])

# Carry data's index so each prediction stays attached to its own row when the
# Series is later assigned back into the frame.
predictions = pd.Series(logreg.predict(X_pred), index=data.index)
print(predictions.value_counts())

1    40928
0    15062
dtype: int64


In [11]:
# Attach the predicted class to each record. The values are assigned by
# position because `predictions` was produced from the rows of `data` in order;
# assigning the Series itself would align on index labels and could yield NaNs
# or misplaced labels whenever data's index is not 0..n-1.
data['Portfolio_Worthy'] = predictions.values
data.head()

Unnamed: 0,LTV_M,Purpose_M,Coop_M,Product_M,Occup_M,Back_M,BoCreditScor_M,PropType_M,Year,Bank,MortDate,Assigned_ID,Portfolio_Worthy
0,0.13,1.61,1.0,0.879,1.0,1.0,0.743,1.0,2017,Atlanta,2016,2092102,0
1,0.13,1.61,1.0,0.879,1.0,1.0,0.743,1.0,2017,Atlanta,2016,2092103,0
2,0.13,1.0,1.0,0.879,1.0,1.431,0.743,1.0,2017,Atlanta,2016,2092104,1
3,0.13,1.61,1.0,0.879,1.0,1.431,1.76,1.0,2017,Atlanta,2016,2092105,0
4,0.13,1.61,1.0,0.879,1.0,2.02,2.85,1.0,2017,Atlanta,2016,2092106,1


In [12]:
# Write the scored records, including the row index, to a comma-separated file
data.to_csv("2017_Good_Data.csv", sep=",", index=True)