<h1>Lending Club Data</h1>

<h2>Multivariate Regression</h2>

In [57]:
# import libraries
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import numpy as np
import math
# from patsy.contrasts import Treatment
from sklearn.linear_model import LogisticRegression

In [58]:
# read data into dataframe
loansData = pd.read_csv("LoanStats3d.csv", skiprows=[0])
loansData.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code
0,53624987,57155760,6000,6000,6000,36 months,11.53%,197.95,B,B5,...,0,0,0,,0.0,Aug-2015,Jul-2015,0,,1
1,53584764,57115519,6200,6200,6200,36 months,7.89%,193.98,A,A5,...,0,0,0,Jul-2015,193.98,Aug-2015,Jul-2015,0,,1
2,53222534,56742275,19000,19000,19000,36 months,8.18%,596.97,B,B1,...,0,0,0,Jul-2015,596.97,Aug-2015,Jul-2015,0,47.0,1
3,53564350,57095071,21000,21000,20975,60 months,21.99%,579.88,F,F1,...,0,0,0,,0.0,Aug-2015,Jul-2015,0,,1
4,52038169,55467933,10000,10000,10000,60 months,10.99%,217.38,B,B4,...,0,0,0,Jul-2015,217.38,Aug-2015,Jul-2015,0,,1


In [59]:
# data clean up 

# remove na values
annual_inc = loansData["annual_inc"].dropna()
int_rate = loansData["int_rate"].dropna()

# remove % from interest rate values
int_rate = int_rate.map(lambda x: float(x.rstrip("%")))

In [60]:
# use annual income to model interest rate 

# ols model
loansData["int_rate"] = int_rate
loansData["annual_inc"] = annual_inc
f = sm.formula.ols(formula='int_rate ~ annual_inc', data=loansData).fit()
f.summary()

0,1,2,3
Dep. Variable:,int_rate,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,2103.0
Date:,"Sun, 06 Sep 2015",Prob (F-statistic):,0.0
Time:,22:59:25,Log-Likelihood:,-520270.0
No. Observations:,180102,AIC:,1041000.0
Df Residuals:,180100,BIC:,1041000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,13.4117,0.016,814.911,0.000,13.379 13.444
annual_inc,-7.807e-06,1.7e-07,-45.860,0.000,-8.14e-06 -7.47e-06

0,1,2,3
Omnibus:,9973.069,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11901.801
Skew:,0.586,Prob(JB):,0.0
Kurtosis:,3.461,Cond. No.,155000.0


In [61]:
# add home ownership to model 

# user id which will serve as unique id to join loansData and home ownership dummies
unique_key = loansData["id"]

# get dummy variables for home ownership 
home_ownership = loansData["home_ownership"]
home_ownership_dummies = pd.get_dummies(home_ownership)
home_ownership_dummies["unique_key"] = unique_key 
home_ownership_dummies

# join based on unique key (id)
loansData = pd.merge(loansData, home_ownership_dummies, on=unique_key, how='left')

# drop extra unique key column
loansData = loansData.drop('unique_key', 1)
loansData.head(15)

# reshape mortgage/own/rent into one column? 



Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,MORTGAGE,OWN,RENT
0,53624987,57155760,6000,6000,6000,36 months,11.53,197.95,B,B5,...,,0.0,Aug-2015,Jul-2015,0,,1,0,0,1
1,53584764,57115519,6200,6200,6200,36 months,7.89,193.98,A,A5,...,Jul-2015,193.98,Aug-2015,Jul-2015,0,,1,0,0,1
2,53222534,56742275,19000,19000,19000,36 months,8.18,596.97,B,B1,...,Jul-2015,596.97,Aug-2015,Jul-2015,0,47.0,1,1,0,0
3,53564350,57095071,21000,21000,20975,60 months,21.99,579.88,F,F1,...,,0.0,Aug-2015,Jul-2015,0,,1,1,0,0
4,52038169,55467933,10000,10000,10000,60 months,10.99,217.38,B,B4,...,Jul-2015,217.38,Aug-2015,Jul-2015,0,,1,0,0,1
5,53584763,57115517,5400,5400,5400,36 months,17.57,194.07,D,D4,...,Jul-2015,194.07,Aug-2015,Jul-2015,0,,1,0,0,1
6,51057384,54457145,19200,19200,19200,60 months,15.61,462.94,D,D1,...,Jul-2015,462.94,Aug-2015,Jul-2015,0,,1,1,0,0
7,53674434,57205168,16550,16550,16500,60 months,17.86,419.01,D,D5,...,Jul-2015,419.01,Aug-2015,Jul-2015,0,,1,0,0,1
8,53312695,56822456,14000,14000,13975,60 months,18.55,359.72,E,E2,...,Jul-2015,359.72,Aug-2015,Jul-2015,0,,1,1,0,0
9,51978619,55408386,15000,15000,15000,60 months,11.53,330.12,B,B5,...,Jul-2015,330.12,Aug-2015,Jul-2015,0,,1,1,0,0


In [24]:
# add interaction of home ownership and income

f = sm.formula.ols(formula="int_rate ~ annual_inc + C(home_ownership_dummies)")

res2 = smf.ols(formula='Lottery ~ Literacy * Wealth - 1', data=df).fit()


TypeError: from_formula() takes at least 3 arguments (2 given)