In [173]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

### Process data

In [174]:
# Load data
loansData = pd.read_csv('https://github.com/Thinkful-Ed/curric-data-001-data-sets/raw/master/loans/loansData.csv')

# Clean and process data
# Vectorized string slicing replaces the previous `column = map(lambda ...)`
# assignments: under Python 3 map() returns a lazy iterator rather than a
# list, so assigning it to a column no longer yields the converted values.
# The slices are the same as before.

# Drop the last character (the percent sign in the displayed data, e.g. 8.9)
# and keep the rate as a float rounded to 4 decimals.
loansData['Interest.Rate'] = loansData['Interest.Rate'].str[:-1].astype(float).round(4)

# Drop the trailing 7 characters so only the leading integer remains
# (36 / 60 in the displayed data).
loansData['Loan.Length'] = loansData['Loan.Length'].str[:-7].astype(int)

# Use the first three characters of FICO.Range as an integer score.
loansData['FICO.Score'] = loansData['FICO.Range'].str[:3].astype(int)

# Show columns we'll use for the regressions
loansData[['Interest.Rate', 'Loan.Length', 'FICO.Score']].head()

Unnamed: 0,Interest.Rate,Loan.Length,FICO.Score
81174,8.9,36,735
99592,12.12,36,715
80059,21.98,60,690
15825,9.99,36,695
33182,11.71,36,695


### Split and regress

In [175]:
# Estimator evaluated in the cross-validation cells below
model = LinearRegression()

# Predictors: loan length and FICO score, stacked side by side as columns
loan_length = loansData['Loan.Length']
fico_score = loansData['FICO.Score']
X = np.column_stack((loan_length, fico_score))

# Target: interest rate as a plain Python list
y = loansData['Interest.Rate'].tolist()

# Number of rows in the design matrix
num_instances = X.shape[0]

# 10 folds, default (unshuffled) splitting
kfold = KFold(n_splits=10)

### Cross-validation

In [176]:
# One score per fold. No `scoring` argument is given, so the estimator's
# default scorer is used -- for LinearRegression that is R^2, so the
# "Accuracy" label below is really the mean/std of per-fold R^2 shown as a
# percentage.
results = cross_val_score(model, X, y, cv=kfold)

# Apply the % formatting to the string *inside* the print call. The previous
# form `print("...") % (...)` only works as a Python 2 print statement; under
# Python 3 it prints the raw template and then raises TypeError (None % tuple).
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 68.742% (2.346%)


In [177]:
# Out-of-fold predictions: each entry is produced by the model fitted on the
# folds that did not contain that row.
results = cross_val_predict(model, X, y, cv=kfold)

# Score the out-of-fold predictions against the true targets with each metric,
# in the order MSE, MAE, R^2.
MSE, MAE, R2S = (
    metric(y, results)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("MSE: {}, MAE: {}, R2S: {}".format(MSE, MAE, R2S))

MSE: 5.42648996084, MAE: 1.84368454386, R2S: 0.689037515442


### Checking R2

In [178]:
# Design matrix with an added constant column so OLS fits an intercept
J = sm.add_constant(X)

# Ordinary least squares on the full data set (no cross-validation), used to
# compare its R-squared with the cross-validated figures.
# NOTE: this rebinds `model`, which previously held the LinearRegression
# estimator used by the cross-validation cells.
model = sm.OLS(endog=y, exog=J)
f = model.fit()

print(f.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.690
Model:                            OLS   Adj. R-squared:                  0.690
Method:                 Least Squares   F-statistic:                     2776.
Date:                Sat, 04 Feb 2017   Prob (F-statistic):               0.00
Time:                        23:55:26   Log-Likelihood:                -5658.3
No. Observations:                2500   AIC:                         1.132e+04
Df Residuals:                    2497   BIC:                         1.134e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         65.7041      0.957     68.660      0.0