In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import statsmodels.api as sm
import decimal
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
from pandas import Series, DataFrame
from patsy import dmatrices
from sklearn import datasets, svm
from sklearn.svm import SVR

#data = pd.read_csv('data', parse_dates=[[0,1]], infer_datetime_format=True)
df = pd.read_csv("training.csv", infer_datetime_format=True,low_memory=False)

columnNames = ['int_rate','id_loan','id_borrower','loan_amt','funded_amt','funded_amt_inv',
              'term','grade','subgrade','emp_title','emp_length','home_ownership','annual_inc',
               'verification_status','issue_d','purpose','loan_cat','loan_title','zip_code',
               'state','dti','delinq_2yrs','earliest_cr_line','inq_last_6mths','mths_since_last_deliq',
               'mths_since_last_record','num_opencr_line','num_der_rec','revol_bal','revol_util',
               'total_cr_line','init_list_status']

df.columns = columnNames

# Pandas can not store NaN in int type column, thus had remove all NaNs from them as well
for col in ('loan_amt','funded_amt','funded_amt_inv', 'term'):
    df = df[pd.notnull(df[col])]
    
# reset index
df = df.reset_index(drop=True)

D = decimal.Decimal
# Pandas can not store NaN in int type column, thus had remove all NaNs
for col in ('loan_amt','funded_amt','funded_amt_inv','term'):
    df[col] = df[col].str.replace(r'[^-+\d.]', '').astype(D)
    df[col] = df[col].astype('int')
    
#  Convert columns having % from obj(string type) to float and divide by 100    
df['int_rate'] = df['int_rate'].str.replace('%','').astype('float')/100

# Remove the rows where int_rate is missing
df = df[pd.notnull(df['int_rate'])]
# Reset index, this is critical
df = df.reset_index(drop=True)

df.drop(['id_loan','id_borrower'],1, inplace=True)

from datetime import datetime
# Read the column in the correct format
temp = pd.to_datetime(df['issue_d'],format = '%b-%d')

date_format = "%m/%d/%Y"
dttoday = datetime.strptime('1/1/1900', date_format)

d = np.zeros(len(temp))
for i in range(len(temp)):
    try:
        d[i] = ((np.timedelta64(temp[i] - pd.Timestamp(dttoday),'D').astype(int))/365)
        #break
    except:
        d[i] = ((np.timedelta64(temp[i] - pd.Timestamp(dttoday),'M').astype(int))/12)

df['issue_d'] = d

# Remove the rows where annual income claim is larger than 1 million
df = df[df['annual_inc'] <= 1000000]
# Reset index, this is critical
df = df.reset_index(drop=True)

df.drop(['emp_title'],1, inplace=True)

df.replace('n/a', np.nan,inplace=True)
df.emp_length.fillna(value=0,inplace=True)
df['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
df['emp_length'] = df['emp_length'].astype(int)

df.drop(['purpose'],1, inplace=True)
df.drop(['loan_title'],1, inplace=True)
df.drop(['zip_code'],1, inplace=True)

from datetime import datetime
# Read the column in the correct format
t = pd.to_datetime(df['earliest_cr_line'],format = '%b-%y')

date_format = "%m/%d/%Y"
dttoday = datetime.strptime('1/1/2017', date_format)

d = np.zeros(len(t))
for i in range(len(t)):
    try:
        d[i] = ((np.timedelta64(t[i] - pd.Timestamp(dttoday),'D').astype(int))/-365)
        #break
    except:
        d[i] = ((np.timedelta64(t[i] - pd.Timestamp(dttoday),'M').astype(int))/-12)

d[d<0] = d[d<0] + 100

df['earliest_cr_line'] = d

df.revol_util = pd.Series(df.revol_util).str.replace('%', '').astype(float)

df.drop(['funded_amt','funded_amt_inv','mths_since_last_deliq', 'total_cr_line'], axis=1, inplace=True)

In [19]:
# from sklearn import ensemble
# from sklearn import datasets
# from sklearn.utils import shuffle
# from sklearn.metrics import mean_squared_error
# from matplotlib import pyplot as plt

# y = df2.int_rate.values
# df2.drop('int_rate',axis = 1, inplace=True)

# X, y = shuffle(df2.values, y, random_state=30)
# #X = X.astype(np.float64)

# offset = int(X.shape[0] * 0.75)
# X_train, y_train = X[:offset], y[:offset]
# X_test, y_test = X[offset:], y[offset:]

In [24]:
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
from sklearn.grid_search import GridSearchCV

# param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
#               'max_depth': [4, 6],
#               'min_samples_leaf': [3, 5, 9, 17],
#               'max_features': [1.0, 0.3, 0.1]
#               }
param_grid = {'learning_rate': [0.1],
              'max_depth': [4],
              'min_samples_leaf': [3],
              'max_features': [1.0],
              }

est = GridSearchCV(ensemble.GradientBoostingRegressor(n_estimators=100),
                   param_grid, n_jobs=4, refit=True)

# Create an acceptable formula for our machine learning algorithms
#formula_ml = 'int_rate ~ C(grade) + C(subgrade) + annual_inc + dti + mths_since_last_record + C(state)'

formula_ml = 'int_rate ~ loan_amt '
for i in range(2,len(df.columns)):
    
    if str(df.dtypes[i]) == 'object': 
        formula_ml = formula_ml + ' + ' + 'C(' + df.columns[i] + ')'
    elif (str(df.dtypes[i]) == 'float64') | (str(df.dtypes[i]) == 'float32'):
        formula_ml = formula_ml + ' + ' + df.columns[i]
        
# import the machine learning library that holds the randomforest
import sklearn.ensemble as ske

# Create the random forest model and fit the model to our training data
y, x = dmatrices(formula_ml, data=df, return_type='dataframe')
# RandomForestClassifier expects a 1 demensional NumPy array, so we convert
y = np.asarray(y).ravel()

# For prototype build, only use a tiny portion to test to save time
testet = int(x.shape[0] * 0.05)
x,y =  x[:testet], y[:testet]

## Split data for testing and training
offset = int(x.shape[0] * 0.75)
X_train, y_train = x[:offset], y[:offset]
X_test, y_test = x[offset:], y[offset:]

#instantiate and fit our model
est.fit(X_train, y_train)

best_params = est.best_params_

In [None]:
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
# Try Support Vector Regression
#failed as there are a lot string type predictors in Y
param_grid = {'C': [100, 5, 10],
              'kernel':('linear','rbf','poly'),
              'epsilon': [1, 2, 5, 10],
              'gamma': [10.0, 0.3, 0.1]
              }

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# For prototype build, only use a tiny portion to test to save time

X, y = shuffle(df.values, y, random_state=30)

testet = int(X.shape[0] * 0.05)
X,y =  X[:testet], y[:testet]

## Split data for testing and training
offset = int(X.shape[0] * 0.75)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
                 
svr = svm.SVR()
clf = GridSearchCV(svm.SVR(),tuned_parameters, n_jobs=4, refit=True)
#clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f"
      % (clf.best_params_, clf.best_score_))
from sklearn.cross_validation import StratifiedShuffleSplit

kernel = ('linear','rbf','poly')
C_range = [0.000001,0.01,0.1,1,10,20,50]
epsilon=  [0.00001,10,20,50,100]
gamma_range = np.logspace(-99, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range, kernel = kernel,epsilon=epsilon)
#cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVR(), param_grid=param_grid)
grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

In [2]:
df = pd.get_dummies(df)

In [6]:
df = df.dropna()
# reset index
df = df.reset_index(drop=True)

In [7]:
from sklearn import linear_model
from sklearn.utils import shuffle
y = df.int_rate.values

X, y = shuffle(df.values, y, random_state=30)

clf = linear_model.SGDRegressor()
clf.fit(X, y)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

In [10]:
model = sm.OLS(y,X)
f = model.fit()

print ('Coefficients: ', f.params[0:2])
print ('Intercept: ', f.params[2])
print ('P-Values: ', f.pvalues)
print ('R-Squared: ', f.rsquared)

Coefficients:  [  1.00000000e+00  -4.30332974e-18]
Intercept:  -8.3848241435e-17
P-Values:  [  0.00000000e+00   2.57519411e-15   8.17183010e-01   9.72282066e-01
   0.00000000e+00   4.16335954e-03   4.59408667e-01   4.74378762e-12
   7.97354651e-02   9.63592345e-01   7.74801798e-02   5.55669861e-01
   3.63235517e-03   0.00000000e+00   4.58801081e-01   2.47003642e-03
   5.99043520e-01   5.90999871e-01   2.88957834e-01   9.50901726e-03
   4.83890809e-02   4.54312533e-01   7.44671971e-01   7.03684281e-01
   5.78930145e-01   5.50063768e-02   1.15857144e-01   5.95404938e-03
   4.48757626e-03   1.12243170e-01   3.15777347e-01   6.40378140e-01
   7.79928398e-01   1.97570890e-01   1.79383394e-01   2.28003302e-02
   3.05773056e-02   2.88877305e-01   3.47057342e-01   1.28821381e-01
   9.92152059e-01   7.43828929e-01   3.93163208e-02   5.60534673e-01
   4.03728969e-01   5.48214250e-01   1.27013170e-01   8.87022950e-02
   4.27147049e-04   3.20480222e-01   2.34622602e-01   1.23395394e-02
   6.746503

In [11]:
f.predict(X)

array([ 0.1299,  0.1449,  0.1398, ...,  0.1531,  0.1499,  0.1212])

In [12]:
y

array([ 0.1299,  0.1449,  0.1398, ...,  0.1531,  0.1499,  0.1212])

In [None]:
# from sklearn.svm import SVR
# # Set the parameters by cross-validation
# tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                      'C': [1, 10, 100, 1000]},
#                     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# scores = ['precision', 'recall']

# for score in scores:
#     print("# Tuning hyper-parameters for %s" % score)
#     print()

#     clf = GridSearchCV(SVR(C=1), tuned_parameters, cv=5,
#                        scoring='%s_weighted' % score)
#     clf.fit(X_train, y_train)

#     print("Best parameters set found on development set:")
#     print()
#     print(clf.best_params_)
#     print()
#     print("Grid scores on development set:")
#     print()
#     for params, mean_score, scores in clf.grid_scores_:
#         print("%0.3f (+/-%0.03f) for %r"
#               % (mean_score, scores.std() * 2, params))
#     print()


In [None]:
# # Create an acceptable formula for our machine learning algorithms
# formula_ml = 'int_rate ~ C(grade) + C(subgrade) + annual_inc + dti + term + C(state)'
# # import the machine learning library that holds the randomforest
# import sklearn.ensemble as ske

# # Create the random forest model and fit the model to our training data
# y, x = dmatrices(formula_ml, data=df, return_type='dataframe')
# # RandomForestClassifier expects a 1 demensional NumPy array, so we convert
# y = np.asarray(y).ravel()
# #instantiate and fit our model
# results_rf = ske.RandomForestClassifier(n_estimators=100).fit(x, y)

# # Score the results
# score = results_rf.score(x, y)
# print ("Mean accuracy of Random Forest Predictions on the data was: {0}".format(score))

In [10]:
df.dtypes

int_rate                  float64
loan_amt                    int32
term                        int32
grade                      object
subgrade                   object
emp_length                  int32
home_ownership             object
annual_inc                float64
verification_status        object
issue_d                   float64
loan_cat                   object
state                      object
dti                       float64
delinq_2yrs               float64
earliest_cr_line          float64
inq_last_6mths            float64
mths_since_last_record    float64
num_opencr_line           float64
num_der_rec               float64
revol_bal                 float64
revol_util                float64
init_list_status           object
dtype: object

In [18]:
str(df.dtypes[1]) == 'object'

False

In [22]:
# Create an acceptable formula for our machine learning algorithms
#formula_ml = 'int_rate ~ C(grade) + loan_amt + C(subgrade) + annual_inc + dti + mths_since_last_record + C(state)'
formula_ml = 'int_rate ~ loan_amt '
for i in range(2,len(df.columns)):
    
    if str(df.dtypes[i]) == 'object': 
        formula = formula + ' + ' + 'C(' + df.columns[i] + ')'
    elif (str(df.dtypes[i]) == 'float64') | (str(df.dtypes[i]) == 'float32'):
        formula = formula + ' + ' + df.columns[i]
        
    

In [23]:
formula

'int_rate ~ loan_amt  + C(grade) + C(subgrade) + C(home_ownership) + annual_inc + C(verification_status) + issue_d + C(loan_cat) + C(state) + dti + delinq_2yrs + earliest_cr_line + inq_last_6mths + mths_since_last_record + num_opencr_line + num_der_rec + revol_bal + revol_util + C(init_list_status)'