### SVR on at most 10,000 samples, as advised by the documentation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

from math import sqrt

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the SVR training table from a CSV located relative to the working directory.
df1 = pd.read_csv('trainforsvm.csv')

In [3]:
# Preview the first rows to check column layout and value scales.
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,loan_amount,applicant_income,rate_spread
0,0,1,0,0,1,0,0,1,0,0,-0.025077,0.042033,1
1,0,1,0,0,0,1,0,1,0,0,0.12223,-0.298088,2
2,1,0,0,0,0,1,0,1,0,0,0.178347,-0.37583,3
3,1,0,0,0,0,1,0,1,0,0,-0.600277,-0.346677,6
4,0,1,0,0,1,0,0,0,1,0,-0.824745,-0.259217,4


In [4]:
# Column dtypes and non-null counts (used to confirm there are no missing values).
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 13 columns):
0                   9999 non-null int64
1                   9999 non-null int64
2                   9999 non-null int64
3                   9999 non-null int64
4                   9999 non-null int64
5                   9999 non-null int64
6                   9999 non-null int64
7                   9999 non-null int64
8                   9999 non-null int64
9                   9999 non-null int64
loan_amount         9999 non-null float64
applicant_income    9999 non-null float64
rate_spread         9999 non-null int64
dtypes: float64(2), int64(11)
memory usage: 1015.6 KB


In [5]:
# Summary statistics for every column, including the rate_spread target.
df1.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,loan_amount,applicant_income,rate_spread
count,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
mean,0.454045,0.530053,0.006901,0.009001,0.848085,0.150515,0.0014,0.734373,0.051205,0.214421,0.027453,0.001151,1.950195
std,0.497909,0.499121,0.082787,0.09445,0.358957,0.357593,0.037394,0.441688,0.220427,0.410441,1.098622,0.817862,1.580386
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.993096,-0.677079,1.0
25%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.523116,-0.317523,1.0
50%,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.172384,-0.16204,1.0
75%,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.269538,0.090621,2.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.17043,36.172555,8.0


In [6]:
# (rows, columns) of the training table.
df1.shape

(9999, 13)

In [7]:
# Convert the frame to a NumPy array so features/target can be sliced by column position.
array = df1.values

In [8]:
# Sanity check: the conversion above should yield a numpy.ndarray.
type(array)

numpy.ndarray

In [9]:
# Columns 0-11 are the model inputs; column 12 (rate_spread, the last column
# listed by df1.info()) is the regression target.
X, y = array[:, :12], array[:, 12]

In [10]:
# Feature matrix dimensions: (samples, features).
X.shape

(9999, 12)

In [11]:
# Target vector length; should match the number of rows in X.
y.shape

(9999,)

### Cross-validation on training dataset

In [12]:
# Baseline regressor: SVR with all hyperparameters left at their library defaults.
model = SVR()

In [13]:
# 5-fold cross-validation of the baseline SVR, scored with R^2. Train-fold
# scores are returned too, so seen vs. unseen performance can be compared.
cv_svr = cross_validate(
    estimator=model,
    X=X,
    y=y,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

In [14]:
# Raw cross-validation output: fit/score times plus per-fold train and test scores.
cv_svr

{'fit_time': array([4.15203261, 4.1989038 , 4.15203261, 4.12077618, 2.52951288]),
 'score_time': array([0.40624309, 0.42187476, 0.37499571, 0.42187667, 0.25020719]),
 'test_score': array([0.51212592, 0.51253852, 0.48625515, 0.51987865, 0.50591396]),
 'train_score': array([0.50907168, 0.51522991, 0.51589705, 0.51236158, 0.51742739])}

In [15]:
# Mean R^2 across folds, for the train splits and the held-out splits respectively.
svr_train_average, svr_test_average = (
    np.mean(cv_svr[key]) for key in ('train_score', 'test_score')
)

In [16]:
# Train average on the first line, test average on the second.
print(svr_train_average, svr_test_average, sep='\n')

0.5139975220969052
0.5073424401500678


### Using Grid Search to find best parameters

In [29]:
# Fresh default SVR used as the base estimator for the grid search below.
model = SVR()

In [38]:
# Candidate values for the SVR regularisation strength (C) and kernel coefficient (gamma).
# NOTE(review): the recorded best gamma (1) is the smallest value in this grid, so
# smaller gammas may be worth adding - confirm with a wider search.
params_grid = dict(
    C=[1.0, 2.0, 3.0],
    gamma=[1, 5, 10],
)

In [39]:
# Exhaustive search over params_grid with 3-fold CV, ranked by R^2.
# Train scores are kept so over-fitting per candidate can be inspected later.
grid_search_svr = GridSearchCV(
    estimator=model,
    param_grid=params_grid,
    scoring='r2',
    n_jobs=-1,
    cv=3,
    verbose=True,
    return_train_score=True,
)

In [40]:
# Fit every candidate on each CV fold of the training data (9 candidates x 3 folds).
grid_search_svr.fit(X,y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.0min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [1.0, 2.0, 3.0], 'gamma': [1, 5, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='r2', verbose=True)

In [41]:
# List the hyperparameter names SVR exposes, i.e. the names usable as keys in params_grid.
model.get_params().keys()

dict_keys(['C', 'cache_size', 'coef0', 'degree', 'epsilon', 'gamma', 'kernel', 'max_iter', 'shrinking', 'tol', 'verbose'])

In [42]:
# Hyperparameter combination with the highest mean cross-validated R^2.
grid_search_svr.best_params_  # recorded run: {'C': 2.0, 'gamma': 1}

{'C': 2.0, 'gamma': 1}

In [46]:
# Tabulate the per-candidate grid-search results (timings, fold scores, ranks).
cv_results_svr = pd.DataFrame(grid_search_svr.cv_results_)

In [47]:
# One row per (C, gamma) candidate; compare mean_test_score against mean_train_score.
cv_results_svr

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,3.500033,0.057847,0.621824,0.013008,1,1,"{'C': 1.0, 'gamma': 1}",0.52303,0.504272,0.520568,0.515957,0.008323,3,0.52834,0.541921,0.534861,0.535041,0.005546
1,4.142167,0.047205,0.698042,0.027105,1,5,"{'C': 1.0, 'gamma': 5}",0.513868,0.496199,0.517348,0.509138,0.009259,4,0.538772,0.551922,0.54583,0.545508,0.005373
2,5.018913,0.730947,0.786425,0.032414,1,10,"{'C': 1.0, 'gamma': 10}",0.507977,0.49158,0.514948,0.504835,0.009795,7,0.54509,0.561109,0.556169,0.554123,0.006698
3,5.30636,0.257168,0.777863,0.015217,2,1,"{'C': 2.0, 'gamma': 1}",0.523057,0.505717,0.523586,0.517453,0.008301,1,0.53531,0.547204,0.539836,0.540783,0.004902
4,6.812133,0.17493,0.763724,0.052968,2,5,"{'C': 2.0, 'gamma': 5}",0.509065,0.496735,0.516567,0.507456,0.008176,5,0.548737,0.560303,0.555576,0.554872,0.004748
5,7.739131,0.352769,0.913865,0.101574,2,10,"{'C': 2.0, 'gamma': 10}",0.506116,0.490016,0.510436,0.502189,0.008787,8,0.55849,0.572375,0.567214,0.566027,0.00573
6,6.01773,0.284178,0.799768,0.017023,3,1,"{'C': 3.0, 'gamma': 1}",0.521518,0.506019,0.523755,0.517097,0.007887,2,0.538173,0.549937,0.541722,0.543277,0.004927
7,7.931338,0.301533,0.930647,0.064589,3,5,"{'C': 3.0, 'gamma': 5}",0.506719,0.497776,0.516188,0.506895,0.007518,6,0.553841,0.565403,0.560672,0.559972,0.004746
8,8.617518,0.335549,0.658324,0.058368,3,10,"{'C': 3.0, 'gamma': 10}",0.500365,0.487912,0.508406,0.498894,0.008431,9,0.564902,0.579009,0.573349,0.57242,0.005796


In [50]:
# Estimator carrying the best parameters; the search was built with the default
# refit=True, so this model has been refit on the data passed to fit().
bestsvr = grid_search_svr.best_estimator_

In [51]:
# Show the full hyperparameter set of the selected model.
bestsvr

SVR(C=2.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

## Predict on the test set

### Load in test set

In [None]:
# Load the test features from a CSV located relative to the working directory.
X_test = pd.read_csv('testmodified.csv')

In [None]:
# Preview the test features; the column layout should mirror the training features.
X_test.head()

In [None]:
# (rows, columns) of the test table.
X_test.shape

In [None]:
# Predict the target for every test row with the tuned SVR.
# NOTE(review): assumes X_test holds exactly the same 12 feature columns, in the
# same order, as the X array used for fitting - TODO confirm against the CSV.
y_predict = bestsvr.predict(X_test)

In [None]:
# Inspect the raw predicted values.
y_predict

In [None]:
# Number of predictions; should equal the number of rows in X_test.
y_predict.shape

In [None]:
# Wrap the predictions in a DataFrame for plotting and CSV export below.
svrprediction = pd.DataFrame(y_predict)

In [None]:
# Display the prediction table.
svrprediction

In [None]:
# Plot the distribution of the predicted values.
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of
# histplot/displot - confirm the installed version before upgrading.
sns.distplot(svrprediction)

In [None]:
# y holds the *training* targets, whereas y_predict was produced from X_test,
# so the two do not describe the same rows and cannot be scored against each
# other. The test file has no target column, so score the tuned model on the
# training features instead. This is an in-sample (optimistic) estimate; the
# cross-validated scores above are the honest generalisation estimate.
MSE = mean_squared_error(y, bestsvr.predict(X))
print("Mean Squared Error is", MSE)

In [None]:
# Root of the MSE computed earlier, stored as a string rounded to three decimals.
RMSE = '{:.3f}'.format(np.sqrt(MSE))
print("RMSE is", RMSE)

In [None]:
# y holds the *training* targets, whereas y_predict was produced from X_test,
# so they are not aligned row-for-row. Score the tuned model on the training
# features instead; this is an in-sample (optimistic) mean absolute error.
MAE = mean_absolute_error(y, bestsvr.predict(X))
print("MAE is", MAE)

In [None]:
# y holds the *training* targets, whereas y_predict was produced from X_test,
# so they are not aligned row-for-row. Score the tuned model on the training
# features instead; this is an in-sample R^2 and will read higher than the
# cross-validated R^2 reported by the grid search.
r2 = r2_score(y, bestsvr.predict(X))
print("R2 score is", r2)

In [None]:
# Write the test-set predictions to disk. Default to_csv settings are used, so the
# row index is written as the first column alongside the prediction column.
svrprediction.to_csv('svrresult.csv')

**Result is 0.3669**