In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats

## Interest Rate

### Import

In [2]:
train = pd.read_csv('loan_data_train.csv')
test = pd.read_csv('loan_data_test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 15 columns):
ID                                2199 non-null float64
Amount.Requested                  2199 non-null object
Amount.Funded.By.Investors        2199 non-null object
Interest.Rate                     2200 non-null object
Loan.Length                       2199 non-null object
Loan.Purpose                      2199 non-null object
Debt.To.Income.Ratio              2199 non-null object
State                             2199 non-null object
Home.Ownership                    2199 non-null object
Monthly.Income                    2197 non-null float64
FICO.Range                        2200 non-null object
Open.CREDIT.Lines                 2196 non-null object
Revolving.CREDIT.Balance          2197 non-null object
Inquiries.in.the.Last.6.Months    2197 non-null float64
Employment.Length                 2131 non-null object
dtypes: float64(3), object(12)
memory usage: 257.9+ KB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 14 columns):
ID                                300 non-null int64
Amount.Requested                  300 non-null int64
Amount.Funded.By.Investors        300 non-null object
Loan.Length                       300 non-null object
Loan.Purpose                      300 non-null object
Debt.To.Income.Ratio              300 non-null object
State                             300 non-null object
Home.Ownership                    300 non-null object
Monthly.Income                    300 non-null float64
FICO.Range                        300 non-null object
Open.CREDIT.Lines                 300 non-null object
Revolving.CREDIT.Balance          300 non-null int64
Inquiries.in.the.Last.6.Months    300 non-null int64
Employment.Length                 291 non-null object
dtypes: float64(1), int64(4), object(9)
memory usage: 32.9+ KB


### Data Cleaning

- we should combine train and test data sets, and then clean them together
- we will need to split it back into train and test afterwards so add column indicating 'DataType' = train/test
- also, test data set does not have interest.rate column so we need to add that as well and assign them to null values

In [5]:
train['DataType'] = 'train'
test['DataType'] = 'test'
test['Interest.Rate'] = np.nan

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 16 columns):
ID                                2199 non-null float64
Amount.Requested                  2199 non-null object
Amount.Funded.By.Investors        2199 non-null object
Interest.Rate                     2200 non-null object
Loan.Length                       2199 non-null object
Loan.Purpose                      2199 non-null object
Debt.To.Income.Ratio              2199 non-null object
State                             2199 non-null object
Home.Ownership                    2199 non-null object
Monthly.Income                    2197 non-null float64
FICO.Range                        2200 non-null object
Open.CREDIT.Lines                 2196 non-null object
Revolving.CREDIT.Balance          2197 non-null object
Inquiries.in.the.Last.6.Months    2197 non-null float64
Employment.Length                 2131 non-null object
DataType                          2200 non-null object
dtypes: fl

In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 16 columns):
ID                                300 non-null int64
Amount.Requested                  300 non-null int64
Amount.Funded.By.Investors        300 non-null object
Loan.Length                       300 non-null object
Loan.Purpose                      300 non-null object
Debt.To.Income.Ratio              300 non-null object
State                             300 non-null object
Home.Ownership                    300 non-null object
Monthly.Income                    300 non-null float64
FICO.Range                        300 non-null object
Open.CREDIT.Lines                 300 non-null object
Revolving.CREDIT.Balance          300 non-null int64
Inquiries.in.the.Last.6.Months    300 non-null int64
Employment.Length                 291 non-null object
DataType                          300 non-null object
Interest.Rate                     0 non-null float64
dtypes: float64(2), int64(4), objec

In [8]:
df = pd.concat([train,test],sort=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2500 entries, 0 to 299
Data columns (total 16 columns):
ID                                2499 non-null float64
Amount.Requested                  2499 non-null object
Amount.Funded.By.Investors        2499 non-null object
Interest.Rate                     2200 non-null object
Loan.Length                       2499 non-null object
Loan.Purpose                      2499 non-null object
Debt.To.Income.Ratio              2499 non-null object
State                             2499 non-null object
Home.Ownership                    2499 non-null object
Monthly.Income                    2497 non-null float64
FICO.Range                        2500 non-null object
Open.CREDIT.Lines                 2496 non-null object
Revolving.CREDIT.Balance          2497 non-null object
Inquiries.in.the.Last.6.Months    2497 non-null float64
Employment.Length                 2422 non-null object
DataType                          2500 non-null object
dtypes: flo

In [9]:
df.head()

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length,DataType
0,79542.0,25000,25000.0,18.49%,60 months,debt_consolidation,27.56%,VA,MORTGAGE,8606.56,720-724,11,15210,3.0,5 years,train
1,75473.0,19750,19750.0,17.27%,60 months,debt_consolidation,13.39%,NY,MORTGAGE,6737.5,710-714,14,19070,3.0,4 years,train
2,67265.0,2100,2100.0,14.33%,36 months,major_purchase,3.50%,LA,OWN,1000.0,690-694,13,893,1.0,< 1 year,train
3,80167.0,28000,28000.0,16.29%,36 months,credit_card,19.62%,NV,MORTGAGE,7083.33,710-714,12,38194,1.0,10+ years,train
4,17240.0,24250,17431.82,12.23%,60 months,credit_card,23.79%,OH,MORTGAGE,5833.33,730-734,6,31061,2.0,10+ years,train


- drop ID and Amount Funded columns (Amount Funded is a future variable; it does not determine interest rate)
- Amount requested - convert to numeric type; any non-numeric values assign NaN
- Interest Rate - remove % sign at end and convert to numeric
- Debt to Income Ratio - remove % sign at end and convert to numeric
- FICO Range - get min,max of range and take average
- Open Credit Lines - convert to numeric type
- Revolving Credit Balance - convert to numeric type
- Employment Length - remove all 'years'/'year'/'<'/'+'/'.' and convert to numeric

In [10]:
df.drop(['ID','Amount.Funded.By.Investors'],axis=1,inplace=True)

In [11]:
df['Amount.Requested'] = pd.to_numeric(df['Amount.Requested'],errors='coerce')

In [12]:
df['Interest.Rate'] = df['Interest.Rate'].str.replace('%','').astype(float)

In [13]:
df['Debt.To.Income.Ratio'] = df['Debt.To.Income.Ratio'].str.replace('%','').astype(float)

In [14]:
k = df['FICO.Range'].str.split('-',expand=True).astype(float)
df['FICO'] = (k[0]+k[1])/2
df.drop(['FICO.Range'],axis=1,inplace=True)

In [15]:
df['Open.CREDIT.Lines'] = pd.to_numeric(df['Open.CREDIT.Lines'],errors='coerce')

In [16]:
df['Revolving.CREDIT.Balance'] = pd.to_numeric(df['Revolving.CREDIT.Balance'],errors='coerce')

In [17]:
df['Employment.Length'].value_counts()

10+ years    653
< 1 year     249
2 years      243
3 years      235
5 years      202
4 years      191
1 year       177
6 years      163
7 years      127
8 years      108
9 years       72
.              2
Name: Employment.Length, dtype: int64

In [18]:
df['Employment.Length'] = df['Employment.Length'].str.replace(' years','')
df['Employment.Length'] = df['Employment.Length'].str.replace(' year','')
df['Employment.Length'] = df['Employment.Length'].str.replace('+','')
df['Employment.Length'] = df['Employment.Length'].str.replace('< ','')
df['Employment.Length'] = df['Employment.Length'].str.replace('.','')
df['Employment.Length'] = pd.to_numeric(df['Employment.Length'],errors='coerce')

In [19]:
df['Employment.Length'].value_counts()

10.0    653
1.0     426
2.0     243
3.0     235
5.0     202
4.0     191
6.0     163
7.0     127
8.0     108
9.0      72
Name: Employment.Length, dtype: int64

- Perform Dummification on remaining categorical columns

### Dummification of Categorical variables

In [20]:
cat_cols = df.select_dtypes(['object']).columns
cat_cols = cat_cols[:-1]
cat_cols

Index(['Loan.Length', 'Loan.Purpose', 'State', 'Home.Ownership'], dtype='object')

> k = freqs[freqs>threshold].index[:-1]

Here, we shouldn't do **[:-1]** since this would mean it categorizes this with those variables that has frequency less than threshold.  
Instead, we should run wihtout **[-1]**

In [21]:
# Dummification of Categorical Variables
threshold = 20                     # threshold frequency to consider for dummification
for col in cat_cols:               # for all categorical columns
    freqs = df[col].value_counts()   
    k = freqs[freqs>threshold].index[:-1]   # consider only those values whose frequency is above threshold
    for cat in k:                           # for each categorical variable
        name = col + '_' + cat              # unique name
        df[name] = (df[col]==cat).astype(int)  # create new column for that category
    del df[col]                            # delete original column
    print(col)

Loan.Length
Loan.Purpose
State
Home.Ownership


In [22]:
df.shape

(2500, 51)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2500 entries, 0 to 299
Data columns (total 51 columns):
Amount.Requested                   2495 non-null float64
Interest.Rate                      2200 non-null float64
Debt.To.Income.Ratio               2499 non-null float64
Monthly.Income                     2497 non-null float64
Open.CREDIT.Lines                  2491 non-null float64
Revolving.CREDIT.Balance           2495 non-null float64
Inquiries.in.the.Last.6.Months     2497 non-null float64
Employment.Length                  2420 non-null float64
DataType                           2500 non-null object
FICO                               2500 non-null float64
Loan.Length_36 months              2500 non-null int32
Loan.Purpose_debt_consolidation    2500 non-null int32
Loan.Purpose_credit_card           2500 non-null int32
Loan.Purpose_other                 2500 non-null int32
Loan.Purpose_home_improvement      2500 non-null int32
Loan.Purpose_major_purchase        2500 non-null i

In [24]:
df.isnull().sum()

Amount.Requested                     5
Interest.Rate                      300
Debt.To.Income.Ratio                 1
Monthly.Income                       3
Open.CREDIT.Lines                    9
Revolving.CREDIT.Balance             5
Inquiries.in.the.Last.6.Months       3
Employment.Length                   80
DataType                             0
FICO                                 0
Loan.Length_36 months                0
Loan.Purpose_debt_consolidation      0
Loan.Purpose_credit_card             0
Loan.Purpose_other                   0
Loan.Purpose_home_improvement        0
Loan.Purpose_major_purchase          0
Loan.Purpose_small_business          0
Loan.Purpose_car                     0
Loan.Purpose_wedding                 0
Loan.Purpose_medical                 0
Loan.Purpose_moving                  0
State_CA                             0
State_NY                             0
State_TX                             0
State_FL                             0
State_IL                 

### Imputing Missing Values

In [25]:
for col in df.columns:                                                                   # for each column
    if (col not in ['Interest.Rate','DataType']) & (df[col].isnull().sum()>0):           # if not interest rate ot data type and null values exist
        df.loc[df[col].isnull(),col] = df.loc[df['DataType']=='train',col].mean()        # replace null values with mean of all training data values

In [26]:
df.isnull().sum()

Amount.Requested                     0
Interest.Rate                      300
Debt.To.Income.Ratio                 0
Monthly.Income                       0
Open.CREDIT.Lines                    0
Revolving.CREDIT.Balance             0
Inquiries.in.the.Last.6.Months       0
Employment.Length                    0
DataType                             0
FICO                                 0
Loan.Length_36 months                0
Loan.Purpose_debt_consolidation      0
Loan.Purpose_credit_card             0
Loan.Purpose_other                   0
Loan.Purpose_home_improvement        0
Loan.Purpose_major_purchase          0
Loan.Purpose_small_business          0
Loan.Purpose_car                     0
Loan.Purpose_wedding                 0
Loan.Purpose_medical                 0
Loan.Purpose_moving                  0
State_CA                             0
State_NY                             0
State_TX                             0
State_FL                             0
State_IL                 

Split DataFrame back into train and test:

In [27]:
train = df[df['DataType'] == 'train']
train.drop(['DataType'],axis=1,inplace=True)
test = df[df['DataType'] == 'test']
test.drop(['DataType','Interest.Rate'],axis=1,inplace=True)
del df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


### Linear Regression

Split train dataset into train and validate datasets:

In [28]:
from sklearn.model_selection import train_test_split
train1,validate = train_test_split(train,test_size=0.2,random_state=2)

In [29]:
x_train1 = train1.drop('Interest.Rate',axis=1)
y_train1 = train1['Interest.Rate']

In [30]:
x_train1.shape

(1760, 49)

In [31]:
from sklearn.preprocessing import scale
x_train1_scaled = scale(x_train1)

  


Build Regression model on train dataset:

In [32]:
from sklearn.linear_model import LinearRegression
lm1 = LinearRegression()
lm1.fit(x_train1_scaled,y_train1)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [33]:
lm1.intercept_

13.11028409090909

In [34]:
list(zip(x_train1.columns,lm1.coef_))

[('Amount.Requested', 1.2412823100498058),
 ('Debt.To.Income.Ratio', -0.02934993084458956),
 ('Monthly.Income', -0.11367740251339292),
 ('Open.CREDIT.Lines', -0.17974272069177563),
 ('Revolving.CREDIT.Balance', -0.07340992347361104),
 ('Inquiries.in.the.Last.6.Months', 0.41807116721087),
 ('Employment.Length', 0.12426329262744062),
 ('FICO', -3.0826371641341477),
 ('Loan.Length_36 months', -1.2965703236855517),
 ('Loan.Purpose_debt_consolidation', -0.2338934351603894),
 ('Loan.Purpose_credit_card', -0.23018843939378297),
 ('Loan.Purpose_other', 0.11873687058019135),
 ('Loan.Purpose_home_improvement', -0.08798256518316994),
 ('Loan.Purpose_major_purchase', -0.018202014151089074),
 ('Loan.Purpose_small_business', 0.012859186498847097),
 ('Loan.Purpose_car', 0.0031623601053895056),
 ('Loan.Purpose_wedding', -0.10204736200866393),
 ('Loan.Purpose_medical', -0.04533498442400785),
 ('Loan.Purpose_moving', 0.13919333909181386),
 ('State_CA', -0.07806101292421919),
 ('State_NY', -0.03906872287

In [35]:
from sklearn.metrics import mean_squared_error, r2_score
y_pred = lm1.predict(x_train1_scaled)
print('Training Accuracy:')
print('MSE =',mean_squared_error(y_pred,y_train1))
print('R^2 =',r2_score(y_pred,y_train1))

Training Accuracy:
MSE = 4.118615454703139
R^2 = 0.6904918191259801


Evaluate model with validate dataset:

In [36]:
x_val = validate.drop('Interest.Rate',axis=1)
y_val = validate['Interest.Rate']
x_val_scaled = scale(x_val)
y_pred = lm1.predict(x_val_scaled)
print('Test Accuracy:')
print('MSE =',mean_squared_error(y_pred,y_val))
print('R^2 =',r2_score(y_pred,y_val))

Test Accuracy:
MSE = 4.249401069122784
R^2 = 0.6954509084963858


  This is separate from the ipykernel package so we can avoid doing imports until


Once we are satidifed with our model, we build a new model on ALL training data and predict interst arte for test data.

In [37]:
x_train = train.drop('Interest.Rate',axis=1)
y_train = train['Interest.Rate']
x_train_scaled = scale(x_train)
lm2 = LinearRegression()
lm2.fit(x_train_scaled,y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [38]:
y_pred = lm2.predict(x_train_scaled)
print('Training Accuracy:')
print('MSE =',mean_squared_error(y_pred,y_train))
print('R^2 =',r2_score(y_pred,y_train))

Training Accuracy:
MSE = 4.105637011197528
R^2 = 0.6913818651793673


In [39]:
test_scaled = scale(test)
lm_pred = lm2.predict(test_scaled)
lm_pred

  """Entry point for launching an IPython kernel.


array([16.39696247, 15.76974097, 10.20251785,  3.12300054, 15.18859855,
        6.59085469, 15.6258013 , 10.56292366, 15.87652998, 12.3196074 ,
        9.38097087, 14.84614932, 11.18832843, 13.66830424, 13.43177155,
       18.24105837, 10.01560129, 15.91634568, 13.75428443, 13.87010929,
       22.62008547, 17.46614603, 11.66336396, 14.86125109,  9.46058821,
       10.98110732, 13.07323815, 19.4806412 , 11.97650696, 16.59332702,
       15.00975322, 15.39868655, 12.23016814, 15.24680989, 13.61339806,
       14.18668515, 19.45686439, 10.98710449, 11.73933849, 16.77492664,
       14.10528086, 10.96641054, 14.60145911, 13.01151919, 15.65854861,
       17.09720824, 14.6976251 , 17.06813587, 16.76001187, 10.1276804 ,
       13.65394029, 19.78261501,  9.38132006, 19.72228401, 15.34410921,
       14.58642542, 17.48909144, 14.40332416, 10.78918852, 14.32469732,
       13.04025006, 16.91782098,  8.52344146, 14.47339631, 10.17113064,
       11.29301742, 12.75007268, 14.72754415, 11.58844953, 14.54

### Ridge Regression

In [40]:
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import GridSearchCV

GridSearchCV will help choose the best model by running regression for different values of hyper parameters and evaluate them based on some score

In [41]:
lambdas = np.linspace(1,100,100)
params = {'alpha':lambdas}              # list of alpha values to run ridge regression on 
model = Ridge(fit_intercept=True)
grid_search = GridSearchCV(model,param_grid=params,cv=10,scoring='neg_mean_absolute_error')
grid_search.fit(x_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([  1.,   2., ...,  99., 100.])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [42]:
ridge_model = grid_search.best_estimator_
ridge_model

Ridge(alpha=34.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

alpha = 34 gives the best model in terms of the neg_mean_absolute_error
build model with this value

In [43]:
ridge_pred = ridge_model.predict(test)
ridge_pred

array([15.75109565, 15.73010357, 10.14468459,  3.76958704, 15.19390662,
        7.1421839 , 15.53643384, 10.70257787, 15.31068037, 12.45037898,
       10.15286276, 15.27174052, 11.60825381, 13.67420137, 13.53345   ,
       18.17300231, 10.37918668, 15.79829125, 13.9607565 , 13.97851011,
       22.22973825, 17.36171244, 12.24694189, 14.90709652,  9.7564946 ,
       10.97897394, 13.1195306 , 19.34856488, 12.1557721 , 16.95856414,
       15.04012954, 15.42790216, 12.46104363, 14.87180327, 13.82935252,
       14.34132999, 19.36051318, 11.22211157, 11.93499515, 16.70338878,
       14.23265138, 11.26540904, 14.75495475, 13.25109647, 15.81460511,
       17.1526763 , 14.80002671, 17.04446679, 16.7530149 , 10.50895687,
       13.70746069, 19.65219078,  9.63864676, 19.53910361, 15.48829399,
       14.71840365, 17.50453102, 14.55799173, 10.99136203, 14.7275581 ,
       13.26303071, 16.93513707,  8.88369672, 14.67281728, 10.58491918,
       11.50670557, 12.99165982, 14.72452426, 11.75041837, 14.55

### Lasso Regression

In [44]:
lambdas = np.linspace(1,10,100)
model = Lasso(fit_intercept=True)
params = {'alpha':lambdas}
grid_search = GridSearchCV(model,param_grid=params,cv=10,scoring='neg_mean_absolute_error')
grid_search.fit(x_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([ 1.     ,  1.09091, ...,  9.90909, 10.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [45]:
grid_search.best_estimator_

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

since alpha is at one extreme of our range, we should fine tune our lambdas and run again

In [46]:
lambdas = np.linspace(0.001,2,100)
params = {'alpha':lambdas}
model = Lasso(fit_intercept=True)
grid_search = GridSearchCV(model,param_grid=params,cv=10,scoring='neg_mean_absolute_error')
grid_search.fit(x_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.00000e-03, 2.11919e-02, ..., 1.97981e+00, 2.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [50]:
lasso_model = grid_search.best_estimator_
lasso_model

Lasso(alpha=0.021191919191919192, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [53]:
lasso_pred = lasso_model.predict(test)
lasso_pred

array([15.36032328, 15.88916291, 10.26112365,  3.8308861 , 15.30546576,
        7.41199396, 15.89833883, 10.49052758, 14.66072998, 12.27577655,
       10.20957297, 15.43644094, 11.76009898, 13.52473623, 13.641083  ,
       18.16708402, 10.25105383, 15.56859926, 14.01395419, 14.10537963,
       22.17721279, 17.83721311, 12.2742275 , 14.90928494,  9.7690368 ,
       10.70677987, 13.06113527, 19.58077172, 12.39541509, 17.03788943,
       15.30725766, 15.46604565, 12.5958439 , 14.69286867, 13.56654966,
       14.33417   , 19.36462143, 11.12257013, 11.77487522, 16.60520708,
       14.28329616, 11.45380944, 14.69504677, 13.21462305, 16.04954711,
       16.99122131, 15.30819439, 16.76555313, 16.87784164, 10.6079011 ,
       13.73995017, 19.70251362,  9.52890554, 19.6061302 , 15.40359972,
       14.84705631, 17.9131928 , 14.87009174, 10.86985434, 15.04398748,
       13.33618298, 17.02194353,  8.8977951 , 14.57446613, 10.26899544,
       11.49567077, 12.94188894, 14.88460125, 11.73323675, 14.60

In [54]:
DataFrame({'Linear':lm_pred, 'Ridge':ridge_pred, 'Lasso':lasso_pred})

Unnamed: 0,Linear,Ridge,Lasso
0,16.396962,15.751096,15.360323
1,15.769741,15.730104,15.889163
2,10.202518,10.144685,10.261124
3,3.123001,3.769587,3.830886
4,15.188599,15.193907,15.305466
5,6.590855,7.142184,7.411994
6,15.625801,15.536434,15.898339
7,10.562924,10.702578,10.490528
8,15.876530,15.310680,14.660730
9,12.319607,12.450379,12.275777


- If you have too many columns, use LASSO since it will reduce dimensions (remove insignificant columns)
- If you don't have too many columns, use Ridge since it will not remove but just shrink the insignificant variables
- usually expect $num\_rows = (num\_col)^3$
- if rows less than expected then we want to reduce dimesnions so use LASSO,
- else use Ridge