<a href="https://colab.research.google.com/github/ganeshkodi/20533-ImplementingMicrosoftAzureInfrastructureSolutions/blob/master/Ganesh_Kodi_CaseStudy10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with Missing Data

**Utilizing the Boston Housing dataset**

In [67]:
# Install the one non-standard dependency into the *kernel's* environment.
# %pip (unlike !pip) is guaranteed to target the interpreter running this notebook.
%pip install -q ml_metrics



In [0]:
# Import package dependencies
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from ml_metrics import rmse
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston

In [69]:
# Load the Boston Housing dataset bundled with scikit-learn and report the
# shape of its feature matrix.
# NOTE(review): load_boston is deprecated/removed in newer scikit-learn
# releases -- confirm the scikit-learn version this notebook is pinned to.
boston = load_boston()
print(boston.data.shape)

(506, 13)


In [70]:
# List the fields available on the loaded dataset object.
print(boston.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


In [71]:
# Print the dataset's built-in description (attribute meanings, size, etc.)
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [72]:
# Column names of the feature matrix, in the same order as boston.data's columns.
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [73]:
# Wrap the raw feature matrix in a labelled DataFrame and append the target
# (median home value) as the final column, MEDV.
bos = pd.DataFrame(boston.data, columns=boston.feature_names)
bos['MEDV'] = boston.target
bos.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [74]:
# Summary statistics for every column (the count row confirms there are no missing values).
bos.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


## Start by fitting a Linear Regression model to the full dataset

**Create a training and testing split (e.g., a 70/30 split)**

In [75]:
# Create training and testing sets (cross-validation not needed).
# The training set is a reproducible 70% random sample; the test set is every
# remaining row, selected by index label. Dropping by label (rather than the
# isin()/dropna() trick) cannot accidentally discard rows that contain real NaNs.
train_set = bos.sample(frac=0.7, random_state=100)
test_set = bos.drop(train_set.index)
print(train_set.shape[0])
print(test_set.shape[0])

354
152


In [76]:
# Peek at the sampled training rows (note the shuffled index labels).
train_set.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
198,0.03768,80.0,1.52,0.0,0.404,7.274,38.3,7.309,2.0,329.0,12.6,392.2,6.62,34.6
229,0.44178,0.0,6.2,0.0,0.504,6.552,21.4,3.3751,8.0,307.0,17.4,380.34,3.76,31.5
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,20.6
31,1.35472,0.0,8.14,0.0,0.538,6.072,100.0,4.175,4.0,307.0,21.0,376.73,13.04,14.5
315,0.25356,0.0,9.9,0.0,0.544,5.705,77.7,3.945,4.0,304.0,18.4,396.42,11.5,16.2


In [0]:
# Get the training and testing row indices for later use.
# These are the frames' index *labels* as integer arrays. Later cells pass them
# to .iloc, which lines up only because bos was built with the default
# 0..n-1 index, so every label equals its row position.
train_index = train_set.index.values.astype(int)
test_index = test_set.index.values.astype(int)

In [78]:
# Demonstration of using the saved row labels above to select consistent records.
# train_index holds index labels, so the label-based .loc accessor is the
# matching one (positional .iloc only agrees while labels equal positions).
bos.loc[train_index].head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
198,0.03768,80.0,1.52,0.0,0.404,7.274,38.3,7.309,2.0,329.0,12.6,392.2,6.62,34.6
229,0.44178,0.0,6.2,0.0,0.504,6.552,21.4,3.3751,8.0,307.0,17.4,380.34,3.76,31.5
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,20.6
31,1.35472,0.0,8.14,0.0,0.538,6.072,100.0,4.175,4.0,307.0,21.0,376.73,13.04,14.5
315,0.25356,0.0,9.9,0.0,0.544,5.705,77.7,3.945,4.0,304.0,18.4,396.42,11.5,16.2


In [0]:
# Convert both splits back to plain NumPy arrays: the feature matrix is every
# column except the last, and the target vector is the last column (MEDV).
X_train, Y_train = train_set.iloc[:, :-1].values, train_set.iloc[:, -1].values
X_test, Y_test = test_set.iloc[:, :-1].values, test_set.iloc[:, -1].values

In [80]:
# Fit a linear regression to the training data, then report the training R^2,
# the fitted coefficients (one per feature column), the intercept and the
# estimator's parameters.
# NOTE(review): the `normalize` argument is deprecated/removed in newer
# scikit-learn releases -- confirm against the installed version.
reg = LinearRegression(normalize=True).fit(X_train, Y_train)
print(reg.score(X_train, Y_train))
print(reg.coef_)
print(reg.intercept_)
print(reg.get_params())

0.7478284701218886
[-1.35456753e-01  5.48606010e-02  5.46611167e-02  3.57648807e+00
 -2.01163242e+01  3.96567027e+00  1.33685712e-02 -1.48716658e+00
  2.99295349e-01 -9.83868843e-03 -9.45023886e-01  6.45207267e-03
 -5.77572297e-01]
36.07934768828229
{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': True}


In [81]:
# Find the variable with the largest *positive* coefficient.
# The message previously claimed "abs(max)" although the signed maximum was
# being reported; the label now matches the computation. To rank by magnitude
# instead, use np.argmax(np.abs(reg.coef_)).
# NOTE(review): coefficients are compared as-is even though the features are on
# different scales -- confirm this ranking is the one intended.
var_index = int(np.argmax(reg.coef_))  # first index of the maximum, same as list.index(max)
max_var = reg.coef_[var_index]
print('The largest positive coef-value is {}'.format(max_var))
print('The variable associated with this coef-value is {}'.format(boston.feature_names[var_index]))

The abs(max) coef-value is 3.9656702708586287
The variable associated with this coef-value is RM


In [82]:
# Score the baseline model on the held-out rows and keep each metric in an
# orig_* variable so later experiments can report differences against it.
Y_pred = reg.predict(X_test)

orig_r2 = r2_score(Y_test,Y_pred)
orig_rmse_val = rmse(Y_test,Y_pred)
orig_mse = mean_squared_error(Y_test,Y_pred)
orig_mae = mean_absolute_error(Y_test,Y_pred)

for template, metric_value in (("MAE: %.3f", orig_mae),
                               ("MSE:  %.3f", orig_mse),
                               ("RMSE:  %.3f", orig_rmse_val),
                               ("R2:  %.3f", orig_r2)):
    print(template % metric_value)

MAE: 3.605
MSE:  24.099
RMSE:  4.909
R2:  0.705


In [0]:
# First row of the results table: the baseline (no imputation) metrics.
# The *_diff columns are NaN because there is nothing to compare against yet.
baseline_row = {
    'data': 'original',
    'imputation': 'none',
    'mae': orig_mae,
    'mse': orig_mse,
    'rmse': orig_rmse_val,
    'R2': orig_r2,
    'mae_diff': np.nan,
    'mse_diff': np.nan,
    'rmse_diff': np.nan,
    'R2_diff': np.nan,
}
res_frame = pd.DataFrame(baseline_row, index=[0])

In [84]:
# Display the results table so far (baseline row only).
res_frame

Unnamed: 0,data,imputation,mae,mse,rmse,R2,mae_diff,mse_diff,rmse_diff,R2_diff
0,original,none,3.604571,24.098505,4.909023,0.70494,,,,


## Round 1 of Imputation

**Here we randomly sample rows from the full dataset and replace a single column's values in the sampled rows with an imputed value**

In [85]:
# Empty placeholder frame.
# NOTE(review): temp_frame is not read in this cell and is reassigned before
# its next visible use -- confirm whether this line is still needed.
temp_frame =  pd.DataFrame([])
def missingRM(fraction):
  """Treat RM as missing for a random share of rows, median-impute it, and re-score.

  A reproducible sample (random_state=99) of `bos` has its RM column replaced
  by the RM median of the rows that were *not* sampled. A fresh
  LinearRegression is then fit and evaluated on the same train/test rows
  (`train_index` / `test_index`) as the baseline model.

  Args:
    fraction: share of rows (0-1) whose RM value is treated as missing.

  Returns:
    A one-row DataFrame holding MAE, MSE, RMSE, R2 and their differences from
    the baseline `orig_*` metrics. The same frame is also printed.
  """
  in_sample = bos.sample(frac=fraction, random_state=99)
  # Complement by index label; unlike the isin()/dropna() idiom this cannot
  # silently drop rows that contain genuine NaNs.
  out_sample = bos.drop(in_sample.index)
  # Every sampled RM is "missing", so imputing it reduces to overwriting the
  # whole column with the median of the untouched rows.
  in_sample['RM'] = out_sample['RM'].median()
  imputed_data = pd.concat([in_sample, out_sample]).sort_index()
  # train_index / test_index hold index labels, so select by label.
  train_set = imputed_data.loc[train_index]
  test_set = imputed_data.loc[test_index]
  X_train = train_set.iloc[:, :-1].values
  Y_train = train_set.iloc[:, -1].values
  X_test = test_set.iloc[:, :-1].values
  Y_test = test_set.iloc[:, -1].values
  reg2 = LinearRegression().fit(X_train, Y_train)
  Y_pred = reg2.predict(X_test)

  mae = mean_absolute_error(Y_test,Y_pred)
  mse = mean_squared_error(Y_test,Y_pred)
  rmse_val = rmse(Y_test,Y_pred)
  r2 = r2_score(Y_test,Y_pred)
  frame= pd.DataFrame({'data imputed':fraction,
                    'imputation':'MAR',
                    'mae': mae,
                    'mse': mse,
                    'rmse':rmse_val,
                    'R2':r2,
                    'mae_diff':mae-orig_mae,
                    'mse_diff':mse-orig_mse,
                    'rmse_diff':rmse_val-orig_rmse_val,
                    'R2_diff':r2-orig_r2
                    }, index=[0])
  with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(frame)
  # Hand the row back so callers can collect results instead of re-parsing output.
  return frame
# Repeat the RM experiment for progressively larger shares of missing rows.
for fraction in (0.01, 0.05, 0.1, 0.2, 0.33, 0.5):
  missingRM(fraction)


   data imputed imputation       mae        mse      rmse        R2  mae_diff  \
0          0.01        MAR  3.634187  24.206667  4.920027  0.703616  0.029616   

   mse_diff  rmse_diff   R2_diff  
0  0.108162   0.011004 -0.001324  
   data imputed imputation       mae        mse      rmse        R2  mae_diff  \
0          0.05        MAR  3.644905  24.652966  4.965175  0.698151  0.040334   

   mse_diff  rmse_diff   R2_diff  
0  0.554461   0.056153 -0.006789  
   data imputed imputation       mae        mse      rmse        R2  mae_diff  \
0           0.1        MAR  3.659578  25.155434  5.015519  0.691999  0.055007   

   mse_diff  rmse_diff   R2_diff  
0  1.056929   0.106496 -0.012941  
   data imputed imputation       mae       mse      rmse        R2  mae_diff  \
0           0.2        MAR  3.663828  25.05039  5.005036  0.693285  0.059256   

   mse_diff  rmse_diff   R2_diff  
0  0.951885   0.096014 -0.011655  
   data imputed imputation       mae        mse      rmse        R2  m

In [102]:
# Empty placeholder frame.
# NOTE(review): temp_frame is not read in this cell and is reassigned before
# its next visible use -- confirm whether this line is still needed.
temp_frame =  pd.DataFrame([])
def missingRMNOX(fraction):
  """Blank out RM and NOX by an AGE-based rule in a random sample, impute, and re-score.

  Within a reproducible sample (random_state=99) of `bos`:
    * RM is kept only where AGE > 30 and AGE % 2 == 0, otherwise set to NaN;
    * NOX is kept only where AGE > 30 and AGE % 2 != 0, otherwise set to NaN.
  Missing RM values are filled with the median, and missing NOX values with
  the mean, of the rows that were not sampled. A fresh LinearRegression is then
  fit and evaluated on the same train/test rows as the baseline model.

  Args:
    fraction: share of rows (0-1) that are candidates for missingness.

  Returns:
    A one-row DataFrame holding MAE, MSE, RMSE, R2 and their differences from
    the baseline `orig_*` metrics. The NaN counts and the frame are also printed.
  """
  in_sample = bos.sample(frac=fraction, random_state=99)
  # Complement by index label; unlike the isin()/dropna() idiom this cannot
  # silently drop rows that contain genuine NaNs.
  out_sample = bos.drop(in_sample.index)

  # Vectorized form of the row-wise rule; Series.where keeps a value where the
  # mask is True and inserts NaN elsewhere (np.NaN no longer exists in NumPy 2).
  # NOTE(review): AGE is fractional for most rows, so `AGE % 2 == 0` is rarely
  # true and nearly all sampled RM values end up missing -- confirm intent.
  older = in_sample['AGE'] > 30
  even_age = in_sample['AGE'] % 2 == 0
  in_sample['RM'] = in_sample['RM'].where(older & even_age)
  in_sample['NOX'] = in_sample['NOX'].where(older & ~even_age)
  print(in_sample['RM'].isna().sum())
  print(in_sample['NOX'].isna().sum())

  in_sample['RM'] = in_sample['RM'].fillna(out_sample['RM'].median())
  in_sample['NOX'] = in_sample['NOX'].fillna(out_sample['NOX'].mean())
  imputed_data = pd.concat([in_sample, out_sample]).sort_index()
  # train_index / test_index hold index labels, so select by label.
  train_set = imputed_data.loc[train_index]
  test_set = imputed_data.loc[test_index]
  X_train = train_set.iloc[:, :-1].values
  Y_train = train_set.iloc[:, -1].values
  X_test = test_set.iloc[:, :-1].values
  Y_test = test_set.iloc[:, -1].values
  reg2 = LinearRegression().fit(X_train, Y_train)
  Y_pred = reg2.predict(X_test)

  mae = mean_absolute_error(Y_test,Y_pred)
  mse = mean_squared_error(Y_test,Y_pred)
  rmse_val = rmse(Y_test,Y_pred)
  r2 = r2_score(Y_test,Y_pred)
  frame= pd.DataFrame({'data imputed':fraction,
                    'imputation':'MAR',
                    'mae': mae,
                    'mse': mse,
                    'rmse':rmse_val,
                    'R2':r2,
                    'mae_diff':mae-orig_mae,
                    'mse_diff':mse-orig_mse,
                    'rmse_diff':rmse_val-orig_rmse_val,
                    'R2_diff':r2-orig_r2
                    }, index=[0])
  with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(frame)
  # Hand the row back so callers can collect results instead of re-parsing output.
  return frame
# Repeat the RM/NOX experiment for each candidate sample share.
for fraction in (0.1, 0.2, 0.3):
  missingRMNOX(fraction)


40
17
   data imputed imputation       mae        mse      rmse        R2  mae_diff  \
0           0.1        MAR  3.639741  24.847128  4.984689  0.695774   0.03517   

   mse_diff  rmse_diff   R2_diff  
0  0.748623   0.075667 -0.009166  
84
28
   data imputed imputation       mae        mse      rmse        R2  mae_diff  \
0           0.2        MAR  3.642942  25.332479  5.033138  0.689831  0.038371   

   mse_diff  rmse_diff   R2_diff  
0  1.233974   0.124115 -0.015109  
126
44
   data imputed imputation       mae        mse      rmse        R2  mae_diff  \
0           0.3        MAR  3.743918  27.054917  5.201434  0.668742  0.139347   

   mse_diff  rmse_diff   R2_diff  
0  2.956412   0.292411 -0.036198  


In [0]:
# Empty placeholder frame.
# NOTE(review): temp_frame is not read in this cell and is reassigned before
# its next visible use -- confirm whether this line is still needed.
temp_frame =  pd.DataFrame([])
def missingNonRandomRM(fraction):
  """Blank out RM and NOX by an AGE-based rule in a random sample, impute, and re-score.

  Within a reproducible sample (random_state=99) of `bos`:
    * RM is kept only where AGE > 30 and AGE % 2 == 0, otherwise set to NaN;
    * NOX is kept only where AGE > 30 and AGE % 2 != 0, otherwise set to NaN.
  Missing RM values are filled with the median, and missing NOX values with
  the mean, of the rows that were not sampled. A fresh LinearRegression is then
  fit and evaluated on the same train/test rows as the baseline model.

  NOTE(review): this applies exactly the same rule as missingRMNOX and still
  labels the result 'MAR'; confirm whether a different (non-random) missingness
  mechanism was meant to be implemented here.

  Args:
    fraction: share of rows (0-1) that are candidates for missingness.

  Returns:
    A one-row DataFrame holding MAE, MSE, RMSE, R2 and their differences from
    the baseline `orig_*` metrics. The NaN counts and the frame are also printed.
  """
  in_sample = bos.sample(frac=fraction, random_state=99)
  # Complement by index label; unlike the isin()/dropna() idiom this cannot
  # silently drop rows that contain genuine NaNs.
  out_sample = bos.drop(in_sample.index)

  # Vectorized form of the row-wise rule; Series.where keeps a value where the
  # mask is True and inserts NaN elsewhere (np.NaN no longer exists in NumPy 2).
  older = in_sample['AGE'] > 30
  even_age = in_sample['AGE'] % 2 == 0
  in_sample['RM'] = in_sample['RM'].where(older & even_age)
  in_sample['NOX'] = in_sample['NOX'].where(older & ~even_age)
  print(in_sample['RM'].isna().sum())
  print(in_sample['NOX'].isna().sum())

  in_sample['RM'] = in_sample['RM'].fillna(out_sample['RM'].median())
  in_sample['NOX'] = in_sample['NOX'].fillna(out_sample['NOX'].mean())
  imputed_data = pd.concat([in_sample, out_sample]).sort_index()
  # train_index / test_index hold index labels, so select by label.
  train_set = imputed_data.loc[train_index]
  test_set = imputed_data.loc[test_index]
  X_train = train_set.iloc[:, :-1].values
  Y_train = train_set.iloc[:, -1].values
  X_test = test_set.iloc[:, :-1].values
  Y_test = test_set.iloc[:, -1].values
  reg2 = LinearRegression().fit(X_train, Y_train)
  Y_pred = reg2.predict(X_test)

  mae = mean_absolute_error(Y_test,Y_pred)
  mse = mean_squared_error(Y_test,Y_pred)
  rmse_val = rmse(Y_test,Y_pred)
  r2 = r2_score(Y_test,Y_pred)
  frame= pd.DataFrame({'data imputed':fraction,
                    'imputation':'MAR',
                    'mae': mae,
                    'mse': mse,
                    'rmse':rmse_val,
                    'R2':r2,
                    'mae_diff':mae-orig_mae,
                    'mse_diff':mse-orig_mse,
                    'rmse_diff':rmse_val-orig_rmse_val,
                    'R2_diff':r2-orig_r2
                    }, index=[0])
  with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(frame)
  # Hand the row back so callers can collect results instead of re-parsing output.
  return frame
# Run the function defined in this cell. The loop previously called
# missingRMNOX, so missingNonRandomRM was defined but never executed.
for i in 0.1,0.2,0.3:
  missingNonRandomRM(i)


40
17
   data imputed imputation       mae        mse      rmse        R2  mae_diff  \
0           0.1        MAR  3.639741  24.847128  4.984689  0.695774   0.03517   

   mse_diff  rmse_diff   R2_diff  
0  0.748623   0.075667 -0.009166  
84
28
   data imputed imputation       mae        mse      rmse        R2  mae_diff  \
0           0.2        MAR  3.642942  25.332479  5.033138  0.689831  0.038371   

   mse_diff  rmse_diff   R2_diff  
0  1.233974   0.124115 -0.015109  
126
44
   data imputed imputation       mae        mse      rmse        R2  mae_diff  \
0           0.3        MAR  3.743918  27.054917  5.201434  0.668742  0.139347   

   mse_diff  rmse_diff   R2_diff  
0  2.956412   0.292411 -0.036198  


In [87]:
# Step-by-step walkthrough of one experiment: draw a reproducible 30% sample
# of rows whose values will be treated as missing, and show its shape.
in_sample = bos.sample(frac=0.3, random_state=99)
in_sample.shape

(152, 14)

In [88]:
# The untouched rows: everything in bos that was not sampled. Dropping by
# index label (rather than the isin()/dropna() trick) cannot accidentally
# discard rows that contain real NaNs.
out_sample = bos.drop(in_sample.index)
out_sample.shape

(354, 14)

In [89]:
# Sanity check: the two pieces together should account for every row of bos.
print(len(out_sample) + len(in_sample))
print(len(bos))

506
506


In [90]:
# Sampled rows before any values are blanked out.
in_sample.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
281,0.03705,20.0,3.33,0.0,0.4429,6.968,37.2,5.2447,5.0,216.0,14.9,392.23,4.59,35.4
273,0.22188,20.0,6.96,1.0,0.464,7.691,51.8,4.3665,3.0,223.0,18.6,390.77,6.58,35.2
251,0.21409,22.0,5.86,0.0,0.431,6.438,8.9,7.3967,7.0,330.0,19.1,377.07,3.59,24.8
329,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34,22.6
40,0.03359,75.0,2.95,0.0,0.428,7.024,15.8,5.4011,3.0,252.0,18.3,395.62,1.98,34.9


## Choose a variable to replace

In [91]:
# Simulate missingness: blank out RM for every sampled row.
# (Alternative experiment: blank out NOX instead of RM.)
# This overwrites in_sample in place, so re-run the sampling cell to restore it.
in_sample['RM'] = np.nan
in_sample.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
281,0.03705,20.0,3.33,0.0,0.4429,,37.2,5.2447,5.0,216.0,14.9,392.23,4.59,35.4
273,0.22188,20.0,6.96,1.0,0.464,,51.8,4.3665,3.0,223.0,18.6,390.77,6.58,35.2
251,0.21409,22.0,5.86,0.0,0.431,,8.9,7.3967,7.0,330.0,19.1,377.07,3.59,24.8
329,0.06724,0.0,3.24,0.0,0.46,,17.2,5.2146,4.0,430.0,16.9,375.21,7.34,22.6
40,0.03359,75.0,2.95,0.0,0.428,,15.8,5.4011,3.0,252.0,18.3,395.62,1.98,34.9


**Choose an imputation method**

In [92]:
# Imputation value: the RM median of the rows that were not sampled.
# (For the NOX alternative, take the NOX median instead.)
out_sample['RM'].median()

6.2085

In [93]:
# Fill the blanked RM values with the out-of-sample RM median.
# (Alternatives tried for NOX: fill with the out-of-sample NOX median, or with
# the constant 1.)
in_sample['RM'] = in_sample['RM'].fillna(out_sample['RM'].median())
in_sample.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
281,0.03705,20.0,3.33,0.0,0.4429,6.2085,37.2,5.2447,5.0,216.0,14.9,392.23,4.59,35.4
273,0.22188,20.0,6.96,1.0,0.464,6.2085,51.8,4.3665,3.0,223.0,18.6,390.77,6.58,35.2
251,0.21409,22.0,5.86,0.0,0.431,6.2085,8.9,7.3967,7.0,330.0,19.1,377.07,3.59,24.8
329,0.06724,0.0,3.24,0.0,0.46,6.2085,17.2,5.2146,4.0,430.0,16.9,375.21,7.34,22.6
40,0.03359,75.0,2.95,0.0,0.428,6.2085,15.8,5.4011,3.0,252.0,18.3,395.62,1.98,34.9


**Rejoin the imputed and original datasets**

In [94]:
# Stack the imputed rows back together with the untouched rows and restore the
# original row order by sorting on the index.
imputed_data = pd.concat([in_sample, out_sample]).sort_index()
imputed_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,6.2085,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.2085,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


**Use the same training and testing indices to fit the model**

In [95]:
# Re-create the exact baseline train/test split on the imputed data.
# train_index / test_index hold index labels, so the label-based .loc accessor
# is the matching one (positional .iloc only agrees while labels equal positions).
train_set = imputed_data.loc[train_index]
test_set = imputed_data.loc[test_index]
train_set.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
198,0.03768,80.0,1.52,0.0,0.404,7.274,38.3,7.309,2.0,329.0,12.6,392.2,6.62,34.6
229,0.44178,0.0,6.2,0.0,0.504,6.552,21.4,3.3751,8.0,307.0,17.4,380.34,3.76,31.5
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,20.6
31,1.35472,0.0,8.14,0.0,0.538,6.072,100.0,4.175,4.0,307.0,21.0,376.73,13.04,14.5
315,0.25356,0.0,9.9,0.0,0.544,5.705,77.7,3.945,4.0,304.0,18.4,396.42,11.5,16.2


In [0]:
# Convert the imputed splits to NumPy arrays: the feature matrix is every column
# except the last, and the target vector is the last column.
X_train, Y_train = train_set.iloc[:, :-1].values, train_set.iloc[:, -1].values
X_test, Y_test = test_set.iloc[:, :-1].values, test_set.iloc[:, -1].values

**Fit a new model to the imputed dataset**

In [97]:
# Fit a second model on the imputed training data and report the training R^2,
# coefficients, intercept and estimator parameters for comparison with `reg`.
# Unlike the baseline fit, no `normalize` argument is passed here.
reg2 = LinearRegression().fit(X_train, Y_train)
print(reg2.score(X_train, Y_train))
print(reg2.coef_)
print(reg2.intercept_)
print(reg2.get_params())

0.7342584385725508
[-1.43461121e-01  6.37668265e-02  3.34890184e-02  3.22508130e+00
 -2.19738118e+01  3.43434336e+00  2.59099929e-02 -1.57129188e+00
  3.13212423e-01 -9.49237900e-03 -9.78316140e-01  4.61748422e-03
 -6.93071864e-01]
42.59859535268602
{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}


In [98]:
# Score the imputed-data model on the held-out rows, using the same four
# metrics as the baseline.
Y_pred = reg2.predict(X_test)

r2 = r2_score(Y_test,Y_pred)
rmse_val = rmse(Y_test,Y_pred)
mse = mean_squared_error(Y_test,Y_pred)
mae = mean_absolute_error(Y_test,Y_pred)

for template, metric_value in (("MAE: %.3f", mae),
                               ("MSE:  %.3f", mse),
                               ("RMSE:  %.3f", rmse_val),
                               ("R2:  %.3f", r2)):
    print(template % metric_value)

MAE: 3.720
MSE:  25.988
RMSE:  5.098
R2:  0.682


In [0]:
# One results row for this experiment: the metrics plus their change relative
# to the baseline (orig_*) values.
imputed_row = {
    'data': '30% imputed',
    'imputation': 'MAR',
    'mae': mae,
    'mse': mse,
    'rmse': rmse_val,
    'R2': r2,
    'mae_diff': mae - orig_mae,
    'mse_diff': mse - orig_mse,
    'rmse_diff': rmse_val - orig_rmse_val,
    'R2_diff': r2 - orig_r2,
}
temp_frame = pd.DataFrame(imputed_row, index=[0])

In [100]:
# Append this experiment's row to the running results table and display it.
# NOTE(review): res_frame is rebuilt from itself, so re-running this cell adds
# the same row again; both rows also keep index label 0.
res_frame = pd.concat([res_frame, temp_frame])
res_frame

Unnamed: 0,data,imputation,mae,mse,rmse,R2,mae_diff,mse_diff,rmse_diff,R2_diff
0,original,none,3.604571,24.098505,4.909023,0.70494,,,,
0,30% imputed,MAR,3.720447,25.987578,5.097801,0.68181,0.115876,1.889073,0.188778,-0.02313
