# Regression Case Study

In [437]:
import numpy as np
import pandas as pd
from itertools import combinations
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, RFE
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error
from sklearn.linear_model import Ridge

import src.model as m

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [438]:
df = pd.read_csv('data/Train.csv')
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,11/16/2006 0:00,...,,,,,,,,,Standard,Conventional
1,1139248,57000,117657,77,121,3.0,1996,4640.0,Low,3/26/2004 0:00,...,,,,,,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,3.0,2001,2838.0,High,2/26/2004 0:00,...,,,,,,,,,,
3,1139251,38500,1026470,332,121,3.0,2001,3486.0,High,5/19/2011 0:00,...,,,,,,,,,,
4,1139253,11000,1057373,17311,121,3.0,2007,722.0,Medium,7/23/2009 0:00,...,,,,,,,,,,


### Linear Regression Pipeline

In [536]:
features = [
'SalesID' ,      # Always included as key for predictions
# 'MachineID' ,
'ModelID' ,
# 'datasource' ,
'auctioneerID' ,
'YearMade' ,
# 'MachineHoursCurrentMeter',
# 'UsageBand',
# 'saledate',    # BE CAREFUL
# 'fiModelDesc',
# 'fiBaseModel',
# 'fiSecondaryDesc',
# 'fiModelSeries',
# 'fiModelDescriptor',
'ProductSize',
# 'fiProductClassDesc',
# 'state',
'ProductGroup',
# 'ProductGroupDesc',
# 'Drive_System',
'Enclosure',
# 'Forks',
# 'Pad_Type',
# 'Ride_Control',
# 'Stick',
# 'Transmission',
# 'Turbocharged',
# 'Blade_Extension',
# 'Blade_Width',
# 'Enclosure_Type',
# 'Engine_Horsepower',
'Hydraulics',
# 'Pushblock',
# 'Ripper',
# 'Scarifier',
# 'Tip_Control',
'Tire_Size',
# 'Coupler',
# 'Coupler_System',
# 'Grouser_Tracks',
# 'Hydraulics_Flow',
# 'Track_Type',
# 'Undercarriage_Pad_Width',
# 'Stick_Length',
# 'Thumb',
# 'Pattern_Changer',
# 'Grouser_Type',
# 'Backhoe_Mounting',
# 'Blade_Type',
# 'Travel_Controls',
# 'Differential_Type',
# 'Steering_Controls'
]

In [537]:
X_df, y_df = m.clean_features(df, features, 'SalePrice')
trained_features = list(X_df.columns)
X_sid = X_df.pop('SalesID')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df)

In [538]:
pipeline = Pipeline([('scalar', StandardScaler()), ('linear', LinearRegression())])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Floor predictions at mean
y_pred[y_pred < 0] = y_train.mean()

# Scoring
rmse_score = np.sqrt(mean_squared_error(y_pred, y_test))
rmsle_score = np.sqrt(mean_squared_log_error(y_pred, y_test))

print(X_train.columns)
print('RMSE: {}'.format(rmse_score))
print('RMSLE: {}'.format(rmsle_score))
# pipeline.named_steps['linear'].coef_

Index(['ModelID', 'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter',
       'compact', 'large', 'large / medium', 'medium', 'mini', 'erops',
       'erops ac', 'erops w ac', 'no rops', 'none or unspecified_x', '10 inch',
       '10"', '13"', '14"', '15.5', '15.5"', '17.5', '17.5"', '20.5', '20.5"',
       '23.1"', '23.5', '23.5"', '26.5', '29.5', '7.0"', 'bl', 'mg', 'ssl',
       'tex', 'ttt', '2 valve', '3 valve', '4 valve', 'auxiliary',
       'base + 1 function', 'base + 2 function', 'base + 3 function',
       'base + 4 function', 'base + 5 function', 'base + 6 function',
       'none or unspecified_y'],
      dtype='object')
RMSE: 15632.84145007804
RMSLE: 0.5204443679961709


In [539]:
params = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001]}
ridge = Ridge()
g = GridSearchCV(ridge, params, cv=10)
g.fit(X_train, y_train)

rbf_grid_score = g.best_score_

print('Best Params: {}'.format(g.best_params_))
print('Best Score: {}'.format(g.best_score_))

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.274789e-17
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.274789e-18
  overwrite_a=True).T


Best Params: {'alpha': 1}
Best Score: 0.5429062288593635


In [541]:
ridge = Ridge(alpha=1)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

# Floor predictions at mean
ridge_pred[ridge_pred < 0] = y_train.mean()

# Scoring
rmse_score = np.sqrt(mean_squared_error(ridge_pred, y_test))
rmsle_score = np.sqrt(mean_squared_log_error(ridge_pred, y_test))

print(X_train.columns)
print('RMSE: {}'.format(rmse_score))
print('RMSLE: {}'.format(rmsle_score))
# ridge.coef_

Index(['ModelID', 'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter',
       'compact', 'large', 'large / medium', 'medium', 'mini', 'erops',
       'erops ac', 'erops w ac', 'no rops', 'none or unspecified_x', '10 inch',
       '10"', '13"', '14"', '15.5', '15.5"', '17.5', '17.5"', '20.5', '20.5"',
       '23.1"', '23.5', '23.5"', '26.5', '29.5', '7.0"', 'bl', 'mg', 'ssl',
       'tex', 'ttt', '2 valve', '3 valve', '4 valve', 'auxiliary',
       'base + 1 function', 'base + 2 function', 'base + 3 function',
       'base + 4 function', 'base + 5 function', 'base + 6 function',
       'none or unspecified_y'],
      dtype='object')
RMSE: 15632.836296230256
RMSLE: 0.5203386565358565


### Produce Test Predictions


In [542]:
test_df = pd.read_csv('data/test.csv')

X = m.clean_features(test_df, features, target=None, fill=trained_features)
sid = X.pop('SalesID')

In [543]:
test_pred = pipeline.predict(X)
# test_pred = ridge.predict(X)

# Floor predictions at mean
test_pred[test_pred < 0] = y_train.mean()

results = pd.concat([pd.Series(sid), pd.Series(test_pred)], axis=1)
results.columns = ['SalesID', 'SalePrice']
results.to_csv('./data/WIN.csv')

### Recursive Feature Elimination with Cross-Validation

In [366]:
X_df, y_df = m.clean_features(df, features, 'SalePrice')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df)

In [483]:
# Scoring functions
msle_func = make_scorer(mean_squared_log_error)
mse_func = make_scorer(mean_squared_error)

# RFECV
estimator = LinearRegression()
selector = RFECV(estimator, cv=10)
selector = RFE(estimator, n_features_to_select=20)
selector = selector.fit(X_train, y_train)
y_pred = selector.predict(X_test)

# Floor predictions at zero
y_pred[y_pred < 0] = y_train.mean()

# Scoring
rmse_score = np.sqrt(mean_squared_error(y_pred, y_test))
rmsle_score = np.sqrt(mean_squared_log_error(y_pred, y_test))

print('Selected features: {}'.format(X_train.columns[selector.support_]))
print('RMSE: {}'.format(rmse_score))
print('RMSLE: {}'.format(rmsle_score))

Selected features: Index(['erops w ac', 'no rops', 'none or unspecified', '10 inch', '10"', '13"',
       '15.5', '17.5', '17.5"', '20.5"', '23.1"', '23.5', '23.5"', '26.5',
       '29.5', '7.0"', 'bl', 'mg', 'ssl', 'ttt'],
      dtype='object')
RMSE: 18449.216170656866
RMSLE: 0.5613296074995255


### LASSO for feature selection

In [316]:
from sklearn.linear_model import LassoCV

In [317]:
lasso = LassoCV()
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
np.sqrt(mean_squared_log_error(lasso_pred, y_test))

  return mean_squared_error(np.log(y_true + 1), np.log(y_pred + 1),


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
lasso.coef_, X_train.columns[lasso.coef_ != 0]