## Modeling and Model Tuning:

### Import Libraries:

In [266]:
import seaborn as sns
import pandas as pd
import numpy as np

import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge

### Read in Data:

In [267]:
# Read in clean_engineered training data

df_train = pd.read_csv('../datasets/train_cleaned_engineered.csv')

In [268]:
# Read in clean_engineered test data test

df_test = pd.read_csv('../datasets/test_cleaned_engineered.csv')

### Set up X and Y:

In [283]:
# Set up X and y:

X_train = df_train[['Sq Ft Total','Overall Qual','Lot Frontage',
                    'Home Age', 'Yrs Since Remod','Has Fence',
                    'Has Wood Deck','Has Masonry',
                   'Has Fireplace', 'Has Garage', 'Has Pool',
                    'Exter Qual', 'Exter Cond','Bsmt Cond',
                    'Bsmt Qual', 'Kitchen Qual']]

y_train = df_train['log_SalePrice']
     
X_test = df_test[['Sq Ft Total','Overall Qual','Lot Frontage',
                    'Home Age', 'Yrs Since Remod','Has Fence',
                    'Has Wood Deck','Has Masonry',
                   'Has Fireplace', 'Has Garage', 'Has Pool',
                    'Exter Qual', 'Exter Cond','Bsmt Cond',
                    'Bsmt Qual', 'Kitchen Qual']]


### Instantiate and Fit OLS Model:

In [284]:
# Instantiate:

lr = LinearRegression()

In [285]:
# Fit:

lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Score OLS Model:

In [286]:
# Score on training data:

lr.score(X_train,y_train)

0.8696135717875197

In [287]:
# Cross-Validation Score: 

cross_val_score(lr, X_train, y_train, cv = 5).mean()

0.8678461938725789

### Using Standard Scaler on Data:

In [274]:
# Normalize and scale:

# Instantiate.
sc = StandardScaler()
# Fit and transform.
X_train_sc = sc.fit_transform(X_train)
# Transform.
X_test_sc = sc.transform(X_test)
# Cross Val score
cross_val_score(lr, X_train_sc, y_train, cv=5).mean()

0.8681481591995016

### Using Ridge OLS Model:

In [275]:
# Instantiate Ridge

ridge = Ridge()

In [276]:
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [277]:
ridge.score(X_train, y_train)

0.8699464051071071

In [278]:
# Cross Val score
cross_val_score(ridge, X_train, y_train, cv=5).mean()

0.8681808695439542

### Using LASSO OLS Model:

In [279]:
# Instantiate LASSO

lasso = Lasso()

In [280]:
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [281]:
lasso.score(X_train, y_train)

0.7802291596012945

In [282]:
# Cross Val score
cross_val_score(lasso, X_train, y_train, cv=5).mean()

0.7810137456908139

### Translate Predictions Back to Dollars:

In [262]:
# These are out predictions. Huzzah!

y_pred = lr.predict(X_test)

In [263]:
# Translate predictions back to dollars.

y_pred=np.exp(y_pred)

In [264]:
# Create dataframe of predictions for concat, below:

y_pred=pd.DataFrame(y_pred)

In [265]:
y_pred

Unnamed: 0,0
0,123470.282855
1,197012.171442
2,194560.795940
3,118322.741640
4,164601.803750
...,...
873,180533.522513
874,223976.224839
875,131402.867139
876,106872.768529


### Generate Model Summary:

In [247]:
# Create and fit model
model = sm.OLS(y_train, X_train).fit()
model.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared (uncentered):,0.973
Model:,OLS,Adj. R-squared (uncentered):,0.973
Method:,Least Squares,F-statistic:,4309.0
Date:,"Thu, 09 Apr 2020",Prob (F-statistic):,0.0
Time:,20:42:26,Log-Likelihood:,-24197.0
No. Observations:,2049,AIC:,48430.0
Df Residuals:,2032,BIC:,48520.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Sq Ft Total,46.9789,1.427,32.925,0.000,44.181,49.777
Overall Qual,9179.7677,948.625,9.677,0.000,7319.388,1.1e+04
Lot Frontage,181.9294,36.923,4.927,0.000,109.518,254.341
Age at Sale,202.6452,548.237,0.370,0.712,-872.520,1277.811
Home Age,-492.0899,548.513,-0.897,0.370,-1567.797,583.617
Yrs Since Remod,-390.7945,46.967,-8.321,0.000,-482.902,-298.687
Has Fence,762.2670,1929.751,0.395,0.693,-3022.230,4546.764
Has Wood Deck,1459.3358,1570.880,0.929,0.353,-1621.367,4540.038
Has Masonry,2978.3670,1687.751,1.765,0.078,-331.535,6288.269

0,1,2,3
Omnibus:,645.584,Durbin-Watson:,2.043
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4469.135
Skew:,1.301,Prob(JB):,0.0
Kurtosis:,9.751,Cond. No.,43200.0


### Send Predictions to CSV:

In [212]:
# Create columns of Ids and the baseline_array
predict = pd.concat([df_test.loc[:, 'Id'], y_pred], axis=1)
predict.columns = ['Id', 'SalePrice']
predict.to_csv('../predictions/ers_predict_04.csv', index=None)

In [213]:
predict.head()

Unnamed: 0,Id,SalePrice
0,2658,123470.282855
1,2718,197012.171442
2,2414,194560.79594
3,1989,118322.74164
4,625,164601.80375
