# Submission 03 -- Exploratory Data Analysis

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score

## Third Attempt
Using the Lasso coefficients, narrow in on better features.

In [None]:
# Create a series of column names zipped with their lasso coefficients
coef_list = pd.Series({col:coef for col,coef in zip(X_train.columns,lasso.coef_)})

# Pull the columns that did not have 0 coefficients as my new features.
features = coef_list[abs(round(coef_list,2)) > 0.0].index
features

Index(['lot_area', 'overall_qual', 'overall_cond', 'year_built',
       'year_remod/add', 'exter_qual', 'bsmt_qual', 'bsmtfin_sf_1',
       'total_bsmt_sf', 'heating_qc', '1st_flr_sf', 'gr_liv_area',
       'bsmt_full_bath', 'kitchen_qual', 'totrms_abvgrd', 'fireplace_qu',
       'garage_yr_blt', 'garage_cars', 'wood_deck_sf', 'screen_porch',
       'ms_zoning_A (agr)', 'ms_zoning_C (all)', 'ms_zoning_FV',
       'ms_zoning_RM', 'neighborhood_Crawfor', 'neighborhood_GrnHill',
       'neighborhood_MeadowV', 'neighborhood_NridgHt', 'neighborhood_Somerst',
       'neighborhood_StoneBr', 'condition_1_Norm', 'bldg_type_1Fam',
       'bldg_type_Twnhs', 'roof_style_Mansard', 'roof_matl_ClyTile',
       'exterior_1st_AsbShng', 'exterior_1st_BrkFace', 'foundation_PConc',
       'bsmt_exposure_Gd', 'bsmtfin_type_1_Unf', 'heating_Grav',
       'central_air_N', 'functional_Maj1', 'functional_Maj2', 'functional_Sal',
       'functional_Sev', 'functional_Typ', 'garage_type_Attchd',
       'paved_dri

### Use `PolynomialFeatures()` to create interaction columns.
Using the list of columns from the submission above, we are going to create polynomial columns and see how they perform.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

### Import Train, Polynomial Transform, Scale and Model

In [None]:
train = pd.read_csv('../datasets/train_clean.csv')
X = train[features]
y = np.log1p(train['saleprice'])

#### Create Polynomial Features

In [None]:
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

#### Train Test Split the Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y)

#### Scale the data

In [None]:
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

#### Instantiate and fit model.

In [None]:
lasso = LassoCV()

In [None]:
cross_val_score(lasso, Z_train, y_train, cv=5).mean()





0.8712706170234817

In [None]:
lasso.fit(Z_train, y_train)



LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [None]:
lasso.score(Z_train, y_train)

0.9348766126887255

In [None]:
lasso.score(Z_test, y_test)

0.878232542201079

### Create submission: `sub03.csv`
Now even more error due to variance. We will see how this model performance on the kaggle and then we will leave the lasso model behind for a bit.

In [None]:
# Pull clean csv file.
test = pd.read_csv('../datasets/test_clean.csv')

# Pull our features from the testing dataframe.
X = test[features]

# Transform our data
X_poly = poly.transform(X)

# Scale our data
Z_test = ss.transform(X_poly)

# Predict our prices, and convert into interpretable values.
test['SalePrice'] = np.expm1(lasso.predict(Z_test))

# Create csv
test[['id','SalePrice']].rename({'id':'Id'},axis=1).to_csv('../submissions/sub_03.csv'
                                                           ,index=False)

### Kaggle Result
- This model received a score of 21114.99264 on kaggle.
- This is more overfit than the last model and this score is 867.12327 points **worse** than `sub02`.

In [None]:
# Create a series of column names zipped with their lasso coefficients
coef_list = pd.Series({col:coef for col,coef in zip(poly.get_feature_names(features),lasso.coef_)})

# Pull the columns that did not have 0 coefficients as my new features.
features_00 = coef_list[abs(round(coef_list,2)) > 0.0].sort_values(ascending=False)