In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from category_encoders import TargetEncoder
import sklearn
import zipfile

# Unzip data

In [2]:
# Extract every member of the housing-data archive into the current directory.
with zipfile.ZipFile('housedata.zip', mode='r') as archive:
    archive.extractall(path='./')

In [169]:
# Load the raw housing table from the extracted CSV and preview it.
data = pd.read_csv(filepath_or_buffer='data.csv')
data

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,3.130000e+05,3.0,1.50,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2.384000e+06,5.0,2.50,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,3.420000e+05,3.0,2.00,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,4.200000e+05,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,5.500000e+05,4.0,2.50,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,2014-07-09 00:00:00,3.081667e+05,3.0,1.75,1510,6360,1.0,0,0,4,1510,0,1954,1979,501 N 143rd St,Seattle,WA 98133,USA
4596,2014-07-09 00:00:00,5.343333e+05,3.0,2.50,1460,7573,2.0,0,0,3,1460,0,1983,2009,14855 SE 10th Pl,Bellevue,WA 98007,USA
4597,2014-07-09 00:00:00,4.169042e+05,3.0,2.50,3010,7014,2.0,0,0,3,3010,0,2009,0,759 Ilwaco Pl NE,Renton,WA 98059,USA
4598,2014-07-10 00:00:00,2.034000e+05,4.0,2.00,2090,6630,1.0,0,0,3,1070,1020,1974,0,5148 S Creston St,Seattle,WA 98178,USA


## 2.1
Determine which features are continuous vs categorical. Drop rows without a valid sales
price.

In [170]:
# NOTE: this cell rebinds `data` to a frame without the 'price' column, so it
# cannot be re-run on its own; re-run the CSV-loading cell first.

# The prediction target: sale prices of the rows with a strictly positive
# price. The comparison is False for NaN, so missing prices are excluded too.
price = data.loc[data['price'] > 0.0, 'price']

# The feature frame: the same rows, minus the target ('price') and the
# columns that are not used as features ('date', 'street', 'country').
data = (
    data
    .loc[lambda frame: frame['price'] > 0.0]
    .drop(columns=['date', 'price', 'country', 'street'])
)

In [200]:
# Group the feature columns by the kind of preprocessing each group receives.

# Numeric features that are imputed (and optionally scaled).
continuous = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Integer-coded ratings whose order is meaningful.
ordinal = [
    'condition',
    'view',
]

# Nominal features that are one-hot encoded.
categorical = [
    'waterfront',
    'city',
]

# Features that are target-encoded (NOT the prediction target, which is `price`).
target = [
    'statezip',
]

In [207]:
# Sorted distinct city names and how many rows each one has.
# (Despite the names, these describe the 'city' feature, not the prediction target.)
target_names, target_nums = np.unique(
    data['city'].to_numpy(),
    return_counts=True,
)
target_names

array(['Algona', 'Auburn', 'Beaux Arts Village', 'Bellevue',
       'Black Diamond', 'Bothell', 'Burien', 'Carnation', 'Clyde Hill',
       'Covington', 'Des Moines', 'Duvall', 'Enumclaw', 'Fall City',
       'Federal Way', 'Inglewood-Finn Hill', 'Issaquah', 'Kenmore',
       'Kent', 'Kirkland', 'Lake Forest Park', 'Maple Valley', 'Medina',
       'Mercer Island', 'Milton', 'Newcastle', 'Normandy Park',
       'North Bend', 'Pacific', 'Preston', 'Ravensdale', 'Redmond',
       'Renton', 'Sammamish', 'SeaTac', 'Seattle', 'Shoreline',
       'Skykomish', 'Snoqualmie', 'Snoqualmie Pass', 'Tukwila', 'Vashon',
       'Woodinville', 'Yarrow Point'], dtype=object)

In [55]:
# Inspect each column's dtype to help decide which features are continuous
# and which are categorical.
#
# The `categorical` and `continuous` lists are intentionally NOT reset to []
# here: they are defined in the feature-grouping cell and consumed by the
# column transformer further down, so clearing them would break a
# top-to-bottom (Restart & Run All) execution.
data.dtypes

bedrooms float64
bathrooms float64
sqft_living int64
sqft_lot int64
floors float64
waterfront int64
view int64
condition int64
sqft_above int64
sqft_basement int64
yr_built int64
yr_renovated int64
city object
statezip object


Within the dataset, the continuous features are bedrooms, bathrooms, sqft_living, sqft_lot, floors, sqft_above, sqft_basement, yr_built, and yr_renovated. The categorical features are waterfront, city, street, country, and statezip, while condition and view are ordinal. As street and country are not useful for the task of predicting price, these columns have been dropped from the dataframe.

## 2.3 
Visualize the dependency of the target on each continuous feature (2d scatter plot).

## 2.4
Split the data into training and test sets. Do not use the test set except for the final evaluation in 2.5.
Use ColumnTransformer and pipeline to encode categorical variables (your choice of
OneHotEncoder or another one from the categorical_encoder package, or both). Impute missing
values using SimpleImputer. Evaluate Linear Regression (OLS), Ridge, Lasso and ElasticNet
using cross-validation with the default parameters. Does scaling the data (within the pipeline)
with StandardScaler help? Use the preprocessing that works best going forward.

In [208]:
# Hold out a test set for the final evaluation only (default 75/25 split).
# A fixed random_state makes the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    price,
    shuffle=True,
    random_state=0,
)

In [212]:
# TODO (from the Piazza discussion): give the columns where a missing value is
# recorded as 0 their own imputer; the remaining continuous columns keep the
# median strategy used here.

# Continuous features: fill gaps with the column median.
cont_processing = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
])

# Nominal features: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros instead of
# raising, which matters when a rare city is absent from a CV training fold.
cat_processing = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', preprocessing.OneHotEncoder(handle_unknown='ignore')),
])

# High-cardinality feature(s): target encoding (needs y at fit time).
target_processing = Pipeline(steps=[
    ('encode', TargetEncoder()),
])

# Columns not named in any group (the ordinal 'condition' and 'view') are
# passed through unchanged.
preprocess = make_column_transformer(
    (cont_processing, continuous),
    (cat_processing, categorical),
    (target_processing, target),
    remainder='passthrough',
)

In [213]:
# Sanity check: fit the preprocessing on the training split and view the
# resulting (densified) design matrix. y_train is required by the target encoder.
X_new = pd.DataFrame(preprocess.fit_transform(X_train, y_train).toarray())
X_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,2.0,1.00,790.0,6969.0,1.0,790.0,0.0,1955.0,1984.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.268470e+05,0.0,3.0
1,3.0,2.50,1900.0,7604.0,2.0,1900.0,0.0,1990.0,2009.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.019477e+05,0.0,3.0
2,5.0,2.00,2700.0,10875.0,1.0,1540.0,1160.0,1962.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.684805e+05,0.0,4.0
3,4.0,3.50,4230.0,20377.0,2.0,4230.0,0.0,1997.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.315566e+06,0.0,3.0
4,3.0,2.50,1970.0,23180.0,1.0,1100.0,870.0,1937.0,1998.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.527002e+05,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3408,3.0,1.50,1270.0,1443.0,3.0,1270.0,0.0,2007.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.897857e+05,0.0,3.0
3409,2.0,1.00,970.0,5500.0,1.0,970.0,0.0,1956.0,2001.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.399148e+05,0.0,3.0
3410,5.0,2.75,2080.0,13189.0,2.0,2080.0,0.0,1987.0,2000.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.269021e+05,0.0,3.0
3411,4.0,2.50,3070.0,34412.0,1.0,2070.0,1000.0,1950.0,1983.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.978214e+06,3.0,4.0


In [214]:
# Ordinary least squares on the unscaled features: mean of the 10-fold CV
# scores (the regressor's default score, R^2).
lin_reg_pipe = make_pipeline(preprocess, LinearRegression())
lin_reg = cross_val_score(estimator=lin_reg_pipe, X=X_train, y=y_train, cv=10)
lin_reg.mean()

0.5895763461731346

In [192]:
# Ridge regression (default alpha) on the unscaled features: per-fold scores
# from 10-fold CV.
ridge_pipe = make_pipeline(preprocess, Ridge())
ridge = cross_val_score(estimator=ridge_pipe, X=X_train, y=y_train, cv=10)
ridge

array([0.7646654 , 0.72344808, 0.76747851, 0.15638149, 0.72453778,
       0.66288797, 0.72655455, 0.6403621 , 0.00600571, 0.70840771])

In [193]:
# Lasso (default parameters) on the unscaled features: mean of the 10-fold CV
# scores. The recorded output shows coordinate-descent warnings for every fold,
# presumably convergence warnings caused by the unscaled inputs.
lasso_pipe = make_pipeline(preprocess, Lasso())
lasso = cross_val_score(estimator=lasso_pipe, X=X_train, y=y_train, cv=10)
lasso.mean()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


0.5879447620126067

In [195]:
# ElasticNet on the unscaled features with a raised iteration cap: per-fold
# scores from 10-fold CV.
en_pipe = make_pipeline(preprocess, ElasticNet(max_iter=5000))
en = cross_val_score(estimator=en_pipe, X=X_train, y=y_train, cv=10)
en

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


array([0.73208464, 0.69486342, 0.77095691, 0.16226123, 0.7177958 ,
       0.65628253, 0.66853112, 0.61511887, 0.00273721, 0.70334743])

Add scaling on continuous data

In [None]:
# Same preprocessing as the unscaled baseline, with one change: the continuous
# features are standardised after imputation. Everything else is kept identical
# so that any change in CV score can be attributed to scaling alone.

# Continuous features: median imputation, then zero-mean / unit-variance scaling.
cont_processing = Pipeline([('impute', SimpleImputer(strategy='median')),
                            ('scale', preprocessing.StandardScaler())])

# Nominal features: as in the baseline. handle_unknown='ignore' keeps
# cross-validation from failing when a rare city is missing from a training fold.
cat_processing = Pipeline([('impute', SimpleImputer(strategy='most_frequent')),
                           ('encode', preprocessing.OneHotEncoder(handle_unknown='ignore'))])

# 'statezip' is target-encoded and the ordinal columns are passed through, as
# in the baseline; without these the scaled models would see fewer features.
preprocess_with_ss = make_column_transformer((cat_processing, categorical),
                                             (cont_processing, continuous),
                                             (Pipeline([('encode', TargetEncoder())]), target),
                                             remainder='passthrough')

In [None]:
# OLS with scaled continuous features. This is a regression task, so the
# estimator is LinearRegression (LogisticRegression is a classifier and is not
# imported in this notebook). cv=10 matches the unscaled run so the two mean
# scores are directly comparable.
lin_reg_pipe = make_pipeline(preprocess_with_ss, LinearRegression())
lin_reg = cross_val_score(lin_reg_pipe, X_train, y_train, cv=10)
np.mean(lin_reg)

In [None]:
# Ridge with scaled continuous features. cv=10 matches the unscaled run so the
# scores are comparable; the mean is displayed as the cell's result.
ridge_pipe = make_pipeline(preprocess_with_ss, Ridge())
ridge = cross_val_score(ridge_pipe, X_train, y_train, cv=10)
np.mean(ridge)

In [None]:
# Lasso with scaled continuous features. cv=10 matches the unscaled run so the
# scores are comparable; the mean is displayed as the cell's result.
lasso_pipe = make_pipeline(preprocess_with_ss, Lasso())
lasso = cross_val_score(lasso_pipe, X_train, y_train, cv=10)
np.mean(lasso)

In [None]:
# ElasticNet (default parameters) with scaled continuous features. cv=10
# matches the unscaled run so the scores are comparable; the mean is displayed
# as the cell's result.
en_pipe = make_pipeline(preprocess_with_ss, ElasticNet())
en = cross_val_score(en_pipe, X_train, y_train, cv=10)
np.mean(en)