In [16]:
import pandas as pd
import pickle
pd.set_option('display.max_columns', 300)

## Step 1: Read in hold out data, scalers, and best model

In [17]:

holdout = pd.read_csv("../datasets/kc_house_data_test_features.csv", index_col = 0)
extra = pd.read_csv("../datasets/zip_code_extra_data_fixed.csv", index_col = 0)


In [18]:
price_prediction_model=pickle.load(open("lm_final.pickle", "rb"))

FileNotFoundError: [Errno 2] No such file or directory: 'lm_final.pickle'

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [19]:
holdout.head()
holdout_date = holdout[["date"]]
holdout.drop(columns=["date"], inplace = True)

In [20]:
print(holdout.shape) 
holdout.drop(columns=["id"], inplace = True)

(4323, 19)


In [21]:
holdout_date.shape


(4323, 1)

In [22]:
holdout.columns.to_list()

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [23]:
# removes outliers - bedrooms  > 10

def cap_max(x, n, data, column):
    """
    x = the column number
    n = min number
    data = dataframe
    column = interested column from dataframe
    should be applied in a lambda function.
    For example: sales["bedrooms"].apply(lambda x: cutoff(x,10,sales,"bedrooms"))
    """
    if x > n:
        return data[column].median()
    else:
        return x

holdout["bedrooms"]= holdout["bedrooms"].apply(lambda x: cap_max(x,10,holdout,"bedrooms"))  

In [24]:
def cap_min(x, n, data, column):
    """
    x = the column number
    n = min number
    data = dataframe
    column = interested column from dataframe
    should be applied in a lambda function.
    For example: sales["bedrooms"].apply(lambda x: cutoff(x,10,sales,"bedrooms"))
    """
    if x == n:
        return data[column].median()
    else:
        return x

holdout["bathrooms"]= holdout["bathrooms"].apply(lambda x: cap_min(x,0,holdout,"bedrooms"))  

In [25]:
holdout.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


In [26]:
holdout = pd.merge(holdout, extra, how='right', on='zipcode')

In [27]:
holdout.shape

(4323, 72)

In [28]:
holdout_date

Unnamed: 0,date
0,20140827T000000
1,20150218T000000
2,20141107T000000
3,20141203T000000
4,20150115T000000
...,...
4318,20140521T000000
4319,20150223T000000
4320,20140623T000000
4321,20150116T000000


In [34]:
holdout = pd.merge(holdout, holdout_date, how='left', left_index = True, right_index = True)
holdout["month"]= pd.DatetimeIndex(holdout["date"]).month
holdout.drop(columns=["date"], inplace = True)

In [35]:
holdout.shape

(4323, 74)

In [36]:
holdout.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'full_time_earnings',
       'part_time_earnings', 'no_earnings', 'car_truck_van',
       'public_transportation', 'taxi', 'motorcycle', 'bike_walk_other',
       'work_home', 'less_than_hs', 'high_school_grad', 'associates',
       'bachelors', 'masters', 'professional_school_degree', 'doctorate',
       'public_school_enrollment', 'private_school_enrollment',
       'no_school_enrollment', 'for_rent', 'rented_and_unoccupied',
       'for_sale_only', 'sold_and_unoccupied', 'seasonal_or_rec_use',
       'migrant_worker_housing', 'vacant_other_reasons',
       'in_occupied_housing_units', 'adult_correctional_facility',
       'juvenile_facilities', 'nursing_facilities', 'other_institutional',
       'military_quarters', 'other_noninstitution

In [37]:
from geopy.distance import geodesic
downtown = (47.609862, -122.342056) 
holdout['coords'] = tuple(zip(holdout['lat'],holdout['long']))
x = holdout['coords'].apply(lambda x: geodesic(downtown,x))
holdout['miles_from_downtown'] = x.astype(str).str[:-3].astype(float)
holdout.drop(columns = "coords", inplace = True)

In [33]:
holdout.shape

(4323, 74)

In [38]:
holdout.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'full_time_earnings',
       'part_time_earnings', 'no_earnings', 'car_truck_van',
       'public_transportation', 'taxi', 'motorcycle', 'bike_walk_other',
       'work_home', 'less_than_hs', 'high_school_grad', 'associates',
       'bachelors', 'masters', 'professional_school_degree', 'doctorate',
       'public_school_enrollment', 'private_school_enrollment',
       'no_school_enrollment', 'for_rent', 'rented_and_unoccupied',
       'for_sale_only', 'sold_and_unoccupied', 'seasonal_or_rec_use',
       'migrant_worker_housing', 'vacant_other_reasons',
       'in_occupied_housing_units', 'adult_correctional_facility',
       'juvenile_facilities', 'nursing_facilities', 'other_institutional',
       'military_quarters', 'other_noninstitution

In [39]:
holdout["house_years"] = abs(holdout["yr_renovated"]-holdout["yr_built"])
holdout["house_age_2015"] = 2015 - holdout["yr_built"]


In [40]:
holdout.drop(columns= ["yr_built", "yr_renovated"], inplace = True)

In [41]:
holdout.shape

(4323, 74)

In [42]:

waterfront_dummies_holdout = pd.get_dummies(holdout["waterfront"], prefix="wfront",  drop_first = True)
view_dummies_holdout = pd.get_dummies(holdout["view"], prefix="view", drop_first = True)
grade_dummies_holdout = pd.get_dummies(holdout["grade"], prefix="grade",  drop_first = True)
zipcode_dummies_holdout = pd.get_dummies(holdout["zipcode"], prefix="zipcode",  drop_first = True)
bedroom_dummies_holdout = pd.get_dummies(holdout["bedrooms"], prefix="rooms",  drop_first = True)
bathroom_dummies_holdout = pd.get_dummies(holdout["bathrooms"], prefix="bathrooms", drop_first = True)
month_dummies_holdout = pd.get_dummies(holdout["month"], prefix="month", drop_first = True)

In [43]:
holdout = pd.concat([holdout, waterfront_dummies_holdout, view_dummies_holdout, grade_dummies_holdout, zipcode_dummies_holdout , bedroom_dummies_holdout, bathroom_dummies_holdout, month_dummies_holdout], axis=1)

In [44]:
print(holdout.shape)

(4323, 201)


In [45]:
data_fin = pd.read_csv("final_data.csv")

FileNotFoundError: [Errno 2] File b'final_data.csv' does not exist: b'final_data.csv'

In [46]:
!ls

Bakeoff_modeling_process.ipynb        Book 4 Adding more data.ipynb
Book 2.ipynb                          Exploratory Data Analysis.ipynb
Book 3 Stats Version THIS WORKS.ipynb Predict_holdout.ipynb


## Step 3: Predict the holdout set

In [None]:
selected_columns = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'grade', 'sqft_above', 'sqft_basement',
       'yr_renovated', 'lat', 'sqft_living15', 'sqft_lot15',
       'part_time_earnings', 'no_earnings', 'car_truck_van', 'bike_walk_other',
       'work_home', 'less_than_hs', 'high_school_grad', 'associates',
       'bachelors', 'masters', 'professional_school_degree', 'doctorate',
       'public_school_enrollment', 'private_school_enrollment',
       'no_school_enrollment', 'rented_and_unoccupied', 'for_sale_only',
       'sold_and_unoccupied', 'seasonal_or_rec_use', 'vacant_other_reasons',
       'in_occupied_housing_units', 'adult_correctional_facility',
       'nursing_facilities', 'other_institutional',
       'house_owned_with_mortgage', 'vacant_households', 'single_guardian',
       'male', 'female', 'median_age', 'male_median_age', 'female_median_age',
       'population', 'population_density_per_sq_mile', 'median_home_value',
       'land_area', 'water_area_sq_mile', 'median_household_income',
       'house_years', 'good_grade', 'wfront_1', 'view_1', 'view_2', 'view_3',
       'view_4', 'grade_5', 'grade_6', 'grade_7', 'grade_9', 'grade_10',
       'grade_11', 'grade_12', 'grade_13', 'zipcode_98002', 'zipcode_98003',
       'zipcode_98004', 'zipcode_98006', 'zipcode_98023', 'zipcode_98033',
       'zipcode_98039', 'zipcode_98040', 'zipcode_98042', 'zipcode_98075',
       'zipcode_98105', 'zipcode_98112', 'zipcode_98168', 'zipcode_98199',
       'rooms_2.0', 'rooms_3.0', 'rooms_4.0', 'rooms_5.0', 'bathrooms_1.0',
       'bathrooms_1.5', 'bathrooms_1.75', 'bathrooms_3.0', 'bathrooms_3.25',
       'bathrooms_3.5', 'bathrooms_3.75', 'bathrooms_4.0', 'bathrooms_4.25',
       'bathrooms_4.5', 'bathrooms_4.75', 'bathrooms_5.25', 'bathrooms_5.5',
       'bathrooms_5.75', 'bathrooms_7.75']

In [None]:
# final_answers = final_model.predict(transformed_holdout)
price_prediction=lm_final.predict(holdout[selected_columns])

## Step 4: Export your predictions

In [None]:
# final_answer.to_csv('housing_preds_your_name.csv')

final_answer_df = pd.DataFrame(price_prediction, columns=['holdout_prices'])

final_answer_df.to_csv('Dorjey_Sherpa_prediction.csv')