In [2]:
# Import all necessary packages.

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sb_utils import save_file

In [3]:
# Read the explored housing prices CSV into a DataFrame and display it.
# Raw original data source: https://www.kaggle.com/datasets/fedesoriano/california-housing-prices-data-extra-features

house_data = pd.read_csv("house_data_explored.csv")
house_data

Unnamed: 0,Median_House_Value,Median_Income_adj,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast_adj,Distance_to_LA_adj,Distance_to_SanDiego_adj,Distance_to_SanJose_adj,Distance_to_SanFrancisco_adj
0,452600.0,83252.0,41.000000,880,129,322,126,37.88,-122.23,5.76,345.81,457.02,41.90,13.20
1,358500.0,83014.0,21.000000,7099,1106,2401,1138,37.86,-122.22,6.35,344.41,455.61,40.42,12.97
2,352100.0,72574.0,26.983847,1467,190,496,177,37.85,-122.24,5.13,344.62,455.79,40.31,11.69
3,341300.0,56431.0,26.983847,1274,235,558,219,37.85,-122.25,4.83,344.98,456.15,40.57,11.20
4,342200.0,38462.0,26.983847,1627,280,565,259,37.85,-122.25,4.83,344.98,456.15,40.57,11.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19670,78100.0,15603.0,25.000000,1665,374,845,330,39.48,-121.09,100.68,406.71,516.13,154.42,138.33
19671,77100.0,25568.0,18.000000,697,150,356,114,39.49,-121.21,99.70,409.95,519.62,153.39,135.65
19672,92300.0,17000.0,17.000000,2254,485,1007,433,39.43,-121.22,95.54,406.40,516.17,149.24,131.79
19673,84700.0,18672.0,18.000000,1860,409,741,349,39.43,-121.32,94.45,408.68,518.64,148.01,129.20


__Key:__

 - Median house value: Median house value for households within a block
 - Median income (adj): Median income for households within a block of houses (in US dollars)
    - Originally in tens of thousands of US dollars
 - Median age: Median age of a house within a block; a lower number is a newer building
 - Total rooms: Total number of rooms within a block
 - Total bedrooms: Total number of bedrooms within a block
 - Population: Total number of people residing within a block
 - Households: Total number of households, a group of people residing within a home unit, for a block
 - Latitude: A measure of how far north a house is; a higher value is farther north
 - Longitude: A measure of how far west a house is; values here are negative (degrees east), so a more negative value is farther west
 - Distance to coast (adj): Distance to the nearest coast point (in miles)
    - Originally in meters
 - Distance to Los Angeles (adj): Distance to the centre of Los Angeles (in miles)
    - Originally in meters
 - Distance to San Diego (adj): Distance to the centre of San Diego (in miles)
    - Originally in meters
 - Distance to San Jose (adj): Distance to the centre of San Jose (in miles)
    - Originally in meters
 - Distance to San Francisco (adj): Distance to the centre of San Francisco (in miles)
    - Originally in meters

In [4]:
# Display summary data of the explored data, starting with its
# dimensions as a (rows, columns) tuple.

house_data.shape

(19675, 14)

In [5]:
# List each column with its non-null count and dtype.
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19675 entries, 0 to 19674
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Median_House_Value            19675 non-null  float64
 1   Median_Income_adj             19675 non-null  float64
 2   Median_Age                    19675 non-null  float64
 3   Tot_Rooms                     19675 non-null  int64  
 4   Tot_Bedrooms                  19675 non-null  int64  
 5   Population                    19675 non-null  int64  
 6   Households                    19675 non-null  int64  
 7   Latitude                      19675 non-null  float64
 8   Longitude                     19675 non-null  float64
 9   Distance_to_coast_adj         19675 non-null  float64
 10  Distance_to_LA_adj            19675 non-null  float64
 11  Distance_to_SanDiego_adj      19675 non-null  float64
 12  Distance_to_SanJose_adj       19675 non-null  float64
 13  Distance_to_SanFrancisco_adj  19675 non-null  float64

In [6]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
house_data.describe()

Unnamed: 0,Median_House_Value,Median_Income_adj,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast_adj,Distance_to_LA_adj,Distance_to_SanDiego_adj,Distance_to_SanJose_adj,Distance_to_SanFrancisco_adj
count,19675.0,19675.0,19675.0,19675.0,19675.0,19675.0,19675.0,19675.0,19675.0,19675.0,19675.0,19675.0,19675.0,19675.0
mean,192477.921017,36749.901027,26.983847,2619.763659,539.653113,1440.812198,501.186023,35.65178,-119.563192,26.048491,168.915463,248.443383,217.319406,240.55352
std,97711.509613,15640.327384,11.106738,2181.348207,422.294861,1143.648725,383.264636,2.149802,2.006108,30.964588,154.10766,180.514495,134.703038,155.06982
min,14999.0,4999.0,1.0,2.0,2.0,3.0,2.0,32.54,-124.35,0.07,0.26,0.3,0.35,0.28
25%,116600.0,25268.0,18.0,1438.0,297.0,796.0,282.0,33.93,-121.76,6.09,20.535,98.73,73.115,74.735
50%,173800.0,34500.0,26.983847,2111.0,436.0,1179.0,411.0,34.27,-118.5,13.24,110.11,138.7,284.59,325.95
75%,248200.0,45813.0,35.0,3120.0,648.0,1746.0,606.0,37.73,-117.99,33.055,328.765,439.795,321.805,363.915
max,500000.0,131477.0,51.0,39320.0,6445.0,35682.0,6082.0,41.95,-114.31,207.42,632.72,743.73,519.94,561.49


__Tasks to do for this step__:

   - Create dummy or indicator features for categorical variables
   - Standardize the magnitude of numeric features using a scaler
   - Split your data into testing and training datasets

__Create dummy or indicator features for categorical variables__: Since there are no categorical variables, no dummy or indicator features will be created.

__Standardize the magnitude of numeric features using a scaler__: According to their histograms (created in Step 2), none of the features appear to follow a normal distribution. We will therefore use the MinMaxScaler to scale the features.

In [16]:
# Rescale every feature column with a min-max scaler and show the result.

from sklearn.preprocessing import MinMaxScaler

# Separate the target column from the predictor columns.
y = house_data['Median_House_Value']
X = house_data.drop('Median_House_Value', axis = 1)

# NOTE(review): the scaler is fitted on all rows here, so the per-column
# minimum and maximum include rows that later end up in the test set.
# For modelling, fit the scaler on the training rows only.
scaler = MinMaxScaler()
X_sc = scaler.fit_transform(X)
X_sc

array([[0.61870839, 0.8       , 0.02233074, ..., 0.61434163, 0.0799669 ,
        0.02302169],
       [0.61682664, 0.4       , 0.18050257, ..., 0.61244502, 0.0771185 ,
        0.02261186],
       [0.53428264, 0.51967693, 0.03726029, ..., 0.61268714, 0.07690679,
        0.02033107],
       ...,
       [0.09488607, 0.32      , 0.05727657, ..., 0.69390528, 0.28655286,
        0.23433296],
       [0.10810576, 0.34      , 0.04725571, ..., 0.69722771, 0.28418561,
        0.22971793],
       [0.14933032, 0.3       , 0.07078183, ..., 0.68962781, 0.27831559,
        0.2270095 ]])

__Split your data into testing and training datasets__: We will split the data 80/20 for train/test.

In [17]:
# Split data into testing and training sets, then scale each split.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Split the *unscaled* features first. Splitting an array that was already
# scaled with statistics from every row would let the test rows influence
# the training features (data leakage). test_size and random_state are
# unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

# Learn each column's min/max from the training rows only, then apply that
# same mapping to both splits. Test values may therefore fall slightly
# outside [0, 1]; that is expected and is what unseen data will look like.
scaler = MinMaxScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [20]:
# Training features: (rows, feature columns); the split keeps 80% of the rows for training.
X_train.shape

(15740, 13)

In [23]:
# Training target: its length should match the number of rows in X_train.
y_train.shape

(15740,)

In [24]:
# Test features: (rows, feature columns); the split holds out 20% of the rows for testing.
X_test.shape

(3935, 13)

In [25]:
# Test target: its length should match the number of rows in X_test.
y_test.shape

(3935,)

In [26]:
# Save the preprocessed data as new files.

# Original output, kept so anything that already reads this file still works.
# Note that `house_data` is not modified anywhere in this notebook, so this
# file holds the unscaled, unsplit table.
save_file(house_data, 'house_data_preprocessed.csv', '../Capstone 2')

# Also persist the scaled train/test splits, which are the actual results of
# this step. X_train / X_test are plain arrays, so the feature names are
# re-attached and the target is added back as a column.
# NOTE(review): assumes save_file writes any DataFrame to CSV the same way it
# does for house_data above -- confirm against sb_utils.
feature_names = house_data.columns.drop('Median_House_Value')
for split_name, features, target in (('train', X_train, y_train), ('test', X_test, y_test)):
    split_df = pd.DataFrame(features, columns = feature_names)
    # to_numpy() drops the shuffled row index so values align by position.
    split_df['Median_House_Value'] = target.to_numpy()
    save_file(split_df, f'house_data_{split_name}.csv', '../Capstone 2')

Writing file.  "../Capstone 2\house_data_preprocessed.csv"
