In [1]:
# Import the libraries
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import Birch
from sklearn.metrics import mean_squared_error, r2_score
#import shap
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
#pip install requests pandas
#pip install folium
#pip install statsmodels

In [2]:
# Import the data
df = pd.read_csv('kc_house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# Display summary statistics for the DataFrame
df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652
std,2876566000.0,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


In [4]:
# Identify object columns and convert them to numerical values
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    df[col] = df[col].astype('category').cat.codes

In [5]:
# Display data types for each column
print(df.dtypes)

id                 int64
date               int16
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object


In [6]:
# Detect missing values
na_counts = df.isna().sum()

# Display the count of missing values for each column
print(na_counts)

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64


In [7]:
df.dtypes

id                 int64
date               int16
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [8]:
# Remove Y and features not needed
X=df.drop(columns='price')
X.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,164,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,220,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,290,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,220,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,283,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [9]:
# Ensure the zipcode column is of type int64
X['zipcode'] = X['zipcode'].astype('int64')

In [10]:
X.dtypes

id                 int64
date               int16
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [11]:
# FEATURE ENGINEERING -  measure the distance from a home's location to the County/City's largest employers, Amazon, Starbucks, and Nordstums

# This computes probable commute time.


from math import radians, sin, cos, sqrt, atan2

# Function to calculate distance using Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    r = 6371  # Radius of Earth in kilometers
    return r * c

# Define city center coordinates (example: downtown coordinates)
city_center_lat = 47.615257
city_center_lon = -122.338356

# Calculate distance to city center for each row
X['distance_to_Seattle'] = X.apply(lambda row: haversine(row['lat'], row['long'], city_center_lat, city_center_lon), axis=1)

# Display the DataFrame with the new distance_to_center feature
print(X[['lat', 'long', 'distance_to_Seattle']])

           lat     long  distance_to_Seattle
0      47.5112 -122.257            13.082094
1      47.7210 -122.319            11.847081
2      47.7379 -122.233            15.754191
3      47.5208 -122.393            11.274888
4      47.6168 -122.045            21.989458
...        ...      ...                  ...
21608  47.6993 -122.346             9.362676
21609  47.5107 -122.362            11.760782
21610  47.5944 -122.299             3.752959
21611  47.5345 -122.069            22.111252
21612  47.5941 -122.299             3.773671

[21613 rows x 3 columns]


In [12]:
# FEATURE ENGINEERING -  measure the distance from a home's location to MSFT HQ
# This computes probable commute time.


# Function to calculate distance using Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    r = 6371  # Radius of Earth in kilometers
    return r * c

# Define city center coordinates (example: downtown coordinates)
MSFT_lat = 47.643543
MSFT_lon = -122.130821

# Calculate distance to city center for each row
X['distance_to_MSFT'] = X.apply(lambda row: haversine(row['lat'], row['long'], MSFT_lat, MSFT_lon), axis=1)

# Display the DataFrame with the new distance_to_center feature
print(X[['lat', 'long', 'distance_to_MSFT']])

           lat     long  distance_to_MSFT
0      47.5112 -122.257         17.496867
1      47.7210 -122.319         16.511564
2      47.7379 -122.233         12.983608
3      47.5208 -122.393         23.936895
4      47.6168 -122.045          7.085291
...        ...      ...               ...
21608  47.6993 -122.346         17.263568
21609  47.5107 -122.362         22.779616
21610  47.5944 -122.299         13.738786
21611  47.5345 -122.069         12.981182
21612  47.5941 -122.299         13.752121

[21613 rows x 3 columns]


In [13]:
# FEATURE ENGINEERING -  measure the distance from a home's location to Boeing Seatle Offices
# This computes probable commute time.


# Function to calculate distance using Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    r = 6371  # Radius of Earth in kilometers
    return r * c

# Define city center coordinates (example: downtown coordinates)
Boeing_lat = 47.532733
Boeing_lon = -122.316916

# Calculate distance to city center for each row
X['distance_to_Boeing'] = X.apply(lambda row: haversine(row['lat'], row['long'], Boeing_lat, Boeing_lon), axis=1)

# Display the DataFrame with the new distance_to_center feature
print(X[['lat', 'long', 'distance_to_Boeing']])

           lat     long  distance_to_Boeing
0      47.5112 -122.257            5.096588
1      47.7210 -122.319           20.934918
2      47.7379 -122.233           23.664150
3      47.5208 -122.393            5.864758
4      47.6168 -122.045           22.437740
...        ...      ...                 ...
21608  47.6993 -122.346           18.649261
21609  47.5107 -122.362            4.178909
21610  47.5944 -122.299            6.987580
21611  47.5345 -122.069           18.613097
21612  47.5941 -122.299            6.954848

[21613 rows x 3 columns]


In [14]:
# FEATURE ENGINEERING - Identify a house to a school district and how that school district ranks
# This coputes the quality of public education

# Define the school district rankings
school_districts = {
    1: 'Bellevue',
    2: 'Mercer Island',
    3: 'Lake Washington',
    4: 'Issaquah',
    5: 'Shoreline',
    6: 'Vashon Island',
    7: 'Tahoma',
    8: 'All Others'
}

# Define zip code to school district mapping
zip_to_district = {
    98004: 1, 98005: 1, 98006: 1, 98007: 1, 98008: 1,  # Bellevue
    98040: 2,                                        # Mercer Island
    98033: 3, 98034: 3, 98072: 3, 98074: 3, 98052: 3, # Lake Washington
    98027: 4, 98029: 4, 98075: 4, 98038: 4,           # Issaquah
    98155: 5, 98133: 5, 98177: 5, 98160: 5,           # Shoreline
    98070: 6,                                        # Vashon Island
    98038: 7                                         # Tahoma
}

# Assign 'All Others' (rank 8) to zip codes not explicitly listed
X['school_district_rank'] = X['zipcode'].map(zip_to_district).fillna(8)

# Display the DataFrame with the new school_district feature
print(X[['zipcode', 'school_district_rank']])

       zipcode  school_district_rank
0        98178                   8.0
1        98125                   8.0
2        98028                   8.0
3        98136                   8.0
4        98074                   3.0
...        ...                   ...
21608    98103                   8.0
21609    98146                   8.0
21610    98144                   8.0
21611    98027                   4.0
21612    98144                   8.0

[21613 rows x 2 columns]


In [15]:
# FEATURE ENGINEERING - Area Type - Identifying a home as being in an urban area vs suburban vs rural  

# Zip code classification
urban_zips = {'98101', '98102', '98103', '98104', '98105', '98106', '98107', '98108', '98109', '98112', '98115', '98116', '98117', '98118', '98119', '98121', '98122', '98125', '98126', '98133', '98134', '98136', '98144', '98146', '98148', '98154', '98155', '98158', '98160', '98161', '98164', '98166', '98168', '98174', '98177', '98178', '98188', '98198', '98199'}
suburban_zips = {'98001', '98002', '98003', '98004', '98005', '98006', '98007', '98008', '98010', '98011', '98014', '98019', '98022', '98023', '98024', '98025', '98027', '98028', '98029', '98030', '98031', '98032', '98033', '98034', '98038', '98039', '98040', '98042', '98045', '98047', '98050', '98051', '98052', '98053', '98055', '98056', '98057', '98058', '98059', '98062', '98063', '98064', '98065', '98070', '98072', '98073', '98074', '98075', '98077', '98092', '98093'}

def classify_zipcode(zipcode):
    if zipcode in urban_zips:
        return 0  # Urban
    elif zipcode in suburban_zips:
        return 1  # Suburban
    else:
        return 2  # Rural

# Apply the classification to the DataFrame
X['area_type'] = X['zipcode'].apply(lambda x: classify_zipcode(str(x)))

# Display the resulting DataFrame
print(X)

               id  date  bedrooms  bathrooms  sqft_living  sqft_lot  floors  \
0      7129300520   164         3       1.00         1180      5650     1.0   
1      6414100192   220         3       2.25         2570      7242     2.0   
2      5631500400   290         2       1.00          770     10000     1.0   
3      2487200875   220         4       3.00         1960      5000     1.0   
4      1954400510   283         3       2.00         1680      8080     1.0   
...           ...   ...       ...        ...          ...       ...     ...   
21608   263000018    19         3       2.50         1530      1131     3.0   
21609  6600060120   288         4       2.50         2310      5813     2.0   
21610  1523300141    52         2       0.75         1020      1350     2.0   
21611   291310100   252         3       2.50         1600      2388     2.0   
21612  1523300157   166         2       0.75         1020      1076     2.0   

       waterfront  view  condition  ...  zipcode   

In [16]:
# FEATURE ENGINEERING - Deterine the housing density by zip code via accessing the US Census data.
#  Combined with 

import requests
import pandas as pd

# Your valid U.S. Census API key
api_key = 'da7546bc69f821495474c275ba74ac0c5ba32a7c'

# Base URL for the API
base_url = "https://api.census.gov/data/2020/acs/acs5"

# Parameters for the API request for housing units
params_housing = {
    "get": "B25001_001E",  # Total housing units
    "for": "zip code tabulation area:*",
    "key": api_key
}

# Parameters for the API request for population
params_population = {
    "get": "B01003_001E",  # Total population
    "for": "zip code tabulation area:*",
    "key": api_key
}

# Function to fetch data from Census API
def fetch_census_data(params):
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        try:
            return response.json()
        except ValueError:
            print("Error: Response content is not valid JSON")
            print(response.text)
            return None
    else:
        print(f"Error: Received response with status code {response.status_code}")
        print(response.text)
        return None

# Fetch housing data
housing_data = fetch_census_data(params_housing)
if housing_data:
    # Convert the housing data to a pandas DataFrame
    columns_housing = housing_data[0]
    data_housing_rows = housing_data[1:]
    df_housing = pd.DataFrame(data_housing_rows, columns=columns_housing)
    df_housing['B25001_001E'] = pd.to_numeric(df_housing['B25001_001E'])
    df_housing['zip code tabulation area'] = df_housing['zip code tabulation area'].astype(str)

# Fetch population data
population_data = fetch_census_data(params_population)
if population_data:
    # Convert the population data to a pandas DataFrame
    columns_population = population_data[0]
    data_population_rows = population_data[1:]
    df_population = pd.DataFrame(data_population_rows, columns=columns_population)
    df_population['B01003_001E'] = pd.to_numeric(df_population['B01003_001E'])
    df_population['zip code tabulation area'] = df_population['zip code tabulation area'].astype(str)

# List of zip codes in King County (source: external reliable source or predefined list)
king_county_zip_codes = [
    '98001', '98002', '98003', '98004', '98005', '98006', '98007', '98008', '98010', '98011', '98014', '98019', 
    '98022', '98023', '98024', '98025', '98027', '98028', '98029', '98030', '98031', '98032', '98033', '98034',
    '98038', '98039', '98040', '98042', '98045', '98047', '98050', '98051', '98052', '98053', '98055', '98056',
    '98057', '98058', '98059', '98062', '98063', '98064', '98065', '98070', '98072', '98073', '98074', '98075',
    '98077', '98092', '98093', '98101', '98102', '98103', '98104', '98105', '98106', '98107', '98108', '98109',
    '98112', '98115', '98116', '98117', '98118', '98119', '98121', '98122', '98125', '98126', '98133', '98134',
    '98136', '98144', '98146', '98148', '98154', '98155', '98158', '98160', '98161', '98164', '98166', '98168',
    '98174', '98177', '98178', '98188', '98198', '98199'
]

# Filter data for King County zip codes
if housing_data and population_data:
    df_housing_king = df_housing[df_housing['zip code tabulation area'].isin(king_county_zip_codes)]
    df_population_king = df_population[df_population['zip code tabulation area'].isin(king_county_zip_codes)]
    
    # Merge the two DataFrames on the zip code tabulation area
    df_merged = pd.merge(df_housing_king, df_population_king, on='zip code tabulation area')
    
    # Calculate housing density (total housing units per population)
    df_merged['housing_density'] = df_merged['B25001_001E'] / df_merged['B01003_001E']
    
    # Rank the zip codes by housing density
    #df_merged['density_rank'] = df_merged['housing_density'].rank(ascending=False)
    
    # Display the DataFrame with the housing density and rank
    print(df_merged[['zip code tabulation area', 'housing_density']])

   zip code tabulation area  housing_density
0                     98001         0.347729
1                     98002         0.421098
2                     98003         0.390656
3                     98004         0.522045
4                     98005         0.411973
..                      ...              ...
77                    98177         0.409656
78                    98178         0.385634
79                    98188         0.423691
80                    98198         0.393395
81                    98199         0.462123

[82 rows x 2 columns]


In [17]:
# Rename column to match X DataFrame for merging
df_merged.rename(columns={'zip code tabulation area': 'zipcode'}, inplace=True)
# Convert zipcode to string for merging
X['zipcode'] = X['zipcode'].astype(str)

# Merge the housing density and rank into X based on zipcode
X = pd.merge(X, df_merged, on='zipcode', how='left')
X.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sqft_living15,sqft_lot15,distance_to_Seattle,distance_to_MSFT,distance_to_Boeing,school_district_rank,area_type,B25001_001E,B01003_001E,housing_density
0,7129300520,164,3,1.0,1180,5650,1.0,0,0,3,...,1340,5650,13.082094,17.496867,5.096588,8.0,0,10222,26507,0.385634
1,6414100192,220,3,2.25,2570,7242,2.0,0,0,3,...,1690,7639,11.847081,16.511564,20.934918,8.0,0,20946,44030,0.475721
2,5631500400,290,2,1.0,770,10000,1.0,0,0,3,...,2720,8062,15.754191,12.983608,23.66415,8.0,1,9666,22953,0.421121
3,2487200875,220,4,3.0,1960,5000,1.0,0,0,5,...,1360,5000,11.274888,23.936895,5.864758,8.0,0,8272,17083,0.484224
4,1954400510,283,3,2.0,1680,8080,1.0,0,0,3,...,1800,7503,21.989458,7.085291,22.43774,3.0,1,9779,29349,0.333197


In [18]:
# Drop features
columns_to_drop=['id','lat','long','B25001_001E','B01003_001E','date','sqft_above','sqft_basement','distance_to_Seattle','area_type']
X=X.drop(columns=columns_to_drop)
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15,distance_to_MSFT,distance_to_Boeing,school_district_rank,housing_density
0,3,1.0,1180,5650,1.0,0,0,3,7,1955,0,98178,1340,5650,17.496867,5.096588,8.0,0.385634
1,3,2.25,2570,7242,2.0,0,0,3,7,1951,1991,98125,1690,7639,16.511564,20.934918,8.0,0.475721
2,2,1.0,770,10000,1.0,0,0,3,6,1933,0,98028,2720,8062,12.983608,23.66415,8.0,0.421121
3,4,3.0,1960,5000,1.0,0,0,5,7,1965,0,98136,1360,5000,23.936895,5.864758,8.0,0.484224
4,3,2.0,1680,8080,1.0,0,0,3,8,1987,0,98074,1800,7503,7.085291,22.43774,3.0,0.333197


In [19]:
# Ensure the zipcode column is of type int64
X['zipcode'] = X['zipcode'].astype('int64')

In [20]:
X.dtypes

bedrooms                  int64
bathrooms               float64
sqft_living               int64
sqft_lot                  int64
floors                  float64
waterfront                int64
view                      int64
condition                 int64
grade                     int64
yr_built                  int64
yr_renovated              int64
zipcode                   int64
sqft_living15             int64
sqft_lot15                int64
distance_to_MSFT        float64
distance_to_Boeing      float64
school_district_rank    float64
housing_density         float64
dtype: object

In [21]:
# Verify that there are no remaining object types
object_columns_after_encoding = X.select_dtypes(include=['object']).columns
print("Columns with object data types after encoding:", object_columns_after_encoding)

Columns with object data types after encoding: Index([], dtype='object')


In [22]:
y=df['price'].values.reshape(-1,1)
y

array([[221900.],
       [538000.],
       [180000.],
       ...,
       [402101.],
       [400000.],
       [325000.]])

In [23]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
X_train.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15,distance_to_MSFT,distance_to_Boeing,school_district_rank,housing_density
6325,3,1.75,1780,13095,1.0,0,0,4,9,1983,0,98042,2750,13095,30.791302,22.212432,8.0,0.356163
13473,2,1.0,1000,3700,1.0,0,0,3,6,1929,0,98118,1270,5000,15.686701,2.944773,8.0,0.38415
17614,3,1.0,1080,7486,1.5,0,0,3,6,1942,0,98146,1170,7800,23.456443,5.608079,8.0,0.396713
16970,3,2.25,2090,7500,1.0,0,0,4,7,1977,0,98031,1800,7350,27.798133,18.785413,8.0,0.339124
20868,2,2.5,1741,1439,2.0,0,0,3,8,2007,0,98034,2090,10454,8.93897,20.721273,3.0,0.428337


In [25]:
X_train.dtypes

bedrooms                  int64
bathrooms               float64
sqft_living               int64
sqft_lot                  int64
floors                  float64
waterfront                int64
view                      int64
condition                 int64
grade                     int64
yr_built                  int64
yr_renovated              int64
zipcode                   int64
sqft_living15             int64
sqft_lot15                int64
distance_to_MSFT        float64
distance_to_Boeing      float64
school_district_rank    float64
housing_density         float64
dtype: object

In [26]:
print(X_train.columns)

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'yr_built', 'yr_renovated',
       'zipcode', 'sqft_living15', 'sqft_lot15', 'distance_to_MSFT',
       'distance_to_Boeing', 'school_district_rank', 'housing_density'],
      dtype='object')


In [27]:
# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
# Replace this with the actual list of feature names before scaling and encoding
original_feature_names = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'yr_built', 'yr_renovated', 'zipcode', 'sqft_living15',
       'sqft_lot15', 'distance_to_MSFT',
       'distance_to_Boeing', 'school_district_rank',
       'housing_density'
]

# Check if the number of features matches the shape of X_train_scaled
if isinstance(X_train_scaled, np.ndarray):
    num_columns = X_train_scaled.shape[1]
else:
    num_columns = X_train_scaled.shape[1]

print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Number of original feature names:", len(original_feature_names))


Shape of X_train_scaled: (17290, 18)
Number of original feature names: 18


In [29]:
# Convert the scaled features back to a DataFrame with the adjusted feature names
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=original_feature_names)
X_train_scaled_df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15,distance_to_MSFT,distance_to_Boeing,school_district_rank,housing_density
0,-0.395263,-0.474451,-0.323933,-0.043873,-0.9196,-0.084992,-0.305917,0.909073,1.150243,0.404001,-0.208294,-0.674631,1.126073,0.01344,1.149715,0.430608,0.620098,-0.994051
1,-1.468964,-1.452583,-1.183653,-0.285775,-0.9196,-0.084992,-0.305917,-0.625426,-1.413156,-1.430565,-0.208294,0.750604,-1.046523,-0.280662,-0.362172,-1.724344,0.620098,-0.525895
2,-0.395263,-1.452583,-1.095477,-0.188293,0.001545,-0.084992,-0.305917,-0.625426,-1.413156,-0.98891,-0.208294,1.275691,-1.19332,-0.178934,0.415536,-1.426472,0.620098,-0.315741
3,-0.395263,0.177636,0.017751,-0.187933,-0.9196,-0.084992,-0.305917,0.909073,-0.55869,0.20016,-0.208294,-0.880915,-0.268498,-0.195283,0.850115,0.04732,0.620098,-1.279065
4,-1.468964,0.50368,-0.366919,-0.343991,0.92269,-0.084992,-0.305917,-0.625426,0.295777,1.219364,-0.208294,-0.824656,0.157213,-0.082511,-1.037582,0.263832,-1.522144,0.213252


In [30]:


# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["feature"] = X_train_scaled_df.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_scaled_df.values, i) for i in range(X_train_scaled_df.shape[1])]

print(vif_data)

                 feature       VIF
0               bedrooms  1.664397
1              bathrooms  3.302012
2            sqft_living  5.017295
3               sqft_lot  2.209276
4                 floors  1.743392
5             waterfront  1.192743
6                   view  1.399618
7              condition  1.250729
8                  grade  3.368298
9               yr_built  2.593756
10          yr_renovated  1.151174
11               zipcode  1.641318
12         sqft_living15  2.969164
13            sqft_lot15  2.240071
14      distance_to_MSFT  2.024553
15    distance_to_Boeing  1.620364
16  school_district_rank  2.047767
17       housing_density  1.390647
