In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from math import radians, sin, cos, sqrt, atan2
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [24]:
# Load and filter the data
df = pd.read_csv('kc_house_data.csv')
print("Initial DataFrame:")
print(df.head())

Initial DataFrame:
           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view  ...  grade  sqft_above  sqft_basement  \
0      5650     1.0           0     0  ...      7        1180              0   
1      7242     2.0           0     0  ...      7        2170            400   
2     10000     1.0           0     0  ...      6         770              0   
3      5000     1.0           0     0  ...      7        1050            910   
4      8080     1.0           0     0  ...      8        1680              0   

   yr_built  yr_renovated  zipc

In [25]:
# Filter homes with price between 100,000 and 2,000,000
df = df[(df['price'] >= 100000) & (df['price'] <= 2000000)]
print("\nFiltered DataFrame (100,000 <= price <= 2,000,000):")
print(df.head())

df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
print("\nDataFrame with year and month extracted:")
print(df.head())


Filtered DataFrame (100,000 <= price <= 2,000,000):
           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view  ...  grade  sqft_above  sqft_basement  \
0      5650     1.0           0     0  ...      7        1180              0   
1      7242     2.0           0     0  ...      7        2170            400   
2     10000     1.0           0     0  ...      6         770              0   
3      5000     1.0           0     0  ...      7        1050            910   
4      8080     1.0           0     0  ...      8        1680              0  

In [26]:
# Define features and target
features = ['sqft_living', 'yr_built', 'grade', 'zipcode', 'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 
            'sqft_above', 'sqft_basement', 'yr_renovated', 'year', 'month', 'lat', 'long']
target = 'price'
X = df[features]
y = df[target]
print("\nFeatures DataFrame (X):")
print(X.head())
print("\nTarget Series (y):")
print(y.head())



Features DataFrame (X):
   sqft_living  yr_built  grade  zipcode  bedrooms  bathrooms  sqft_lot  \
0         1180      1955      7    98178         3       1.00      5650   
1         2570      1951      7    98125         3       2.25      7242   
2          770      1933      6    98028         2       1.00     10000   
3         1960      1965      7    98136         4       3.00      5000   
4         1680      1987      8    98074         3       2.00      8080   

   floors  sqft_above  sqft_basement  yr_renovated  year  month      lat  \
0     1.0        1180              0             0  2014     10  47.5112   
1     2.0        2170            400          1991  2014     12  47.7210   
2     1.0         770              0             0  2015      2  47.7379   
3     1.0        1050            910             0  2014     12  47.5208   
4     1.0        1680              0             0  2015      2  47.6168   

      long  
0 -122.257  
1 -122.319  
2 -122.233  
3 -122.393  
4 

In [27]:
# Function to calculate distance using the Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    r = 6371  # Radius of Earth in kilometers
    return r * c

In [28]:
# Calculate distances to key locations
city_center_coords = (47.615257, -122.338356)
MSFT_coords = (47.643543, -122.130821)
Boeing_coords = (47.532733, -122.316916)

X['distance_to_Seattle'] = X.apply(lambda row: haversine(row['lat'], row['long'], *city_center_coords), axis=1)
X['distance_to_MSFT'] = X.apply(lambda row: haversine(row['lat'], row['long'], *MSFT_coords), axis=1)
X['distance_to_Boeing'] = X.apply(lambda row: haversine(row['lat'], row['long'], *Boeing_coords), axis=1)
print("\nDataFrame with distances to key locations:")
print(X[['lat', 'long', 'distance_to_Seattle', 'distance_to_MSFT', 'distance_to_Boeing']].head())




DataFrame with distances to key locations:
       lat     long  distance_to_Seattle  distance_to_MSFT  distance_to_Boeing
0  47.5112 -122.257            13.082094         17.496867            5.096588
1  47.7210 -122.319            11.847081         16.511564           20.934918
2  47.7379 -122.233            15.754191         12.983608           23.664150
3  47.5208 -122.393            11.274888         23.936895            5.864758
4  47.6168 -122.045            21.989458          7.085291           22.437740


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['distance_to_Seattle'] = X.apply(lambda row: haversine(row['lat'], row['long'], *city_center_coords), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['distance_to_MSFT'] = X.apply(lambda row: haversine(row['lat'], row['long'], *MSFT_coords), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

In [29]:
# Map zip codes to school district rankings
zip_to_district = {
    98004: 1, 98005: 1, 98006: 1, 98007: 1, 98008: 1,
    98040: 2,
    98033: 3, 98034: 3, 98072: 3, 98074: 3, 98052: 3,
    98027: 4, 98029: 4, 98075: 4, 98038: 4,
    98155: 5, 98133: 5, 98177: 5, 98160: 5,
    98070: 6,
    98038: 7
}


In [30]:
# Ensure zipcodes are strings and handle missing values
X['zipcode'] = X['zipcode'].astype(str).fillna('unknown')
X['school_district_rank'] = X['zipcode'].map(zip_to_district).fillna(8)
print("\nDataFrame with school district rankings:")
print(X[['zipcode', 'school_district_rank']].head())


DataFrame with school district rankings:
  zipcode  school_district_rank
0   98178                   8.0
1   98125                   8.0
2   98028                   8.0
3   98136                   8.0
4   98074                   8.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['zipcode'] = X['zipcode'].astype(str).fillna('unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['school_district_rank'] = X['zipcode'].map(zip_to_district).fillna(8)


In [31]:
# Define sets for urban and suburban zip codes
urban_zips = {'98101', '98102', '98103', '98104', '98105', '98106', '98107', '98108', '98109', '98112', '98115', '98116', '98117', '98118', '98119', '98121', '98122', '98125', '98126', '98133', '98134', '98136', '98144', '98146', '98148', '98154', '98155', '98158', '98160', '98161', '98164', '98166', '98168', '98174', '98177', '98178', '98188', '98198', '98199'}
suburban_zips = {'98001', '98002', '98003', '98004', '98005', '98006', '98007', '98008', '98010', '98011', '98014', '98019', '98022', '98023', '98024', '98025', '98027', '98028', '98029', '98030', '98031', '98032', '98033', '98034', '98038', '98039', '98040', '98042', '98045', '98047', '98050', '98051', '98052', '98053', '98055', '98056', '98057', '98058', '98059', '98062', '98063', '98064', '98065', '98070', '98072', '98073', '98074', '98075', '98077', '98092', '98093'}


In [32]:
# Function to classify zipcodes into urban, suburban, or rural
def classify_zipcode(zipcode):
    if zipcode in urban_zips:
        return 0  # Urban
    elif zipcode in suburban_zips:
        return 1  # Suburban
    else:
        return 2  # Rural

In [33]:
# Apply the classification function to zipcodes
X['area_type'] = X['zipcode'].apply(lambda x: classify_zipcode(x))
print("\nDataFrame with area type classification:")
print(X[['zipcode', 'area_type']].head())


DataFrame with area type classification:
  zipcode  area_type
0   98178          0
1   98125          0
2   98028          1
3   98136          0
4   98074          1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['area_type'] = X['zipcode'].apply(lambda x: classify_zipcode(x))


In [34]:
# Fetch housing density from Census API
def fetch_census_data(params):
    response = requests.get("https://api.census.gov/data/2020/acs/acs5", params=params)
    return response.json() if response.status_code == 200 else None


In [35]:
# Census API parameters and key
api_key = 'da7546bc69f821495474c275ba74ac0c5ba32a7c'
params_housing = {"get": "B25001_001E", "for": "zip code tabulation area:*", "key": api_key}
params_population = {"get": "B01003_001E", "for": "zip code tabulation area:*", "key": api_key}


In [36]:
# Fetch housing and population data
housing_data = fetch_census_data(params_housing)
population_data = fetch_census_data(params_population)

In [37]:
if housing_data and population_data:
    df_housing = pd.DataFrame(housing_data[1:], columns=housing_data[0])
    df_population = pd.DataFrame(population_data[1:], columns=population_data[0])
    df_housing['B25001_001E'] = pd.to_numeric(df_housing['B25001_001E'])
    df_population['B01003_001E'] = pd.to_numeric(df_population['B01003_001E'])
    df_housing['zip code tabulation area'] = df_housing['zip code tabulation area'].astype(str)
    df_population['zip code tabulation area'] = df_population['zip code tabulation area'].astype(str)
    print("\nHousing DataFrame:")
    print(df_housing.head())
    print("\nPopulation DataFrame:")
    print(df_population.head())


Housing DataFrame:
   B25001_001E zip code tabulation area
0         7282                    00601
1        17510                    00602
2        24453                    00603
3         2789                    00606
4        12454                    00610

Population DataFrame:
   B01003_001E zip code tabulation area
0        16773                    00601
1        37083                    00602
2        45652                    00603
3         6231                    00606
4        26502                    00610


In [39]:
    # List of King County zip codes
    king_county_zip_codes = [
        '98001', '98002', '98003', '98004', '98005', '98006', '98007', '98008', '98010', '98011', '98014', '98019', 
        '98022', '98023', '98024', '98025', '98027', '98028', '98029', '98030', '98031', '98032', '98033', '98034',
        '98038', '98039', '98040', '98042', '98045', '98047', '98050', '98051', '98052', '98053', '98055', '98056',
        '98057', '98058', '98059', '98062', '98063', '98064', '98065', '98070', '98072', '98073', '98074', '98075',
        '98077', '98092', '98093', '98101', '98102', '98103', '98104', '98105', '98106', '98107', '98108', '98109',
        '98112', '98115', '98116', '98117', '98118', '98119', '98121', '98122', '98125', '98126', '98133', '98134',
        '98136', '98144', '98146', '98148', '98154', '98155', '98158', '98160', '98161', '98164', '98166', '98168',
        '98174', '98177', '98178', '98188', '98198', '98199'
    ]

# Filter data for King County zip codes
    df_housing_king = df_housing[df_housing['zip code tabulation area'].isin(king_county_zip_codes)]
    df_population_king = df_population[df_population['zip code tabulation area'].isin(king_county_zip_codes)]
# Merge housing and population data
    df_merged = pd.merge(df_housing_king, df_population_king, on='zip code tabulation area')
    df_merged['housing_density'] = df_merged['B25001_001E'] / df_merged['B01003_001E']
    df_merged.rename(columns={'zip code tabulation area': 'zipcode'}, inplace=True)
    
      # Merge housing density data with main DataFrame
    X = pd.merge(X, df_merged[['zipcode', 'housing_density']], on='zipcode', how='left')
    print("\nDataFrame with housing density:")
    print(X.head())



DataFrame with housing density:
   sqft_living  yr_built  grade zipcode  bedrooms  bathrooms  sqft_lot  \
0         1180      1955      7   98178         3       1.00      5650   
1         2570      1951      7   98125         3       2.25      7242   
2          770      1933      6   98028         2       1.00     10000   
3         1960      1965      7   98136         4       3.00      5000   
4         1680      1987      8   98074         3       2.00      8080   

   floors  sqft_above  sqft_basement  ...  month      lat     long  \
0     1.0        1180              0  ...     10  47.5112 -122.257   
1     2.0        2170            400  ...     12  47.7210 -122.319   
2     1.0         770              0  ...      2  47.7379 -122.233   
3     1.0        1050            910  ...     12  47.5208 -122.393   
4     1.0        1680              0  ...      2  47.6168 -122.045   

   distance_to_Seattle  distance_to_MSFT  distance_to_Boeing  \
0            13.082094         17.496

In [40]:
# Drop latitude and longitude columns as they are no longer needed
X = X.drop(columns=['lat', 'long'])

In [43]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [44]:

# Define numerical columns (all columns in this case)
#numerical_cols = X.columns

In [45]:
# Define preprocessing pipeline for numerical data
#numerical_transformer = Pipeline(steps=[
 #   ('imputer', SimpleImputer(strategy='median')),
  #  ('scaler', StandardScaler()),
   # ('power', PowerTransformer(method='yeo-johnson')),
    #('poly', PolynomialFeatures(degree=2, include_bias=False))
#])


In [60]:
# Define preprocessing pipeline for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('power', PowerTransformer(method='yeo-johnson')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

In [61]:
# Combine preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, features)])


In [71]:

# Define the Gradient Boosting model and parameter grid for hyperparameter tuning
model = GradientBoostingRegressor()
param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 4]
}

In [72]:
# Set up the pipeline
#pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

In [73]:
print(X_train.shape)
print(y_train.shape)
print(X_train.dtypes)

(17112, 20)
(17112,)
sqft_living               int64
yr_built                  int64
grade                     int64
zipcode                  object
bedrooms                  int64
bathrooms               float64
sqft_lot                  int64
floors                  float64
sqft_above                int64
sqft_basement             int64
yr_renovated              int64
year                      int32
month                     int32
distance_to_Seattle     float64
distance_to_MSFT        float64
distance_to_Boeing      float64
school_district_rank    float64
area_type                 int64
housing_density_x       float64
housing_density_y       float64
dtype: object


In [74]:
print(X_train.isnull().sum())
print(y_train.isnull().sum())

sqft_living             0
yr_built                0
grade                   0
zipcode                 0
bedrooms                0
bathrooms               0
sqft_lot                0
floors                  0
sqft_above              0
sqft_basement           0
yr_renovated            0
year                    0
month                   0
distance_to_Seattle     0
distance_to_MSFT        0
distance_to_Boeing      0
school_district_rank    0
area_type               0
housing_density_x       0
housing_density_y       0
dtype: int64
0


In [75]:
print(X_train.head())
print(y_train.head())

       sqft_living  yr_built  grade zipcode  bedrooms  bathrooms  sqft_lot  \
4774          1350      1964      7   98103         4       1.00      9000   
5749          1890      1974      7   98117         3       1.75      3825   
10091         1750      1952      7   98115         3       1.50      5400   
12189         1350      1912      7   98107         4       1.00      3333   
930           1490      1915      7   98136         4       3.00      6766   

       floors  sqft_above  sqft_basement  yr_renovated  year  month  \
4774      1.5        1350              0             0  2014      7   
5749      1.0        1290            600             0  2014      5   
10091     1.0        1050            700             0  2015      5   
12189     1.5        1350              0             0  2014     12   
930       1.5        1490              0             0  2014     10   

       distance_to_Seattle  distance_to_MSFT  distance_to_Boeing  \
4774              8.913098         1

In [77]:
# Perform hyperparameter tuning using RandomizedSearchCV
grid_search = RandomizedSearchCV(model, param_grid, cv=3, scoring='r2', n_jobs=-1, random_state=42, n_iter=4)
grid_search.fit(X_train, y_train)

In [78]:
 #Get the best model and parameters
best_model = grid_search.best_estimator_
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best R2 score: {grid_search.best_score_}")


Best parameters: {'n_estimators': 150, 'max_depth': 4, 'learning_rate': 0.1}
Best R2 score: 0.8620947492408363


In [79]:
# Evaluate models on the test set
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"Test R2 score for {name}: ", r2_score(y_test, y_pred))

NameError: name 'best_models' is not defined

In [None]:
# Feature importance from Random Forest
numerical_feature_names = preprocessor.transformers_[0][1].named_steps['poly'].get_feature_names_out(numerical_cols)
categorical_feature_names = preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_cols)
feature_names = list(numerical_feature_names) + list(categorical_feature_names)

feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': best_models['RandomForest'].named_steps['model'].feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance.head(20))