In [59]:
# Import necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import pickle
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.expand_frame_repr', False)  # Do not wrap lines
# pd.set_option('display.width', 0)  # Automatically adjust the width to the terminal

In [60]:
# Import dataset
df = pd.read_csv('american_housing_data.csv')

In [61]:
# View the data and check for missing values
print(df.describe().T)
print(df.isna().sum())

                           count           mean            std          min           25%           50%           75%           max
Zip Code                 39981.0   64833.391336   25614.601116  10013.00000   40215.00000   74136.00000   85730.00000  9.819900e+04
Price                    39981.0  622777.118481  946979.305294   1800.00000  265000.00000  399900.00000  674990.00000  3.800000e+07
Beds                     39981.0       3.171682       1.308796      1.00000       3.00000       3.00000       4.00000  5.400000e+01
Baths                    39981.0       2.466572       1.323042      1.00000       2.00000       2.00000       3.00000  6.600000e+01
Living Space             39981.0    1901.522723    1211.307257      2.00000    1200.00000    1639.00000    2265.00000  7.434000e+04
Zip Code Population      39981.0   37726.201996   18672.647445      0.00000   24465.00000   35049.00000   46816.00000  1.164690e+05
Zip Code Density         39981.0    2379.412483    2946.574792      0.00000 

In [62]:
# Drop missing values in Median Household Income column
df = df.dropna(subset=['Median Household Income'])

# Drop Latitude and Longitude columns
df.drop(columns=["Latitude", "Longitude"], inplace=True)
print(df.head())

   Zip Code      Price  Beds  Baths  Living Space                Address      City     State  Zip Code Population  Zip Code Density    County  Median Household Income
0     10013  3999000.0     2      3          1967      74 GRAND ST APT 3  New York  New York                29563           20967.9  New York                 370046.0
1     10013  3999000.0     2      3          1967      74 GRAND ST APT 3  New York  New York                29563           20967.9  New York                 370046.0
2     10014  1650000.0     1      1           718  140 CHARLES ST APT 4D  New York  New York                29815           23740.9  New York                 249880.0
3     10014   760000.0     3      2          1538            38 JONES ST  New York  New York                29815           23740.9  New York                 249880.0
4     10014  1100000.0     1      1           600   81 BEDFORD ST APT 3F  New York  New York                29815           23740.9  New York                 249880.

In [63]:

# 75th percentile
seventy_fifth = df["Price"].quantile(0.75)

# 25th percentile
twenty_fifth = df["Price"].quantile(0.25)

# Interquartile range
prices_iqr = seventy_fifth - twenty_fifth

# Upper threshold
upper = seventy_fifth + (1.5 * prices_iqr)

# Lower threshold
lower = twenty_fifth - (1.5 * prices_iqr)

df_mod = df[(df["Price"] > lower) & (df["Price"] < upper)]
# Compare datasets before and after removing outliers
print(df.describe().T) # Before
print(df_mod.describe().T) # After


                           count           mean            std      min       25%       50%       75%         max
Zip Code                 39979.0   64832.585632   25614.988511  10013.0   40215.0   74136.0   85730.0     98199.0
Price                    39979.0  622782.635233  947002.598473   1800.0  265000.0  399900.0  674990.0  38000000.0
Beds                     39979.0       3.171615       1.308752      1.0       3.0       3.0       4.0        54.0
Baths                    39979.0       2.466520       1.323050      1.0       2.0       2.0       3.0        66.0
Living Space             39979.0    1901.505090    1211.321442      2.0    1200.0    1639.0    2265.0     74340.0
Zip Code Population      39979.0   37728.089297   18671.207769     39.0   24465.0   35049.0   46835.0    116469.0
Zip Code Density         39979.0    2379.531517    2946.600433      0.6     902.4    1588.7    2736.8     58289.6
Median Household Income  39979.0  110837.259861   47309.055715  27475.0   76640.0  10040

In [64]:
# prepare for modeling

# Store ["Beds", "Baths", "Living Space", "Zip Code Population", "Zip Code"] columns as float values
df_mod[["Beds", "Baths", "Living Space", "Zip Code Population", "Zip Code"]] = df_mod[["Beds", "Baths", "Living Space", "Zip Code Population", "Zip Code"]].astype(float)

# Fix column name spelling error in dataset
df_mod = df_mod.rename(columns={"County":"Country"})

features = ['Beds', 'Baths', 'Living Space', 'City', 'State', 'Zip Code Population', 'Zip Code Density', 'Median Household Income']
target = 'Price'

X = df_mod[features]
y = df_mod[target]

# Preprocessing pipelines for numeric and categorical data
numeric_features = ['Beds', 'Baths', 'Living Space', 'Zip Code Population', 'Zip Code Density', 'Median Household Income']
categorical_features = ['City', 'State']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Evaluate the model
acc = model.score(X_test, y_test)
print("Accuracy: " + str(acc))

# Save the model
with open("american_housing.pickle", "wb") as f:
    pickle.dump(model, f)

# Load the model and make predictions
with open("american_housing.pickle", "rb") as f:
    model = pickle.load(f)

predicted = model.predict(X_test)
for i in range(10):
    print(f"Predicted: {predicted[i]}, Features: {X_test.iloc[i].tolist()}, Actual: {y_test.iloc[i]}")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mod[["Beds", "Baths", "Living Space", "Zip Code Population", "Zip Code"]] = df_mod[["Beds", "Baths", "Living Space", "Zip Code Population", "Zip Code"]].astype(float)


Accuracy: 0.641614708832563
Predicted: 152477.21733007726, Features: [3.0, 1.0, 1080.0, 'Columbus', 'Ohio', 46038.0, 768.8, 67173.0], Actual: 234900.0
Predicted: 494243.4763292466, Features: [1.0, 1.0, 630.0, 'Seattle', 'Washington', 52210.0, 4397.0, 167932.0], Actual: 379900.0
Predicted: 251402.3632422606, Features: [4.0, 2.0, 1968.0, 'Kansas City', 'Kansas', 30874.0, 1038.0, 61534.0], Actual: 240000.0
Predicted: 307911.927814981, Features: [3.0, 2.0, 1160.0, 'Philadelphia', 'Pennsylvania', 45642.0, 9964.3, 58668.0], Actual: 155000.0
Predicted: 715488.2217987448, Features: [3.0, 2.0, 1572.0, 'Nashville', 'Tennessee', 6481.0, 324.5, 230199.0], Actual: 649900.0
Predicted: 414480.59537895786, Features: [4.0, 3.0, 1800.0, 'Philadelphia', 'Pennsylvania', 38840.0, 2147.0, 107171.0], Actual: 419900.0
Predicted: 343944.8262224771, Features: [3.0, 2.0, 1249.0, 'Edmond', 'Oklahoma', 42003.0, 476.0, 140148.0], Actual: 256990.0
Predicted: 573274.299873845, Features: [5.0, 4.0, 3125.0, 'Manor', 'T