In [71]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

my_dir = os.path.dirname(os.path.abspath('__file__'))
os.chdir(my_dir)

data = 'data/cleaned data/zillow_listings_combined_clean.csv'

In [72]:
df = pd.read_csv(data)

In [73]:
df.head()

Unnamed: 0,city,state,zip,year built,cooling type,garage type,sqft,price per sqft,buyers fee,listing price,housing category,HOA yearly
0,Hillsboro,OR,97123,2023,Central Air,1 Attached Garage space,3049.0,428,2,519900,single family,186
1,Hillsboro,OR,97123,2023,Central Air,2 Attached Garage spaces,,320,2,729900,single family,91
2,Hillsboro,OR,97123,2023,ENERGY STAR Qualified Equipment,2 Attached Garage spaces,5227.0,379,2,689900,single family,147
3,Hillsboro,OR,97123,2023,Central Air,2 Attached Garage spaces,,329,2,499900,townhouse,186
4,Hillsboro,OR,97123,2023,ENERGY STAR Qualified Equipment,2 Garage spaces,3049.0,312,2,599900,single family,147


In [74]:
cols_to_drop = ['city', 'state', 'price per sqft']
df = df.drop(cols_to_drop, axis = 1)
categorical_columns = ['cooling type','garage type', 'housing category']

# Perform one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)


In [75]:
df.describe()

Unnamed: 0,zip,year built,sqft,listing price,HOA yearly
count,987.0,987.0,553.0,987.0,987.0
mean,97700.37386,1980.931104,5940.220615,853761.9,58.10233
std,583.197576,36.198396,2328.450049,911507.1,144.808451
min,97003.0,1884.0,1.0,59900.0,0.0
25%,97218.5,1956.0,4739.0,450000.0,0.0
50%,97402.0,1990.0,5998.0,599999.0,0.0
75%,98133.5,2010.0,7457.0,869995.0,10.0
max,98686.0,2025.0,10454.0,11500000.0,989.0


In [76]:
df_encoded.describe()

Unnamed: 0,zip,year built,sqft,listing price,HOA yearly
count,987.0,987.0,553.0,987.0,987.0
mean,97700.37386,1980.931104,5940.220615,853761.9,58.10233
std,583.197576,36.198396,2328.450049,911507.1,144.808451
min,97003.0,1884.0,1.0,59900.0,0.0
25%,97218.5,1956.0,4739.0,450000.0,0.0
50%,97402.0,1990.0,5998.0,599999.0,0.0
75%,98133.5,2010.0,7457.0,869995.0,10.0
max,98686.0,2025.0,10454.0,11500000.0,989.0


In [77]:
nan_rows = df_encoded.isna().any(axis=1)
print(df_encoded[nan_rows])

       zip  year built  sqft buyers fee  listing price  HOA yearly   
1    97123        2023   NaN          2         729900          91  \
3    97123        2023   NaN          2         499900         186   
6    98665        2023   NaN          2         525995          60   
7    98682        2023   NaN          2         604900         132   
8    98101        2019   NaN         3         1279900           0   
..     ...         ...   ...        ...            ...         ...   
951  98103        2002   NaN         3          625000         560   
952  98404        2002   NaN       2.5          549999         559   
968  98103        2015   NaN       3.0          950000           0   
974  98012        2006   NaN       2.5         1080000           0   
976  98021        1983   NaN         3          965000           0   

     garage type_1 Attached garage space  garage type_1 Carport space   
1                                  False                        False  \
3            

In [78]:
df_encoded[nan_rows]

Unnamed: 0,zip,year built,sqft,buyers fee,listing price,HOA yearly,garage type_1 Attached garage space,garage type_1 Carport space,garage type_1 Covered parking space,garage type_1 Garage space,...,"garage type_Off street, secured","garage type_Off street, secured, condo garage (rent), garage available",garage type_Open parking,"garage type_Rv parking, driveway","garage type_Rv parking, driveway, off street","garage type_Rv parking, none","garage type_Rv parking, off street",housing category_manufactured,housing category_single family,housing category_townhouse
1,97123,2023,,2,729900,91,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,97123,2023,,2,499900,186,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
6,98665,2023,,2,525995,60,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
7,98682,2023,,2,604900,132,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
8,98101,2019,,3,1279900,0,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
951,98103,2002,,3,625000,560,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
952,98404,2002,,2.5,549999,559,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
968,98103,2015,,3.0,950000,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
974,98012,2006,,2.5,1080000,0,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [79]:
df_encoded = df_encoded.astype(float)  # Convert to float
df_encoded = df_encoded.fillna(0)  # Fill NaN values with 0
df_encoded = df_encoded.astype(int) 

In [80]:
df_encoded.head()

Unnamed: 0,zip,year built,sqft,buyers fee,listing price,HOA yearly,garage type_1 Attached garage space,garage type_1 Carport space,garage type_1 Covered parking space,garage type_1 Garage space,...,"garage type_Off street, secured","garage type_Off street, secured, condo garage (rent), garage available",garage type_Open parking,"garage type_Rv parking, driveway","garage type_Rv parking, driveway, off street","garage type_Rv parking, none","garage type_Rv parking, off street",housing category_manufactured,housing category_single family,housing category_townhouse
0,97123,2023,3049,2,519900,186,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,97123,2023,0,2,729900,91,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,97123,2023,5227,2,689900,147,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,97123,2023,0,2,499900,186,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,97123,2023,3049,2,599900,147,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [81]:

# Assuming you have a 'target' column containing the target labels
X = df_encoded.drop('listing price', axis=1)  # Features
y = df_encoded['listing price']  # Target labels

# Initialize the Random Forest classifier
random_forest = RandomForestClassifier(random_state=42)

# Perform cross-validation with 5 folds
cv_scores = cross_val_score(random_forest, X, y, cv=5)

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())




Cross-Validation Scores: [0.00505051 0.         0.         0.00507614 0.00507614]
Mean Accuracy: 0.003040557862892888


In [83]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [87]:
df.head()

Unnamed: 0,zip,year built,garage type,sqft,buyers fee,listing price,housing category,HOA yearly
0,97123,2023,1 Attached Garage space,3049.0,2,519900,single family,186
1,97123,2023,2 Attached Garage spaces,3049.0,2,729900,single family,91
2,97123,2023,2 Attached Garage spaces,5227.0,2,689900,single family,147
3,97123,2023,2 Attached Garage spaces,5227.0,2,499900,townhouse,186
4,97123,2023,2 Garage spaces,3049.0,2,599900,single family,147


In [89]:
# Fill missing values
df = df.fillna(method='ffill')  # Forward fill (use the previous row's value)

# Convert categorical columns to numerical using LabelEncoder
label_encoders = {}
for column in ['garage type', 'housing category']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le




In [92]:
df

Unnamed: 0,zip,year built,garage type,sqft,buyers fee,listing price,housing category,HOA yearly
0,97123,2023,0,3049.0,2,519900,2,186
1,97123,2023,8,3049.0,2,729900,2,91
2,97123,2023,8,5227.0,2,689900,2,147
3,97123,2023,8,5227.0,2,499900,3,186
4,97123,2023,11,3049.0,2,599900,2,147
...,...,...,...,...,...,...,...,...
982,98405,1909,33,4874.0,2.5,335000,2,0
983,98501,1946,29,8712.0,2.5,295000,2,0
984,98501,1921,4,6895.0,2.5,299000,2,0
985,98501,1924,4,6969.0,2.5,225000,2,0


In [None]:
# Splitting data into training and testing sets
X = df.drop('listing price', axis=1)  # All columns except target variable
y = df['listing price']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [91]:

# Modeling

# Initialize Decision Tree Regressor
tree = DecisionTreeRegressor(random_state=42)

# Train the model
tree.fit(X_train, y_train)

# Predict using the test set
y_pred = tree.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 1724480792421.2524
