In [20]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

my_dir = os.path.dirname(os.path.abspath('__file__'))
os.chdir(my_dir)

data = 'data/cleaned data/zillow_listings_combined_clean.csv'

In [21]:
df = pd.read_csv(data)

In [22]:
df.head()

Unnamed: 0,city,state,zip,year built,cooling type,garage type,sqft,price per sqft,buyers fee,listing price,housing category,HOA yearly
0,Portland,OR,97233,1983,Central air,2 Attached garage spaces,7840.0,340,2.5,402500,single family,0.0
1,Portland,OR,97225,1964,Central air,2 Attached garage spaces,,312,2.5,695000,single family,0.0
2,Portland,OR,97215,1913,Window unit(s),1 Garage space,3920.0,358,2.5,575000,single family,0.0
3,Portland,OR,97229,1957,Central air,2 Garage spaces,,312,2.5,1000000,single family,0.0
4,Portland,OR,97218,1928,Heat pump,Open parking,4791.0,280,2.5,295000,single family,0.0


In [23]:
categorical_columns = ['city', 
                       'state', 
                       'cooling type', 
                       'garage type', 
                       'housing category']

# Perform one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)


In [25]:
df.describe()

Unnamed: 0,zip,year built,sqft,price per sqft,listing price,HOA yearly
count,318.0,318.0,206.0,318.0,318.0,318.0
mean,97280.361635,1969.839623,5916.873786,316.852201,645532.2,365.880503
std,74.301021,37.279354,2053.418124,95.42715,430384.7,1633.588436
min,97201.0,1880.0,1200.0,37.0,49000.0,0.0
25%,97213.0,1941.0,4791.0,254.25,400625.0,0.0
50%,97266.0,1974.0,5930.0,306.5,537450.0,0.0
75%,97306.0,2001.0,7405.0,370.25,719675.0,0.0
max,97408.0,2023.0,10698.0,668.0,4200000.0,18456.0


In [26]:
df_encoded.describe()

Unnamed: 0,zip,year built,sqft,price per sqft,listing price,HOA yearly
count,318.0,318.0,206.0,318.0,318.0,318.0
mean,97280.361635,1969.839623,5916.873786,316.852201,645532.2,365.880503
std,74.301021,37.279354,2053.418124,95.42715,430384.7,1633.588436
min,97201.0,1880.0,1200.0,37.0,49000.0,0.0
25%,97213.0,1941.0,4791.0,254.25,400625.0,0.0
50%,97266.0,1974.0,5930.0,306.5,537450.0,0.0
75%,97306.0,2001.0,7405.0,370.25,719675.0,0.0
max,97408.0,2023.0,10698.0,668.0,4200000.0,18456.0


In [31]:
nan_rows = df_encoded.isna().any(axis=1)
print(df_encoded[nan_rows])

       zip  year built  sqft  price per sqft buyers fee  listing price   
1    97225        1964   NaN             312       2.5          695000  \
3    97229        1957   NaN             312       2.5         1000000   
6    97215        1927   NaN             337      2.25          750000   
10   97229        1969   NaN             381       2.5          800000   
12   97202        1955   NaN             504       2.5         1295000   
..     ...         ...   ...             ...        ...            ...   
309  97333        1953   NaN             595       2.5          709999   
312  97330        2016   NaN             350       2.5         1825000   
313  97330        1990   NaN             319      2.70          958500   
315  97333        1935   NaN             553       2.5         1350000   
316  97330        1967   NaN             470       2.5         1200000   

     HOA yearly  city_Eugene  city_Portland  city_Salem  ...   
1           0.0        False           True    

In [32]:
df_encoded[nan_rows]

Unnamed: 0,zip,year built,sqft,price per sqft,buyers fee,listing price,HOA yearly,city_Eugene,city_Portland,city_Salem,...,garage type_No data,garage type_No garage,garage type_Off street,"garage type_Off street, secured","garage type_Off street, secured, rv boat storage, garage door opener",garage type_Open parking,garage type_Secured,housing category_manufactured,housing category_single family,housing category_townhouse
1,97225,1964,,312,2.5,695000,0.0,False,True,False,...,False,False,False,False,False,False,False,False,True,False
3,97229,1957,,312,2.5,1000000,0.0,False,True,False,...,False,False,False,False,False,False,False,False,True,False
6,97215,1927,,337,2.25,750000,0.0,False,True,False,...,False,False,False,False,False,False,False,False,True,False
10,97229,1969,,381,2.5,800000,0.0,False,True,False,...,False,False,False,False,False,False,False,False,True,False
12,97202,1955,,504,2.5,1295000,0.0,False,True,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,97333,1953,,595,2.5,709999,0.0,False,False,False,...,False,False,False,False,False,False,False,False,True,False
312,97330,2016,,350,2.5,1825000,0.0,False,False,False,...,False,False,False,False,False,False,False,False,True,False
313,97330,1990,,319,2.70,958500,0.0,False,False,False,...,False,False,False,False,False,False,False,False,True,False
315,97333,1935,,553,2.5,1350000,0.0,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [34]:
df_encoded = df_encoded.astype(float)  # Convert to float
df_encoded = df_encoded.fillna(0)  # Fill NaN values with 0
df_encoded = df_encoded.astype(int) 

In [35]:
df_encoded.head()

Unnamed: 0,zip,year built,sqft,price per sqft,buyers fee,listing price,HOA yearly,city_Eugene,city_Portland,city_Salem,...,garage type_No data,garage type_No garage,garage type_Off street,"garage type_Off street, secured","garage type_Off street, secured, rv boat storage, garage door opener",garage type_Open parking,garage type_Secured,housing category_manufactured,housing category_single family,housing category_townhouse
0,97233,1983,7840,340,2,402500,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,97225,1964,0,312,2,695000,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,97215,1913,3920,358,2,575000,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,97229,1957,0,312,2,1000000,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,97218,1928,4791,280,2,295000,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0


In [36]:

# Assuming you have a 'target' column containing the target labels
X = df_encoded.drop('listing price', axis=1)  # Features
y = df_encoded['listing price']  # Target labels

# Initialize the Random Forest classifier
random_forest = RandomForestClassifier(random_state=42)

# Perform cross-validation with 5 folds
cv_scores = cross_val_score(random_forest, X, y, cv=5)

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())




Cross-Validation Scores: [0.03125    0.03125    0.         0.03174603 0.04761905]
Mean Accuracy: 0.028373015873015873
