# Housing Market Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Data preprocessing

In [2]:
# Load data into dataframe
df = pd.read_csv('NY-House-Dataset.csv')

In [3]:
# Drop irrelevant columns
df = df.drop(columns=['LONGITUDE', 'LATITUDE','LONG_NAME','STREET_NAME','ADMINISTRATIVE_AREA_LEVEL_2','FORMATTED_ADDRESS','MAIN_ADDRESS','BROKERTITLE','LOCALITY'])
df['TYPE'] = df['TYPE'].replace('Condop for sale', 'Condo for sale')


In [4]:
# Replace mean values with nan
df.loc[(df['BATH'] > 2) & (df['BATH'] < 3), 'BATH'] = pd.NA
df.loc[(df['PROPERTYSQFT'] > 2184) & (df['PROPERTYSQFT'] < 2185), 'PROPERTYSQFT'] = pd.NA

In [12]:
# Prepare dataframe to predict number of bathrooms
df_bath = df.drop(columns=['PROPERTYSQFT'])
df_sqft = df.drop(columns=['BATH'])

Unnamed: 0,TYPE,PRICE,BEDS,PROPERTYSQFT,ADDRESS,STATE,SUBLOCALITY
0,Condo for sale,315000,2,1400.0,2 E 55th St Unit 803,"New York, NY 10022",Manhattan
1,Condo for sale,195000000,7,17545.0,Central Park Tower Penthouse-217 W 57th New Yo...,"New York, NY 10019",New York County
2,House for sale,260000,4,2015.0,620 Sinclair Ave,"Staten Island, NY 10312",Richmond County
3,Condo for sale,69000,3,445.0,2 E 55th St Unit 908W33,"Manhattan, NY 10022",New York County
4,Townhouse for sale,55000000,7,14175.0,5 E 64th St,"New York, NY 10065",New York County
...,...,...,...,...,...,...,...
4796,Co-op for sale,599000,1,,222 E 80th St Apt 3A,"Manhattan, NY 10075",New York
4797,Co-op for sale,245000,1,,97-40 62 Dr Unit Lg,"Rego Park, NY 11374",Queens County
4798,Co-op for sale,1275000,1,,427 W 21st St Unit Garden,"New York, NY 10011",New York County
4799,Condo for sale,598125,2,655.0,91-23 Corona Ave Unit 4G,"Elmhurst, NY 11373",Queens


In [6]:
unique_values = df['TYPE'].unique()
unique_values

array(['Condo for sale', 'House for sale', 'Townhouse for sale',
       'Co-op for sale', 'Multi-family home for sale', 'For sale',
       'Contingent', 'Land for sale', 'Foreclosure', 'Pending',
       'Coming Soon', 'Mobile house for sale'], dtype=object)

In [7]:
#Encode categorical values
encoder = OrdinalEncoder(categories=[['Manhattan', 'New York County', 'Richmond County', 'Kings County',
       'New York', 'East Bronx', 'Brooklyn', 'The Bronx', 'Queens',
       'Staten Island', 'Queens County', 'Bronx County', 'Coney Island',
       'Brooklyn Heights', 'Jackson Heights', 'Riverdale', 'Rego Park',
       'Fort Hamilton', 'Flushing', 'Dumbo', 'Snyder Avenue']])
df_bath['SUBLOCALITY_ENCODED'] = encoder.fit_transform(df[['SUBLOCALITY']])

encoder = OrdinalEncoder(categories=[['Condo for sale', 'House for sale', 'Townhouse for sale',
       'Co-op for sale', 'Multi-family home for sale', 'For sale',
       'Contingent', 'Land for sale', 'Foreclosure', 'Pending',
       'Coming Soon', 'Mobile house for sale']])
df_bath['TYPE_ENCODED'] = encoder.fit_transform(df[['TYPE']])

df_bath = df_bath.drop(columns=['SUBLOCALITY','STATE','ADDRESS','TYPE'])
df_bath


Unnamed: 0,PRICE,BEDS,BATH,SUBLOCALITY_ENCODED,TYPE_ENCODED
0,315000,2,2.0,0.0,0.0
1,195000000,7,10.0,1.0,0.0
2,260000,4,2.0,2.0,1.0
3,69000,3,1.0,1.0,0.0
4,55000000,7,,1.0,2.0
...,...,...,...,...,...
4796,599000,1,1.0,4.0,3.0
4797,245000,1,1.0,10.0,3.0
4798,1275000,1,1.0,1.0,3.0
4799,598125,2,1.0,8.0,0.0


# Model Training

In [8]:
# Separate rows with NaN values
df_bath_nan = df_bath[df_bath.isna().any(axis=1)]

# Separate rows without NaN values
df_bath_without_nan = df_bath[df_bath.notna().all(axis=1)]

# KNN Model

In [9]:
# Split into features (X) and target variable (y)
X = df_bath_without_nan.drop(columns=['BATH'])
y = df_bath_without_nan['BATH']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN classifier with k=5
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test)
y_pred

f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted')
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.5733333333333334
F1 Score: 0.5461869513677541
Recall: 0.5733333333333334
Precision: 0.5366242112431929


# Random Forest

In [10]:
# Split into features (X) and target variable (y)
X = df_bath_without_nan.drop(columns=['BATH'])
y = df_bath_without_nan['BATH']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
y_pred

f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted')
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.6577777777777778
F1 Score: 0.6593987039716467
Recall: 0.6577777777777778
Precision: 0.6654351256647547


In [11]:
# Split into features (X) and target variable (y)
X = df_bath_without_nan.drop(columns=['BATH'])
y = df_bath_without_nan['BATH']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model's accuracy
f1 = f1_score(y_test, y_pred,average='weighted', zero_division= np.nan)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred,average='weighted', zero_division= np.nan)
precision = precision_score(y_test, y_pred,average='weighted', zero_division= np.nan)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 0.6133333333333333
F1 Score: 0.5848345244676091
Recall: 0.6133333333333333
Precision: 0.6116510687728469
