# Importing Necessary Libraries

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Data Preprocessing

In [19]:
# Read the raw listings file into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer='NY-House-Dataset.csv')

In [20]:
# Remove the columns treated as irrelevant for this preprocessing step,
# then fold the 'Condop for sale' listing type into 'Condo for sale'.
df = df.drop(
    labels=[
        'LONGITUDE', 'LATITUDE', 'LONG_NAME', 'STREET_NAME',
        'ADMINISTRATIVE_AREA_LEVEL_2', 'FORMATTED_ADDRESS', 'MAIN_ADDRESS',
        'BROKERTITLE', 'LOCALITY',
    ],
    axis='columns',
)
df['TYPE'] = df['TYPE'].replace({'Condop for sale': 'Condo for sale'})


In [21]:
# Mark mean-valued entries as missing: BATH values strictly between 2 and 3
# and PROPERTYSQFT values strictly between 2184 and 2185 are overwritten
# with pd.NA.
# NOTE(review): presumably these ranges isolate the column means that were
# filled in upstream; any genuine fractional BATH in (2, 3), e.g. 2.5, is
# cleared by the same check - confirm that is intended.
bath_is_placeholder = df['BATH'].gt(2) & df['BATH'].lt(3)
sqft_is_placeholder = df['PROPERTYSQFT'].gt(2184) & df['PROPERTYSQFT'].lt(2185)
df.loc[bath_is_placeholder, 'BATH'] = pd.NA
df.loc[sqft_is_placeholder, 'PROPERTYSQFT'] = pd.NA

In [22]:
# Build two working frames from `df`:
#   df_bath - used to predict the number of bathrooms (PROPERTYSQFT removed)
#   df_sqft - the counterpart that keeps PROPERTYSQFT (BATH removed)
df_bath = df.drop(labels='PROPERTYSQFT', axis='columns')
df_sqft = df.drop(labels='BATH', axis='columns')

In [23]:
# List the distinct values found in the TYPE column
unique_values = pd.unique(df['TYPE'])
unique_values

array(['Condo for sale', 'House for sale', 'Townhouse for sale',
       'Co-op for sale', 'Multi-family home for sale', 'For sale',
       'Contingent', 'Land for sale', 'Foreclosure', 'Pending',
       'Coming Soon', 'Mobile house for sale'], dtype=object)

In [24]:
# Encode the categorical columns as ordinal codes.
# Each list below fixes the category order handed to OrdinalEncoder: a label's
# position in its list is the code it receives. With the encoder's default
# settings, a label missing from the list makes fit_transform raise.
sublocality_order = [
    'Manhattan', 'New York County', 'Richmond County', 'Kings County',
    'New York', 'East Bronx', 'Brooklyn', 'The Bronx', 'Queens',
    'Staten Island', 'Queens County', 'Bronx County', 'Coney Island',
    'Brooklyn Heights', 'Jackson Heights', 'Riverdale', 'Rego Park',
    'Fort Hamilton', 'Flushing', 'Dumbo', 'Snyder Avenue',
]
type_order = [
    'Condo for sale', 'House for sale', 'Townhouse for sale',
    'Co-op for sale', 'Multi-family home for sale', 'For sale',
    'Contingent', 'Land for sale', 'Foreclosure', 'Pending',
    'Coming Soon', 'Mobile house for sale',
]

# The codes are computed from `df` and written to both working frames.
# `encoder` is left bound to the TYPE encoder once the loop finishes.
for source_column, encoded_column, category_order in (
    ('SUBLOCALITY', 'SUBLOCALITY_ENCODED', sublocality_order),
    ('TYPE', 'TYPE_ENCODED', type_order),
):
    encoder = OrdinalEncoder(categories=[category_order])
    encoded_values = encoder.fit_transform(df[[source_column]])
    df_bath[encoded_column] = encoded_values
    df_sqft[encoded_column] = encoded_values

# Drop the text columns from both frames; SUBLOCALITY and TYPE now have
# encoded counterparts, while STATE and ADDRESS are removed without encoding.
raw_columns = ['SUBLOCALITY', 'STATE', 'ADDRESS', 'TYPE']
df_bath = df_bath.drop(columns=raw_columns)
df_sqft = df_sqft.drop(columns=raw_columns)
df_bath

Unnamed: 0,PRICE,BEDS,BATH,SUBLOCALITY_ENCODED,TYPE_ENCODED
0,315000,2,2.0,0.0,0.0
1,195000000,7,10.0,1.0,0.0
2,260000,4,2.0,2.0,1.0
3,69000,3,1.0,1.0,0.0
4,55000000,7,,1.0,2.0
...,...,...,...,...,...
4796,599000,1,1.0,4.0,3.0
4797,245000,1,1.0,10.0,3.0
4798,1275000,1,1.0,1.0,3.0
4799,598125,2,1.0,8.0,0.0


In [25]:
# Display df_sqft after the encoding step for a visual check
df_sqft

Unnamed: 0,PRICE,BEDS,PROPERTYSQFT,SUBLOCALITY_ENCODED,TYPE_ENCODED
0,315000,2,1400.0,0.0,0.0
1,195000000,7,17545.0,1.0,0.0
2,260000,4,2015.0,2.0,1.0
3,69000,3,445.0,1.0,0.0
4,55000000,7,14175.0,1.0,2.0
...,...,...,...,...,...
4796,599000,1,,4.0,3.0
4797,245000,1,,10.0,3.0
4798,1275000,1,,1.0,3.0
4799,598125,2,655.0,8.0,0.0


# Exporting Preprocessed Dataset

In [26]:
# Write both preprocessed frames to CSV files (relative paths), leaving out
# the DataFrame index.
for frame, csv_name in (
    (df_bath, 'preprocessed_dataset_bath.csv'),
    (df_sqft, 'preprocessed_dataset_sqft.csv'),
):
    frame.to_csv(csv_name, index=False)