In [468]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
import datetime as dt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score

### Read realtor data

In [469]:

realtor_data = pd.read_csv(
    Path("realtor_data/2019_realtor_data.csv"))
display(realtor_data)

Unnamed: 0,sold_date,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size
0,2019-01-02,for_sale,3480000.0,6.0,7.0,0.19,"116 Bluff Rd, Fort Lee, NJ, 07024",116 Bluff Rd,Fort Lee,New Jersey,7024.0,
1,2019-01-02,for_sale,575000.0,4.0,3.0,0.13,"29 Village Rd, Clifton, NJ, 07013",29 Village Rd,Clifton,New Jersey,7013.0,
2,2019-01-02,for_sale,3480000.0,6.0,7.0,0.19,"116 Bluff Rd, Fort Lee, NJ, 07024",116 Bluff Rd,Fort Lee,New Jersey,7024.0,
3,2019-01-02,for_sale,435000.0,3.0,2.0,0.10,"192 Linwood Ave, Bogota, NJ, 07603",192 Linwood Ave,Bogota,New Jersey,7603.0,
4,2019-01-02,for_sale,349900.0,4.0,2.0,0.24,"16 Dorwin Dr, West Springfield, MA, 01089",16 Dorwin Dr,West Springfield,Massachusetts,1089.0,2002.0
...,...,...,...,...,...,...,...,...,...,...,...,...
24422,2019-12-31,for_sale,259900.0,3.0,2.0,0.18,"20 Russell Rd, Meriden, CT, 06450",20 Russell Rd,Meriden,Connecticut,6450.0,1312.0
24423,2019-12-31,for_sale,299999.0,,,0.18,"439 Edgewood St, Hartford, CT, 06112",439 Edgewood St,Hartford,Connecticut,6112.0,3922.0
24424,2019-12-31,for_sale,1598000.0,3.0,2.0,,"57 L St Apt 10, Boston, MA, 02127",57 L St Apt 10,Boston,Massachusetts,2127.0,1574.0
24425,2019-12-31,for_sale,1598000.0,3.0,2.0,,"57 L St Apt 10, Boston, MA, 02127",57 L St Apt 10,Boston,Massachusetts,2127.0,1574.0


# Prepare the realtor data

In [470]:
#realtor_data.set_index("sold_date", inplace = True)
realtor_data.sort_values(by=["sold_date"], ascending=True, inplace=True)
display(realtor_data)

Unnamed: 0,sold_date,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size
0,2019-01-02,for_sale,3480000.0,6.0,7.0,0.19,"116 Bluff Rd, Fort Lee, NJ, 07024",116 Bluff Rd,Fort Lee,New Jersey,7024.0,
112,2019-01-02,for_sale,575000.0,4.0,3.0,0.13,"29 Village Rd, Clifton, NJ, 07013",29 Village Rd,Clifton,New Jersey,7013.0,
113,2019-01-02,for_sale,3480000.0,6.0,7.0,0.19,"116 Bluff Rd, Fort Lee, NJ, 07024",116 Bluff Rd,Fort Lee,New Jersey,7024.0,
114,2019-01-02,for_sale,575000.0,7.0,3.0,0.08,"268 E 21st St, Paterson, NJ, 07513",268 E 21st St,Paterson,New Jersey,7513.0,
115,2019-01-02,for_sale,435000.0,3.0,2.0,0.10,"192 Linwood Ave, Bogota, NJ, 07603",192 Linwood Ave,Bogota,New Jersey,7603.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
24353,2019-12-31,for_sale,1598000.0,3.0,2.0,,"57 L St Apt 10, Boston, MA, 02127",57 L St Apt 10,Boston,Massachusetts,2127.0,1574.0
24352,2019-12-31,for_sale,449900.0,5.0,2.0,0.15,"1893 Park Ave, Bridgeport, CT, 06604",1893 Park Ave,Bridgeport,Connecticut,6604.0,1835.0
24351,2019-12-31,for_sale,299999.0,,,0.18,"439 Edgewood St, Hartford, CT, 06112",439 Edgewood St,Hartford,Connecticut,6112.0,3922.0
24361,2019-12-31,for_sale,349900.0,3.0,2.0,1.52,"29 Leslie Ln, Coventry, CT, 06238",29 Leslie Ln,Coventry,Connecticut,6238.0,1600.0


In [471]:
realtor_data = realtor_data.drop(columns=["full_address", "street", "city", "acre_lot", "status", "sold_date"])
display(realtor_data)

Unnamed: 0,price,bed,bath,state,zip_code,house_size
0,3480000.0,6.0,7.0,New Jersey,7024.0,
112,575000.0,4.0,3.0,New Jersey,7013.0,
113,3480000.0,6.0,7.0,New Jersey,7024.0,
114,575000.0,7.0,3.0,New Jersey,7513.0,
115,435000.0,3.0,2.0,New Jersey,7603.0,
...,...,...,...,...,...,...
24353,1598000.0,3.0,2.0,Massachusetts,2127.0,1574.0
24352,449900.0,5.0,2.0,Connecticut,6604.0,1835.0
24351,299999.0,,,Connecticut,6112.0,3922.0
24361,349900.0,3.0,2.0,Connecticut,6238.0,1600.0


In [472]:
mean_size = realtor_data["house_size"].mean()
mean_size

1951.489481725232

In [473]:
realtor_data["house_size"] = realtor_data["house_size"].fillna(mean_size)
display(realtor_data)

Unnamed: 0,price,bed,bath,state,zip_code,house_size
0,3480000.0,6.0,7.0,New Jersey,7024.0,1951.489482
112,575000.0,4.0,3.0,New Jersey,7013.0,1951.489482
113,3480000.0,6.0,7.0,New Jersey,7024.0,1951.489482
114,575000.0,7.0,3.0,New Jersey,7513.0,1951.489482
115,435000.0,3.0,2.0,New Jersey,7603.0,1951.489482
...,...,...,...,...,...,...
24353,1598000.0,3.0,2.0,Massachusetts,2127.0,1574.000000
24352,449900.0,5.0,2.0,Connecticut,6604.0,1835.000000
24351,299999.0,,,Connecticut,6112.0,3922.000000
24361,349900.0,3.0,2.0,Connecticut,6238.0,1600.000000


# Encode the dataset’s categorical variables using OneHotEncoder

In [474]:
# Create a list of categorical variables 
categorical_variables = list(realtor_data.dtypes[realtor_data.dtypes == "object"].index)
display(categorical_variables)

['state']

In [475]:
enc = OneHotEncoder(sparse=False)

In [476]:
encoded_data = enc.fit_transform(realtor_data[categorical_variables])

In [477]:
encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names_out(categorical_variables)
)

In [478]:
display(encoded_df)

Unnamed: 0,state_Connecticut,state_Delaware,state_Maine,state_Massachusetts,state_New Hampshire,state_New Jersey,state_New York,state_Pennsylvania,state_Puerto Rico,state_Rhode Island,state_Vermont
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
24422,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24423,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24424,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24425,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [479]:
df = realtor_data[['price', 'bed', 'bath', 'zip_code', 'house_size']].dropna()
display(df)

Unnamed: 0,price,bed,bath,zip_code,house_size
0,3480000.0,6.0,7.0,7024.0,1951.489482
112,575000.0,4.0,3.0,7013.0,1951.489482
113,3480000.0,6.0,7.0,7024.0,1951.489482
114,575000.0,7.0,3.0,7513.0,1951.489482
115,435000.0,3.0,2.0,7603.0,1951.489482
...,...,...,...,...,...
24354,1598000.0,3.0,2.0,2127.0,1574.000000
24353,1598000.0,3.0,2.0,2127.0,1574.000000
24352,449900.0,5.0,2.0,6604.0,1835.000000
24361,349900.0,3.0,2.0,6238.0,1600.000000


In [480]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
df.reset_index(inplace=True, drop=True)
encoded_realtor_df = pd.concat([df, encoded_df], axis=1).dropna()
check_nan = encoded_realtor_df.isnull().values.any()
display(encoded_realtor_df)

Unnamed: 0,price,bed,bath,zip_code,house_size,state_Connecticut,state_Delaware,state_Maine,state_Massachusetts,state_New Hampshire,state_New Jersey,state_New York,state_Pennsylvania,state_Puerto Rico,state_Rhode Island,state_Vermont
0,3480000.0,6.0,7.0,7024.0,1951.489482,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,575000.0,4.0,3.0,7013.0,1951.489482,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,3480000.0,6.0,7.0,7024.0,1951.489482,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,575000.0,7.0,3.0,7513.0,1951.489482,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,435000.0,3.0,2.0,7603.0,1951.489482,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22752,1598000.0,3.0,2.0,2127.0,1574.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
22753,1598000.0,3.0,2.0,2127.0,1574.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
22754,449900.0,5.0,2.0,6604.0,1835.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
22755,349900.0,3.0,2.0,6238.0,1600.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Apply Logistic Regression

## Split training into testing sets (Create X, or features DataFrame, and create y, or target DataFrame)

In [481]:
y = encoded_realtor_df['price'].values
#np.reshape(y, (1, -1))
display(y)


array([3480000.,  575000., 3480000., ...,  449900.,  349900.,  919000.])

In [482]:
X = encoded_realtor_df.drop(columns=['price']).values
#X = X[~np.isnan(X)]
#np.isnan(X.any())
#np.any(np.isnan(X))
display(X)

array([[6.000e+00, 7.000e+00, 7.024e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [4.000e+00, 3.000e+00, 7.013e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [6.000e+00, 7.000e+00, 7.024e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [5.000e+00, 2.000e+00, 6.604e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.000e+00, 2.000e+00, 6.238e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [4.000e+00, 2.000e+00, 7.086e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [483]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Use scikit-learn's StandardScaler to scale the features data.


In [484]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Create and use a classifier that can predict whether the house sold price price will be higher or lower

In [485]:
logistic_regression_model = LogisticRegression(random_state=1, max_iter = 1000)

## Fit: Train the Model by supplying it with some training it

In [486]:
logistic_regression_model.fit(X_train_scaled, y_train)

## Generate predictions from the model we just fit

In [487]:
# Make a prediction using the testing data
pred = logistic_regression_model.predict(X_test_scaled)

In [488]:
display(pred)

array([ 850000., 1995000., 1085000., ..., 1999000.,  760000.,  399000.])

In [489]:
pred_df = pd.DataFrame(pred)
display(pred_df)

Unnamed: 0,0
0,850000.0
1,1995000.0
2,1085000.0
3,599000.0
4,1349000.0
...,...
5685,9500000.0
5686,575000.0
5687,1999000.0
5688,760000.0


In [490]:
y = pd.DataFrame(y_test)
display(y)

Unnamed: 0,0
0,365000.0
1,925000.0
2,2125000.0
3,460000.0
4,549900.0
...,...
5685,9500000.0
5686,350000.0
5687,1999000.0
5688,950000.0


In [491]:
# Print the balanced_accuracy score of the model
accuracy_score(y_test, pred)

0.06994727592267136

## Apply the fitted model to the test dataset

In [492]:
results_df = pd.DataFrame({
    "Testing Data Predictions": pred,
    "Testing Data Actual Targets": y_test})
results_df

Unnamed: 0,Testing Data Predictions,Testing Data Actual Targets
0,850000.0,365000.0
1,1995000.0,925000.0
2,1085000.0,2125000.0
3,599000.0,460000.0
4,1349000.0,549900.0
...,...,...
5685,9500000.0,9500000.0
5686,575000.0,350000.0
5687,1999000.0,1999000.0
5688,760000.0,950000.0


# We evaluated the model predictions. If high accuracy (closer to 1) it may ean that there is overfitting which may mean that the model won't perform well on new data it was not trained on

# We can categorize the predictions on higher house prices or lower house prices according to a confusion matrix.