In [21]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import tensorflow
import mlxtend
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
from tensorflow import keras
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from keras import layers
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [22]:
data = pd.read_csv('realtor-data.zip.csv')

data.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0


Removing the irrelevant data columns that won't be used in the model

In [23]:
clean_dataV1 = data.drop(columns = ["status", "zip_code", "prev_sold_date"])
clean_dataV1.describe

<bound method NDFrame.describe of          bed  bath  acre_lot          city        state  house_size     price
0        3.0   2.0      0.12      Adjuntas  Puerto Rico       920.0  105000.0
1        4.0   2.0      0.08      Adjuntas  Puerto Rico      1527.0   80000.0
2        2.0   1.0      0.15    Juana Diaz  Puerto Rico       748.0   67000.0
3        4.0   2.0      0.10         Ponce  Puerto Rico      1800.0  145000.0
4        6.0   2.0      0.05      Mayaguez  Puerto Rico         NaN   65000.0
...      ...   ...       ...           ...          ...         ...       ...
1004961  2.0   1.0       NaN    Bronxville     New York       950.0  249000.0
1004962  3.0   2.0      0.14  Port Chester     New York      1806.0  599000.0
1004963  4.0   4.0      0.36    Park Ridge   New Jersey         NaN  749000.0
1004964  3.0   2.0      0.21       Yonkers     New York      1797.0  640000.0
1004965  NaN   1.0       NaN      Flushing     New York         NaN  226000.0

[1004966 rows x 7 columns]>

Going to check for any inconsistencies or empty data entries in the dataframe

In [24]:
missing_values_count = clean_dataV1.isnull().sum()

# of missing points in the first ten columns
missing_values_count[0:10]

bed           140531
bath          122850
acre_lot      295046
city              75
state              0
house_size    323284
price             71
dtype: int64

Removing all the rows with incomplete data entries

In [25]:
clean_dataV1.dropna(inplace=True)
clean_dataV1.describe

<bound method NDFrame.describe of          bed  bath  acre_lot           city        state  house_size     price
0        3.0   2.0      0.12       Adjuntas  Puerto Rico       920.0  105000.0
1        4.0   2.0      0.08       Adjuntas  Puerto Rico      1527.0   80000.0
2        2.0   1.0      0.15     Juana Diaz  Puerto Rico       748.0   67000.0
3        4.0   2.0      0.10          Ponce  Puerto Rico      1800.0  145000.0
5        4.0   3.0      0.46  San Sebastian  Puerto Rico      2520.0  179000.0
...      ...   ...       ...            ...          ...         ...       ...
1004958  2.0   2.0      0.05      Tarrytown     New York      1337.0  505000.0
1004959  2.0   1.0      0.14   White Plains     New York       730.0  332500.0
1004960  3.0   2.0      0.11      Scarsdale     New York      1578.0  699000.0
1004962  3.0   2.0      0.14   Port Chester     New York      1806.0  599000.0
1004964  3.0   2.0      0.21        Yonkers     New York      1797.0  640000.0

[461371 rows x 7 

left with a little less than half of our original data 

In [26]:
missing_values_count = clean_dataV1.isnull().sum()

# of missing points in the first ten columns
missing_values_count[0:10]

bed           0
bath          0
acre_lot      0
city          0
state         0
house_size    0
price         0
dtype: int64

Data looks good now, no more empty rows 

In [27]:
clean_dataV1.city.unique().size

2305

In [28]:
clean_dataV1.state.unique().size

14

Taking note of how many different cities and states are in our data. Am going to use these numbers to translate the names of citeies and states into numbers that the model can understand

In [29]:
clean_dataV1.city.unique()

array(['Adjuntas', 'Juana Diaz', 'Ponce', ..., 'Clintondale',
       'Grahamsville', 'Fallsburg'], dtype=object)

In [30]:
clean_dataV1.state.unique()

array(['Puerto Rico', 'Virgin Islands', 'Massachusetts', 'Connecticut',
       'New Jersey', 'New York', 'New Hampshire', 'Vermont',
       'Rhode Island', 'Wyoming', 'Maine', 'Pennsylvania',
       'West Virginia', 'Delaware'], dtype=object)

In [31]:
states = [14]
states = clean_dataV1.state.unique()
o = 0

for x in states:
    clean_dataV1.state[clean_dataV1.state == x] =  o 
    o+= 1

In [32]:
cities = [2305]
cities = clean_dataV1.city.unique()
o = 0

for x in cities:
    clean_dataV1.city[clean_dataV1.city == x] =  o 
    o+= 1

In [33]:
clean_dataV1.state.unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype=object)

In [34]:
clean_dataV1.city.unique()

array([0, 1, 2, ..., 2302, 2303, 2304], dtype=object)

Now all of the cities and states in the data each  correspond to a unique number

In [35]:
clean_dataV1.describe

<bound method NDFrame.describe of          bed  bath  acre_lot  city state  house_size     price
0        3.0   2.0      0.12     0     0       920.0  105000.0
1        4.0   2.0      0.08     0     0      1527.0   80000.0
2        2.0   1.0      0.15     1     0       748.0   67000.0
3        4.0   2.0      0.10     2     0      1800.0  145000.0
5        4.0   3.0      0.46     3     0      2520.0  179000.0
...      ...   ...       ...   ...   ...         ...       ...
1004958  2.0   2.0      0.05  1627     5      1337.0  505000.0
1004959  2.0   1.0      0.14  1666     5       730.0  332500.0
1004960  3.0   2.0      0.11  1667     5      1578.0  699000.0
1004962  3.0   2.0      0.14  2238     5      1806.0  599000.0
1004964  3.0   2.0      0.21  1490     5      1797.0  640000.0

[461371 rows x 7 columns]>

In [36]:
#Scaling data form 0- 1 

Scaled_Clean_Data = minmax_scaling(clean_dataV1,columns=['bed','bath','acre_lot', "city", "state", "house_size"])

# Setting up the predicition input and results as X and y
y = clean_dataV1.price

X = Scaled_Clean_Data
X.describe

<bound method NDFrame.describe of               bed      bath      acre_lot      city     state  house_size
0        0.020408  0.005076  1.200000e-06  0.000000  0.000000    0.000550
1        0.030612  0.005076  8.000000e-07  0.000000  0.000000    0.000969
2        0.010204  0.000000  1.500000e-06  0.000434  0.000000    0.000432
3        0.030612  0.005076  1.000000e-06  0.000868  0.000000    0.001157
5        0.030612  0.010152  4.600000e-06  0.001302  0.000000    0.001654
...           ...       ...           ...       ...       ...         ...
1004958  0.010204  0.005076  5.000000e-07  0.706163  0.384615    0.000838
1004959  0.010204  0.000000  1.400000e-06  0.723090  0.384615    0.000419
1004960  0.020408  0.005076  1.100000e-06  0.723524  0.384615    0.001004
1004962  0.020408  0.005076  1.400000e-06  0.971354  0.384615    0.001161
1004964  0.020408  0.005076  2.100000e-06  0.646701  0.384615    0.001155

[461371 rows x 6 columns]>

Rescaled all of the data values so that different features like the house size don't  skew the models predictions

In [37]:
#Splits data into Training and testing data sets

train_X, val_X, train_y, val_y = train_test_split(X,y,random_state=0)
train_X.head

<bound method NDFrame.head of              bed      bath      acre_lot      city     state  house_size
918629  0.030612  0.020305  1.700000e-05  0.566840  0.384615    0.004583
916152  0.030612  0.005076  9.000000e-07  0.976997  0.384615    0.000931
633261  0.020408  0.010152  1.100000e-06  0.815104  0.307692    0.001370
821544  0.071429  0.010152  2.000000e-07  0.592014  0.384615    0.001685
329623  0.020408  0.010152  1.010000e-04  0.127170  0.538462    0.001367
...          ...       ...           ...       ...       ...         ...
244367  0.030612  0.015228  1.260000e-05  0.133681  0.461538    0.002308
953593  0.040816  0.030457  1.280000e-05  0.721354  0.384615    0.004816
199843  0.030612  0.005076  1.100000e-06  0.263021  0.153846    0.001171
948694  0.030612  0.010152  8.000000e-07  0.728733  0.384615    0.001978
571398  0.010204  0.010152  2.600000e-04  0.623264  0.307692    0.001157

[346028 rows x 6 columns]>

Split the data up into to use for testing training. Split was done randomly to try and remove bias that may be in the data

In [38]:
Algo_model = RandomForestRegressor(random_state=1)

Using a random forest model to get more accurate predicitons

In [39]:
Algo_model.fit(train_X, train_y)

In [None]:
Algo_model.predict(val_X)

array([825000., 335000., 850000., ..., 685000., 359900., 899000.])

In [None]:
predicted_home_prices = Algo_model.predict(val_X)
mean_absolute_error(val_y, predicted_home_prices)

17848.11507305112

After training the model, we get an average guess that is within $180,000 of the acutal price of the home