In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the realtor listings CSV from the working directory, then preview the
# first five rows to sanity-check the parse.
df = pd.read_csv(filepath_or_buffer='realtor-data.csv')
df.head(n=5)

Unnamed: 0,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size,sold_date
0,for_sale,105000.0,3.0,2.0,0.12,"Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601",Sector Yahuecas Titulo # V84,Adjuntas,Puerto Rico,601.0,920.0,
1,for_sale,80000.0,4.0,2.0,0.08,"Km 78 9 Carr # 135, Adjuntas, PR, 00601",Km 78 9 Carr # 135,Adjuntas,Puerto Rico,601.0,1527.0,
2,for_sale,67000.0,2.0,1.0,0.15,"556G 556-G 16 St, Juana Diaz, PR, 00795",556G 556-G 16 St,Juana Diaz,Puerto Rico,795.0,748.0,
3,for_sale,145000.0,4.0,2.0,0.1,"R5 Comunidad El Paraso Calle De Oro R-5 Ponce,...",R5 Comunidad El Paraso Calle De Oro R-5 Ponce,Ponce,Puerto Rico,731.0,1800.0,
4,for_sale,65000.0,6.0,2.0,0.05,"14 Navarro, Mayaguez, PR, 00680",14 Navarro,Mayaguez,Puerto Rico,680.0,,


In [3]:
# (rows, columns) of the frame exactly as loaded, before any cleaning.
df.shape

(512159, 12)

In [4]:
# Number of missing values in each column.
df.isna().sum()

status               0
price                0
bed              98937
bath             95218
acre_lot        104979
full_address         0
street             616
city                59
state                0
zip_code           197
house_size      116466
sold_date       309652
dtype: int64

In [5]:
# Discard the columns that are not used as model features.
# Rebinding `df` with errors='ignore' keeps this cell safe to re-run: the
# previous in-place drop raised KeyError on a second execution because the
# columns were already gone.
df = df.drop(columns=['sold_date', 'full_address', 'street'], errors='ignore')

# Missing-value counts for the columns that remain.
df.isnull().sum()

status             0
price              0
bed            98937
bath           95218
acre_lot      104979
city              59
state              0
zip_code         197
house_size    116466
dtype: int64

In [6]:
# Fraction of missing values per column (null count / row count), sorted so
# the most incomplete columns come first.
df2 = (
    df.isnull()
    .pipe(lambda mask: mask.sum() / mask.count())
    .sort_values(ascending=False)
)
df2.head(20)

house_size    0.227402
acre_lot      0.204973
bed           0.193176
bath          0.185915
zip_code      0.000385
city          0.000115
status        0.000000
price         0.000000
state         0.000000
dtype: float64

In [7]:
# Remove every row that still contains at least one missing value (mutates
# `df` in place), then report how many rows/columns are left.
df.dropna(axis=0, how='any', inplace=True)
df.shape

(294389, 9)

In [8]:
# Column labels that survive the drop/dropna steps above.
df.columns

Index(['status', 'price', 'bed', 'bath', 'acre_lot', 'city', 'state',
       'zip_code', 'house_size'],
      dtype='object')

In [9]:
# Integer-encode the string-valued columns so the estimators can consume them.
cart = ['status', 'city', 'state']

# Keep one fitted encoder per column. Reusing a single LabelEncoder (as before)
# overwrote its classes_ on every iteration, so the mappings for all but the
# last column were lost and could not be applied to new inputs or inverted.
encoders = {}
for col in cart:  # `col`, not `x`: `x` is the feature-matrix name used later
    enc = LabelEncoder()
    df[col] = enc.fit_transform(df[col])
    encoders[col] = enc
# After the loop `enc` is still the encoder fitted on the last column in
# `cart`, which matches what the name held previously.
df

Unnamed: 0,status,price,bed,bath,acre_lot,city,state,zip_code,house_size
0,0,105000.0,3.0,2.0,0.12,10,7,601.0,920.0
1,0,80000.0,4.0,2.0,0.08,10,7,601.0,1527.0
2,0,67000.0,2.0,1.0,0.15,656,7,795.0,748.0
3,0,145000.0,4.0,2.0,0.10,1070,7,731.0,1800.0
5,0,179000.0,4.0,3.0,0.46,1190,7,612.0,2520.0
...,...,...,...,...,...,...,...,...,...
512149,0,1598000.0,5.0,3.0,0.05,169,5,11209.0,2064.0
512150,0,849999.0,9.0,3.0,0.06,1301,5,10301.0,3137.0
512152,0,1295000.0,6.0,6.0,0.02,913,5,11207.0,3300.0
512154,0,980000.0,3.0,2.0,0.05,366,5,11369.0,1462.0


In [10]:
# The listing price is the regression target; every other column is a feature.
y = df['price']
x = df.drop(columns=['price'])

In [11]:
# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# split (and therefore every metric reported below) reproducible across
# kernel restarts; without it each run shuffles differently.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [13]:
# NOTE(review): this import belongs with the other imports in the first cell.
from sklearn.pipeline import Pipeline

# One "standardise, then fit" pipeline per candidate estimator. The step
# names ('scale', 'model') are kept so existing lookups by name keep working.
model1 = Pipeline([('scale', StandardScaler()), ('model', LinearRegression())])
model2 = Pipeline([('scale', StandardScaler()), ('model', Lasso())])
model3 = Pipeline([('scale', StandardScaler()), ('model', Ridge())])
# The tree-based estimators are randomised; seed them so repeated runs give
# the same fitted model and the same scores.
model4 = Pipeline([
    ('scale', StandardScaler()),
    ('model', DecisionTreeRegressor(random_state=42)),
])
model5 = Pipeline([
    ('scale', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42)),
])

In [14]:
# Fit the scaler + LinearRegression pipeline on the training split.
model1.fit(xtrain,ytrain)

Pipeline(steps=[('scale', StandardScaler()), ('model', LinearRegression())])

In [15]:
# Score the linear-regression pipeline on the held-out split.
# Printed one per line, in order: MAE, MAPE, R^2.
pred1 = model1.predict(xtest)
for metric in (mean_absolute_error, mean_absolute_percentage_error, r2_score):
    print(metric(ytest, pred1))

424707.5809549359
0.9553951913938262
0.2287875038257915


In [16]:
# Fit the Lasso pipeline and score it on the held-out split.
# Printed one per line, in order: MAE, MAPE, R^2.
pred1 = model2.fit(xtrain, ytrain).predict(xtest)
for metric in (mean_absolute_error, mean_absolute_percentage_error, r2_score):
    print(metric(ytest, pred1))

424707.7500398831
0.9553963648545432
0.22878722679242447


In [17]:
# Fit the Ridge pipeline and score it on the held-out split.
# Printed one per line, in order: MAE, MAPE, R^2.
pred1 = model3.fit(xtrain, ytrain).predict(xtest)
for metric in (mean_absolute_error, mean_absolute_percentage_error, r2_score):
    print(metric(ytest, pred1))

424707.9051557619
0.95539700301369
0.22878723803094336


In [18]:
# Fit the decision-tree pipeline and score it on the held-out split.
# Printed one per line, in order: MAE, MAPE, R^2.
# NOTE(review): a near-zero MAPE next to a much lower R^2 may point to
# duplicated listings on both sides of the split -- confirm with
# df.duplicated().sum() before trusting these numbers.
pred1 = model4.fit(xtrain, ytrain).predict(xtest)
for metric in (mean_absolute_error, mean_absolute_percentage_error, r2_score):
    print(metric(ytest, pred1))

18957.6177737059
0.013420881920126412
0.6756813006566957


In [19]:
# Fit the random-forest pipeline and score it on the held-out split.
# Printed one per line, in order: MAE, MAPE, R^2.
pred1 = model5.fit(xtrain, ytrain).predict(xtest)
for metric in (mean_absolute_error, mean_absolute_percentage_error, r2_score):
    print(metric(ytest, pred1))

18308.118555787038
0.016521771370132964
0.9245918298048372


In [21]:
# Peek at the feature frame to recall the column order the models were fit on.
x.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size
0,0,3.0,2.0,0.12,10,7,601.0,920.0
1,0,4.0,2.0,0.08,10,7,601.0,1527.0
2,0,2.0,1.0,0.15,656,7,795.0,748.0
3,0,4.0,2.0,0.1,1070,7,731.0,1800.0
5,0,4.0,3.0,0.46,1190,7,612.0,2520.0


In [22]:
# Price one hand-built listing with the random-forest pipeline.
# The pipeline was fitted on a DataFrame, so pass a one-row DataFrame with
# the same column labels instead of a bare nested list: this keeps
# scikit-learn's feature-name check active and avoids its
# "X does not have valid feature names" warning.
# The status/city/state values are label-encoded integers typed in by hand.
sample = pd.DataFrame(
    [[1, 5, 6, 0.25, 11, 7, 731, 5000]],
    columns=xtrain.columns,
)
model5.predict(sample)



array([3404494.])

In [32]:
# Make sure `x` holds only feature columns before predicting on it.
# `x` gains a 'price' column only after the later cell that attaches the
# predictions has run; on a fresh kernel that column does not exist and an
# unconditional drop raises KeyError. errors='ignore' makes this a no-op then.
x.drop(columns='price', errors='ignore', inplace=True)

In [33]:
# Predict a price for every row of `x` with the decision-tree pipeline.
# `x` must contain only the feature columns used at fit time.
# NOTE(review): `x` includes the rows the model was trained on, so these
# predictions are not an out-of-sample estimate -- confirm this is intended.
outcome = model4.predict(x)

In [34]:
# Store the model predictions on the feature frame under the name 'price'.
# This mutates `x`: it now carries a non-feature column, and that column holds
# predicted (not observed) prices.
x['price'] = outcome
x

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,0,3.0,2.0,0.12,10,7,601.0,920.0,105000.0
1,0,4.0,2.0,0.08,10,7,601.0,1527.0,80000.0
2,0,2.0,1.0,0.15,656,7,795.0,748.0,67000.0
3,0,4.0,2.0,0.10,1070,7,731.0,1800.0,145000.0
5,0,4.0,3.0,0.46,1190,7,612.0,2520.0,179000.0
...,...,...,...,...,...,...,...,...,...
512149,0,5.0,3.0,0.05,169,5,11209.0,2064.0,1598000.0
512150,0,9.0,3.0,0.06,1301,5,10301.0,3137.0,849999.0
512152,0,6.0,6.0,0.02,913,5,11207.0,3300.0,1295000.0
512154,0,3.0,2.0,0.05,366,5,11369.0,1462.0,980000.0


In [26]:
# Display the cleaned, label-encoded frame (includes the observed 'price').
df

Unnamed: 0,status,price,bed,bath,acre_lot,city,state,zip_code,house_size
0,0,105000.0,3.0,2.0,0.12,10,7,601.0,920.0
1,0,80000.0,4.0,2.0,0.08,10,7,601.0,1527.0
2,0,67000.0,2.0,1.0,0.15,656,7,795.0,748.0
3,0,145000.0,4.0,2.0,0.10,1070,7,731.0,1800.0
5,0,179000.0,4.0,3.0,0.46,1190,7,612.0,2520.0
...,...,...,...,...,...,...,...,...,...
512149,0,1598000.0,5.0,3.0,0.05,169,5,11209.0,2064.0
512150,0,849999.0,9.0,3.0,0.06,1301,5,10301.0,3137.0
512152,0,1295000.0,6.0,6.0,0.02,913,5,11207.0,3300.0
512154,0,980000.0,3.0,2.0,0.05,366,5,11369.0,1462.0
