In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import set_config

In [4]:
#config to work with pandas
set_config(transform_output="pandas")

In [5]:
sacramento = pd.read_csv (r'/Users/evgeniakveliashvili/Desktop/DSI_projects/LCR/01_materials/notebooks/dataset/sacramento.csv')

In [6]:
sacramento

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,1 KENNELFORD CIR,SACRAMENTO,95823,CA,3,2,1144,Residential,Mon May 19 00:00:00 EDT 2008,200345,38.464520,-121.427606
1,10 SEA FOAM CT,SACRAMENTO,95831,CA,3,3,2052,Residential,Wed May 21 00:00:00 EDT 2008,415000,38.487885,-121.545947
2,100 CHELSEA CT,FOLSOM,95630,CA,3,2,1905,Residential,Mon May 19 00:00:00 EDT 2008,500000,38.694350,-121.177259
3,100 REBECCA WAY,FOLSOM,95630,CA,3,2,2185,Residential,Wed May 21 00:00:00 EDT 2008,344250,38.684790,-121.149199
4,100 TOURMALINE CIR,SACRAMENTO,95834,CA,5,3,3076,Residential,Mon May 19 00:00:00 EDT 2008,240000,38.634370,-121.510779
...,...,...,...,...,...,...,...,...,...,...,...,...
809,9880 IZILDA CT,SACRAMENTO,95829,CA,5,4,3863,Residential,Fri May 16 00:00:00 EDT 2008,598695,38.453260,-121.325730
810,993 MANTON CT,GALT,95632,CA,4,3,2307,Residential,Tue May 20 00:00:00 EDT 2008,300000,38.272942,-121.289148
811,9937 BURLINE ST,SACRAMENTO,95827,CA,3,2,1092,Residential,Fri May 16 00:00:00 EDT 2008,150000,38.559641,-121.323160
812,9949 NESTLING CIR,ELK GROVE,95757,CA,3,2,1543,Residential,Fri May 16 00:00:00 EDT 2008,275000,38.397455,-121.468391


In [13]:
sacramento_train, sacramento_test = train_test_split(
    sacramento, train_size=0.75, random_state=42
)

In [15]:
# step 1 define predictor and response
x_train = sacramento_train["sq__ft"]
y_train = sacramento_train["price"]

In [18]:
lm = LinearRegression()

In [21]:
lm.fit(
    sacramento_train [["sq__ft"]],
    sacramento_train["price"]
    )

In [22]:
pd.DataFrame({"slope": [lm.coef_[0]], "intercept": [lm.intercept_]})

Unnamed: 0,slope,intercept
0,132.075278,18547.098325


In [23]:
mlm = LinearRegression()

In [26]:
mlm.fit(
    sacramento_train [["sq__ft", "beds"]],
    sacramento_train["price"]
)

In [27]:
mlm.coef_

array([   151.1817206 , -20996.77714783])

In [28]:
mlm.coef_[0]

151.18172060169593

In [29]:
mlm.coef_[1]

-20996.777147832974

In [30]:
mlm.intercept_

56341.60511002343

In [32]:
sacramento_test["predicted"] = mlm.predict(sacramento_test[["sq__ft", "beds"]])

In [34]:
mlm_rmspe = mean_squared_error(
    y_true=sacramento_test["price"],
    y_pred= sacramento_test["predicted"]
) ** 0.5

In [35]:
mlm_rmspe

104438.56729994333

In [36]:
mlm_r2 = r2_score(
    y_true = sacramento_test["price"],
    y_pred= sacramento_test["predicted"]
)
mlm_r2

0.3167844099607501

In [39]:
return_dictionary_mlm = cross_validate(
    estimator=mlm,
    cv=5,
    X=sacramento[["sq__ft"]],
    y=sacramento["price"],
    scoring='neg_root_mean_squared_error'
)

In [40]:
return_dictionary_mlm

{'fit_time': array([0.00266099, 0.00161886, 0.00151801, 0.00101972, 0.00152302]),
 'score_time': array([0.00101185, 0.00069237, 0.00093603, 0.00061703, 0.00060701]),
 'test_score': array([-115422.28570704,  -99220.02660767,  -77991.04857821,
         -66840.12829727,  -64944.27385676])}

In [42]:
cv_5_df = pd.DataFrame(return_dictionary_mlm)
cv_5_df

Unnamed: 0,fit_time,score_time,test_score
0,0.002661,0.001012,-115422.285707
1,0.001619,0.000692,-99220.026608
2,0.001518,0.000936,-77991.048578
3,0.00102,0.000617,-66840.128297
4,0.001523,0.000607,-64944.273857


In [43]:
cv_5_df["test_score"] = cv_5_df["test_score"].abs()

In [44]:
cv_5_df

Unnamed: 0,fit_time,score_time,test_score
0,0.002661,0.001012,115422.285707
1,0.001619,0.000692,99220.026608
2,0.001518,0.000936,77991.048578
3,0.00102,0.000617,66840.128297
4,0.001523,0.000607,64944.273857


In [45]:
cv_5_metrics_mlm = cv_5_df.agg (["mean", "sem"])
cv_5_metrics_mlm

Unnamed: 0,fit_time,score_time,test_score
mean,0.001668,0.000773,84883.552609
sem,0.000269,8.4e-05,9769.327806


In [7]:
sacramento["type"].unique()

array(['Residential', 'Multi-Family', 'Condo'], dtype=object)

In [8]:
mlm = LinearRegression()

In [9]:
sacramento_encoded = pd.get_dummies(sacramento, columns = ['type'], drop_first=True)
sacramento_encoded

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,sale_date,price,latitude,longitude,type_Multi-Family,type_Residential
0,1 KENNELFORD CIR,SACRAMENTO,95823,CA,3,2,1144,Mon May 19 00:00:00 EDT 2008,200345,38.464520,-121.427606,False,True
1,10 SEA FOAM CT,SACRAMENTO,95831,CA,3,3,2052,Wed May 21 00:00:00 EDT 2008,415000,38.487885,-121.545947,False,True
2,100 CHELSEA CT,FOLSOM,95630,CA,3,2,1905,Mon May 19 00:00:00 EDT 2008,500000,38.694350,-121.177259,False,True
3,100 REBECCA WAY,FOLSOM,95630,CA,3,2,2185,Wed May 21 00:00:00 EDT 2008,344250,38.684790,-121.149199,False,True
4,100 TOURMALINE CIR,SACRAMENTO,95834,CA,5,3,3076,Mon May 19 00:00:00 EDT 2008,240000,38.634370,-121.510779,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,9880 IZILDA CT,SACRAMENTO,95829,CA,5,4,3863,Fri May 16 00:00:00 EDT 2008,598695,38.453260,-121.325730,False,True
810,993 MANTON CT,GALT,95632,CA,4,3,2307,Tue May 20 00:00:00 EDT 2008,300000,38.272942,-121.289148,False,True
811,9937 BURLINE ST,SACRAMENTO,95827,CA,3,2,1092,Fri May 16 00:00:00 EDT 2008,150000,38.559641,-121.323160,False,True
812,9949 NESTLING CIR,ELK GROVE,95757,CA,3,2,1543,Fri May 16 00:00:00 EDT 2008,275000,38.397455,-121.468391,False,True


In [None]:
mlm.fit(sacramento_encoded["sq__ft", "type_Multi-Family"])

In [10]:
import numpy as np
import pandas as pd
from sklearn import set_config
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
pd