In [19]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import pickle

In [20]:
# Load the raw table and one-hot encode the categorical "propertyType" column.
# The drop is chained (no inplace=True) so re-running this cell always rebuilds
# df_with_dummy from scratch.
df = pd.read_csv("data2.csv")
df_with_dummy = pd.get_dummies(df, columns=["propertyType"]).drop(columns=["Unnamed: 0"])

# Target: natural logarithm of the "soldPrice" column.
y = np.log(df_with_dummy.loc[:, "soldPrice"])

# Features: every column from "bednumber" through the last column (which
# includes the dummy columns appended by get_dummies), minus the coordinates.
# .copy() gives X its own data, so the later cells that assign columns on X do
# not operate on a selection of df_with_dummy (avoids SettingWithCopyWarning).
X = df_with_dummy.loc[:, "bednumber":]
X = X.loc[:, ~X.columns.isin(["latitude", "longitude"])].copy()

In [22]:
# Rescale the hospital-distance feature by a factor of 1000.
# NOTE(review): this overwrites the column in place, so running the cell more
# than once multiplies it again — re-run the cell that builds X first.
X["nearest_hospital_distance"] = X["nearest_hospital_distance"].mul(1000)

In [23]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear regression of y on the training features.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions for the held-out rows, on the same scale as y.
predictions = model.predict(X_test)

# Report held-out performance. The label text is kept exactly as it was so the
# printed output matches earlier saved runs.
print(f"R squrad: {model.score(X_test, y_test)}")
# scikit-learn's documented argument order is (y_true, y_pred): truth first.
print(f"MSE: {mean_squared_error(y_test, predictions)}")

R squrad: 0.5012424840071016
MSE: 0.10513298886742027


\begin{align}
\log(\text{Price}) = &\ \beta_0 \ + \\
            &\ \beta_1 \, \text{Number of bathrooms} \ + \\
            &\ \beta_2 \, \text{Old/New} \ + \\
            &\ \beta_3 \, \text{Property Type} \ + \\
            &\ \beta_4 \, \text{Latitude} \ + \\
            &\ \beta_5 \, \text{Longitude}
\end{align}

In [24]:
# Preview the first rows of the raw table instead of rendering the whole frame.
df.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,soldPrice,bednumber,propertyType,duration_free_not_lease,isNew,latitude,longitude,nearest_hospital_distance
0,0,0,155000,2,Flat,0,0,51.286798,-2.820970,0.021107
1,1,1,253000,3,Semi-Detached,1,0,51.288002,-2.814496,0.014623
2,2,2,347500,4,Detached,1,0,51.287815,-2.813015,0.013142
3,3,3,320000,4,Detached,1,0,51.286798,-2.822261,0.022397
4,4,4,234000,2,Detached,1,0,51.287799,-2.810176,0.010303
5,5,5,219000,2,Terraced,1,0,51.287296,-2.815401,0.015527
6,6,6,343000,4,Detached,1,0,51.288177,-2.811358,0.011483
7,7,7,151500,2,Flat,0,0,51.285478,-2.816930,0.017065
8,8,8,580000,6,Terraced,1,0,51.287640,-2.821185,0.021309
9,9,9,230000,2,Semi-Detached,1,0,51.286836,-2.819365,0.019503


In [18]:
# Preview the first rows of the feature matrix instead of rendering all of it.
# NOTE(review): the saved output of this cell has no nearest_hospital_distance
# column, although X is built to include it — re-run to refresh the output.
X.head(10)

Unnamed: 0,bednumber,duration_free_not_lease,isNew,propertyType_Detached,propertyType_Flat,propertyType_Semi-Detached,propertyType_Terraced
0,2,0,0,0,1,0,0
1,3,1,0,0,0,1,0
2,4,1,0,1,0,0,0
3,4,1,0,1,0,0,0
4,2,1,0,1,0,0,0
5,2,1,0,0,0,0,1
6,4,1,0,1,0,0,0
7,2,0,0,0,1,0,0
8,6,1,0,0,0,0,1
9,2,1,0,0,0,1,0


In [4]:
# Replace the "bednumber" feature with its square before refitting.
# NOTE(review): this overwrites the column in place, so running the cell more
# than once squares it again — re-run the cell that builds X first.
X["bednumber"] = X["bednumber"].pow(2)

In [5]:
# Re-split with the same 20% hold-out and seed, then refit on the current X.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a fresh linear regression; this rebinds `model` to the new estimator.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions for the held-out rows, on the same scale as y.
predictions = model.predict(X_test)

# Report held-out performance. The label text is kept exactly as it was so the
# printed output matches earlier saved runs.
print(f"R squrad: {model.score(X_test, y_test)}")
# scikit-learn's documented argument order is (y_true, y_pred): truth first.
print(f"MSE: {mean_squared_error(y_test, predictions)}")

R squrad: 0.6233126294655015
MSE: 0.07940184932163534


In [4]:
def serialize(obj, path):
    """
    Write a Python object to disk in pickle format.

    Parameters
    ----------
    obj : object
        The object to pickle.
    path : str
        Destination file; opened in binary write mode, so an existing file at
        this location is overwritten.

    Returns
    -------
    None
    """
    with open(path, "wb") as sink:
        # Pickler(file).dump(obj) is the documented equivalent of pickle.dump(obj, file).
        pickle.Pickler(sink).dump(obj)

In [5]:
# Persist the estimator currently bound to `model` (whichever fit cell ran
# last) to a path relative to the notebook's working directory.
# NOTE(review): the pickle holds only the estimator; anything that loads it
# must apply the same feature transformations that were applied to X here.
pkl_path = "./app/linear_regression.pkl"
serialize(obj=model, path=pkl_path)

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# import numpy as np
# import matplotlib.pyplot as plt

# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

# model = RandomForestRegressor(n_jobs=-1)

# estimators = np.arange(10, 200, 10)
# scores = []
# for n in estimators:
#     model.set_params(n_estimators=n)
#     model.fit(X_train, y_train)
#     scores.append(model.score(X_test, y_test))
# plt.title("Effect of n_estimators")
# plt.xlabel("n_estimator")
# plt.ylabel("score")
# plt.plot(estimators, scores)

# scores