In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC

In [2]:
#-------------------------------------------------------------------
#                    Divide data into test and training
#-------------------------------------------------------------------

# read the data
df = pd.read_csv('../data/cleaned_data.csv',encoding = "ISO-8859–1")

#shuffle it's content
df = df.sample(frac=1).reset_index(drop=True)

#get training and test data
msk = np.random.rand(len(df)) < 0.8
train_data = df[msk]
test_data = df[~msk]

#save the datas into csv file
df_train = pd.DataFrame(train_data)
df_train.to_csv("../data/training_data.csv",index=False)
df_test = pd.DataFrame(test_data)
df_test.to_csv("../data/test_data.csv",index=False)

In [3]:
df

Unnamed: 0,Type of Property,Floor Size,Levies,Rates and Taxes,Pets Allowed,Lounge,Garage,Garden,price,Erf Size,Kitchen,Pool,Dining Room,Parking,Bathroom,Bedroom
0,0.0,175.0,1932.0,850.0,1,0.0,1.0,1.0,1350000,1.0,0.0,0.0,0.0,1.0,3.0,1.0
1,1.0,227.0,3558.0,1151.0,1,0.0,1.0,0.0,2990000,689.0,1.0,1.0,0.0,2.0,3.0,1.0
2,0.0,570.0,3620.0,2550.0,1,0.0,2.0,1.0,4300000,1.0,0.0,0.0,1.0,0.0,3.0,1.0
3,0.0,282.0,1932.0,1151.0,1,0.0,0.0,0.0,2950000,21.0,0.0,0.0,0.0,0.0,3.0,1.0
4,1.0,100.0,2250.0,1600.0,1,0.0,0.0,0.0,1599000,9102.0,1.0,1.0,1.0,1.0,3.0,1.0
5,0.0,478.0,1932.0,1863.0,1,1.0,1.0,1.0,3300000,2.0,0.0,0.0,1.0,2.0,3.0,1.0
6,0.0,100.0,1932.0,1151.0,1,0.0,0.0,0.0,860000,400.0,1.0,0.0,0.0,0.0,3.0,1.0
7,2.0,65.0,900.0,1151.0,1,1.0,1.0,0.0,645000,9102.0,0.0,0.0,0.0,0.0,3.0,1.0
8,0.0,137.0,1932.0,141.0,1,0.0,0.0,0.0,900000,291.0,1.0,0.0,1.0,0.0,3.0,1.0
9,0.0,210.0,1932.0,1151.0,1,0.0,2.0,1.0,1350000,1.0,0.0,0.0,0.0,0.0,3.0,1.0


In [4]:
#set X_train, X_test and y_train,y_test
X_train = df_train.loc[:, df_train.columns != 'price'].values
y_train = df_train["price"].values

X_test = df_test.loc[:,df_test.columns != 'price'].values
y_test = df_test["price"].values

print(X_test)

[[0.000e+00 1.750e+02 1.932e+03 ... 1.000e+00 3.000e+00 1.000e+00]
 [0.000e+00 2.100e+02 1.932e+03 ... 0.000e+00 3.000e+00 1.000e+00]
 [0.000e+00 2.820e+02 1.932e+03 ... 0.000e+00 3.000e+00 1.000e+00]
 ...
 [0.000e+00 2.820e+02 1.932e+03 ... 0.000e+00 3.000e+00 1.000e+00]
 [2.000e+00 8.600e+01 1.090e+03 ... 1.000e+00 3.000e+00 1.000e+00]
 [0.000e+00 2.820e+02 1.932e+03 ... 0.000e+00 3.000e+00 1.000e+00]]


In [17]:
#---------------------------------------------------
#             Test model:k_cross validation
#---------------------------------------------------
lasso = linear_model.Lasso()
ridge = linear_model.Ridge()
linear = linear_model.LinearRegression()
bayesian = linear_model.BayesianRidge()

model = linear_model
means = np.array(4)
models = {}
k = 10

#test lasso 
lasso_result = cross_validate(lasso,X_train,y_train,cv=k)
print("lasso_results: ",lasso_result["test_score"].mean())

#test ridge
ridge_result = cross_validate(ridge,X_train,y_train,cv=k)
print("ridge_result: ",ridge_result["test_score"].mean())

#test linear
linear_result = cross_validate(linear,X_train,y_train,cv=k)
print("linear_result: ",linear_result["test_score"].mean())

#test bayesian
bayesian_result = cross_validate(bayesian,X_train,y_train,cv=k)
print("bayesian_result: ",bayesian_result["test_score"].mean())

models={"lasso": lasso_result["test_score"].mean(), "ridge":ridge_result["test_score"].mean(), "linear": linear_result["test_score"].mean(), "bayesian":bayesian_result["test_score"].mean() }
key_list = list(models.keys())
val_list = list(models.values())

means = [lasso_result["test_score"].mean(),ridge_result["test_score"].mean(),linear_result["test_score"].mean(),bayesian_result["test_score"].mean()]
print("model:",np.amin(means))

key= val_list.index(np.amin(means))

if(key_list[key] == "lasso"):
    model = model.Lasso()
    
elif(key_list[key] == "ridge"):
    model = model.Ridge()

elif(key_list[key] == "linear"):
    model = model.LinearRegression()
    
elif(key_list[key] == "bayesian"):
    model=model.BayesianRidge()

lasso_results:  0.3242063541310323
ridge_result:  0.3244694406727555
linear_result:  0.32420606237858085
bayesian_result:  0.24578430613149563
model: 0.24578430613149563


In [18]:
#test model with test features
model.fit(X_train,y_train)
y_predict = model.predict(X_test)
print(y_predict)

352912872.45606536
