In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the data set, drop rows with missing values and remove the `address`
# column so it is not used as a feature. Built as one chain (no `inplace=True`)
# so re-running the cell always yields the same `data` frame.
data = (
    pd.read_csv('house_price_prediction.csv')
    .dropna()
    .drop(columns=['address'])
)

# A deliberately simple hold-out scheme: the first 80% of the rows are used for
# training and the remaining 20% for testing.
# NOTE: the split follows file order and does not shuffle the rows.
TRAIN_FRACTION = 0.8
split_index = int(TRAIN_FRACTION * len(data))

y_data = data['price']
x_data = data.drop(columns=['price'])

# `.iloc` makes the positional split explicit: after `dropna()` the index
# labels are no longer contiguous, so labels and positions differ.
X_train, X_test = x_data.iloc[:split_index], x_data.iloc[split_index:]
Y_train, Y_test = y_data.iloc[:split_index], y_data.iloc[split_index:]

In [16]:
# Display the cleaned frame (rows with missing values and the `address` column
# already removed); pandas elides the middle rows in the rendered table.
data

Unnamed: 0,avg_income,avg_area_house_age,avg_area_num_rooms,avg_bedrooms,avg_population,price
0,79545.45857,5.682861,7.009188,4.09,23086.80050,1.059034e+06
1,79248.64245,6.002900,6.730821,3.09,40173.07217,1.505891e+06
2,61287.06718,5.865890,8.512727,5.13,36882.15940,1.058988e+06
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1.260617e+06
4,59982.19723,5.040555,7.839388,4.23,26354.10947,6.309435e+05
...,...,...,...,...,...,...
5000,65510.58180,5.992305,6.792336,4.07,46501.28380,1.298950e+06
5005,69639.14090,5.007510,7.778375,6.05,54056.12843,1.381831e+06
5007,60567.94414,7.830362,6.137356,3.46,22837.36103,1.060194e+06
5010,68001.33124,5.534388,7.130144,5.44,42625.62016,1.198657e+06


In [4]:
# Now we can code the functions for linear regression. Assuming a linear model is a strong
# simplification of house-market pricing, but it serves as a good example for learning the technique.


def mean_square_error(y_predicted, y_data):
    """Return the mean of the squared differences between predictions and targets.

    Args:
        y_predicted: Predicted values (array-like supporting subtraction).
        y_data: Observed target values, compatible in shape with ``y_predicted``.

    Returns:
        The average of ``(y_predicted - y_data) ** 2`` as computed by ``np.mean``.
    """
    residuals = y_predicted - y_data
    squared_residuals = residuals ** 2
    return np.mean(squared_residuals)

def batch_linear_regression(x_data, y_data, learning_rate: float, epochs: int, log_every: int = 1):
    """Fit ``y ~ x_data @ weights + bias`` with full-batch gradient descent on the MSE.

    Every epoch uses all samples to compute the gradient of the mean squared
    error with respect to the weights and the bias, then takes one step of size
    ``learning_rate`` against that gradient. Weights and bias start at zero.

    Args:
        x_data: 2-D array-like of shape (m, n): m samples, n features.
        y_data: Array-like with m target values. A column vector of shape
            (m, 1) is flattened so it cannot broadcast against the predictions.
        learning_rate: Step size applied to both gradients.
        epochs: Number of full passes (gradient steps) to perform.
        log_every: Print the cost every ``log_every`` epochs. The default of 1
            prints every epoch; 0 or ``None`` disables printing.

    Returns:
        Tuple ``(weights, bias)``: ``weights`` is a float array of shape (n,)
        and ``bias`` a scalar.

    Raises:
        ValueError: If ``x_data`` is not 2-D, contains no samples, or the number
            of targets does not match the number of samples.
    """
    # Convert once, outside the loop, so each epoch works on plain float arrays
    # instead of re-converting pandas objects on every iteration.
    features = np.asarray(x_data, dtype=float)
    targets = np.asarray(y_data, dtype=float).reshape(-1)

    if features.ndim != 2:
        raise ValueError(f"x_data must be 2-D (samples, features), got {features.ndim} dimension(s)")
    m, n = features.shape  # m: number of samples, n: number of features
    if m == 0:
        raise ValueError("x_data must contain at least one sample")
    if targets.shape[0] != m:
        raise ValueError(f"x_data has {m} samples but y_data has {targets.shape[0]} values")

    weights = np.zeros(n)
    bias = 0.0
    logging_enabled = bool(log_every) and log_every > 0

    for epoch in range(epochs):
        residuals = features @ weights + bias - targets

        if logging_enabled and epoch % log_every == 0:
            # Cost of the parameters *before* this epoch's update (the MSE).
            cost = np.mean(residuals ** 2)
            print(f'epoch: {epoch}, cost: {cost}')

        # Gradients of mean(residuals ** 2) with respect to weights and bias.
        weights_gradient = (2 / m) * (features.T @ residuals)
        bias_gradient = (2 / m) * np.sum(residuals)

        weights = weights - learning_rate * weights_gradient
        bias = bias - learning_rate * bias_gradient

    return weights, bias



In [5]:
# Train the hand-written model on the training split.
# NOTE(review): the very small step size is presumably needed because the
# features are used unscaled — confirm before changing it.
weights, bias = batch_linear_regression(
    X_train,
    Y_train,
    learning_rate=1e-10,
    epochs=10000,
)

epoch: 0, cost: 1652214496024.905
epoch: 1, cost: 143808447473.3131
epoch: 2, cost: 67121280098.774895
epoch: 3, cost: 63203466211.71586
epoch: 4, cost: 62985040176.99282
epoch: 5, cost: 62955420826.246666
epoch: 6, cost: 62936147795.87427
epoch: 7, cost: 62918120284.89779
epoch: 8, cost: 62900846587.28971
epoch: 9, cost: 62884273743.29322
epoch: 10, cost: 62868372225.68453
epoch: 11, cost: 62853114785.16426
epoch: 12, cost: 62838475331.12843
epoch: 13, cost: 62824428832.540344
epoch: 14, cost: 62810951272.36092
epoch: 15, cost: 62798019606.342995
epoch: 16, cost: 62785611723.62255
epoch: 17, cost: 62773706408.91272
epoch: 18, cost: 62762283306.22929
epoch: 19, cost: 62751322884.0856
epoch: 20, cost: 62740806402.09717
epoch: 21, cost: 62730715878.93887
epoch: 22, cost: 62721034061.600105
epoch: 23, cost: 62711744395.885185
epoch: 24, cost: 62702830998.10851
epoch: 25, cost: 62694278627.93638
epoch: 26, cost: 62686072662.32856
epoch: 27, cost: 62678199070.53565
epoch: 28, cost: 62670644

In [6]:
# Score the hand-written model: mean squared error on the held-out test split.
error1 = mean_square_error(
    y_predicted=np.dot(X_test, weights) + bias,
    y_data=Y_test,
)
print(error1)

61382675514.23135


In [7]:
# Reference model: fit scikit-learn's LinearRegression on the same training split.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X=X_train, y=Y_train)


In [8]:
# Predict the target for the held-out rows with the scikit-learn model.
predictions = model.predict(X=X_test)

In [9]:
# Test-split mean squared error of the scikit-learn model.
error2 = mean_square_error(y_predicted=predictions, y_data=Y_test)
print(error2)

10407734541.932314


In [10]:
# True when the scikit-learn model's test error is strictly below the hand-written model's.
error2 < error1

True