In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Read the housing listings from CSV into the working DataFrame
csv_path = "housing_data.csv"
df = pd.read_csv(csv_path)

In [5]:
# Check data set
# A bare last expression lets the notebook render the frame with its rich
# table display; print() falls back to plain text that wraps wide frames
# into several column chunks.
df.head()



   Unnamed: 0             address  asked_price  \
0         0.0    Markviksvägen 15    6475000.0   
1         1.0  Mälarhöjdsvägen 38    9950000.0   
2         2.0     Häradsvägen 202    5795000.0   
3         3.0     Hemmansvägen 18   11950000.0   
4         4.0     Alholmsbacken 8    7950000.0   

                                coordinate  final_price  \
0                  [59.371685, 17.8250389]      6950000   
1                 [59.3008766, 17.9478054]     11300000   
2  [59.27202322587505, 17.954076342308372]      6200000   
3   [59.28390406370304, 18.07625832074728]     12700000   
4                   [59.28134, 17.9009438]      8100000   

   pourcentage_difference  land_area   area  \
0                     7.0      635.0  117.0   
1                    14.0      882.0  195.0   
2                     7.0     1224.0   90.0   
3                     6.0      542.0  110.0   
4                     2.0      690.0  157.0   

                                       commune  price_per_are

In [6]:
# Summary statistics of the numeric columns, shown through the notebook's rich
# display instead of print() so the table is not wrapped as plain text.
df.describe()

       Unnamed: 0   asked_price   final_price  pourcentage_difference  \
count  500.000000  5.000000e+02  5.000000e+02              500.000000   
mean    24.950000  9.375150e+06  1.004715e+07                8.914000   
std     14.448183  3.911988e+06  4.014927e+06                8.797579   
min      0.000000  1.475000e+06  1.490000e+06                0.000000   
25%     12.475000  6.995000e+06  7.450000e+06                1.000000   
50%     24.950000  8.475000e+06  8.852500e+06                7.000000   
75%     37.425000  1.012375e+07  1.130000e+07               14.000000   
max     49.900000  2.990000e+07  3.110000e+07               48.000000   

          land_area         area  price_per_area       rooms  \
count    495.000000   499.000000      499.000000  490.000000   
mean    1012.278788   142.759519    79120.142285    6.165306   
std     1508.117107   116.952734    29949.142598    2.292341   
min       93.000000    21.000000    20617.000000    2.000000   
25%      557.500000   

In [7]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              500 non-null    float64
 1   address                 500 non-null    object 
 2   asked_price             500 non-null    float64
 3   coordinate              500 non-null    object 
 4   final_price             500 non-null    int64  
 5   pourcentage_difference  500 non-null    float64
 6   land_area               495 non-null    float64
 7   area                    499 non-null    float64
 8   commune                 500 non-null    object 
 9   price_per_area          499 non-null    float64
 10  rooms                   490 non-null    float64
 11  sale_date               500 non-null    object 
 12  supplemental_area       500 non-null    float64
dtypes: float64(8), int64(1), object(4)
memory usage: 50.9+ KB
None


In [8]:
# Data cleaning
# Drop every row that has at least one missing value. The result is rebound to
# `df` instead of using inplace=True, so the cleaning step is an explicit
# assignment and the previously loaded frame object is never mutated.
df = df.dropna()

In [9]:
# Encode categorical variables into dummy variables
# `address` is free text that is close to unique per listing; one-hot encoding it
# would add roughly one column per row, so it is removed instead of encoded.
# NOTE(review): `Unnamed: 0` appears to be a header-less row counter written out
# with the CSV — confirm it carries no signal before relying on this drop.
df = df.drop(columns=["Unnamed: 0", "address"], errors="ignore")

# `coordinate` holds strings such as "[59.371685, 17.8250389]". Split it into
# two numeric columns rather than treating every distinct string as a category.
# Presumably the order is (latitude, longitude) — TODO confirm with the data source.
# The guard keeps the cell re-runnable once the column has been replaced.
if "coordinate" in df.columns:
    lat_lon = (
        df["coordinate"]
        .str.strip("[] ")
        .str.split(",", n=1, expand=True)
        .astype(float)
    )
    df = df.assign(latitude=lat_lon[0], longitude=lat_lon[1]).drop(columns="coordinate")

# `sale_date` is a string column whose format is not shown in this notebook, so
# it is dropped instead of being expanded into one dummy per distinct date.
# TODO: parse it into a numeric feature (e.g. year/month) if sale timing matters.
df = df.drop(columns=["sale_date"], errors="ignore")

# One-hot encode the remaining object columns (`commune`), dropping the first
# level of each so the dummies are not perfectly collinear.
df = pd.get_dummies(df, drop_first=True)

In [None]:
# Data analysis
# Plot the correlation matrix
# After dummy encoding the frame can hold far more columns than an annotated
# heatmap can show legibly, so only the columns most correlated (in absolute
# value) with the target are plotted. The target itself is part of that set.
N_TOP_FEATURES = 12
corr = df.corr()
top_cols = corr["final_price"].abs().nlargest(N_TOP_FEATURES).index
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr.loc[top_cols, top_cols],
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,  # fix the colour scale to the full correlation range
    vmax=1,
)
plt.title(f"Correlation matrix: top {len(top_cols)} columns by |corr| with final_price")
plt.show()


In [None]:
# Distribution of the target variable (final_price): histogram with a KDE overlay,
# drawn on an explicitly created 8x6 axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df["final_price"], kde=True, ax=ax)
plt.show()

In [None]:
# Scatter of asked_price (x) against final_price (y), drawn on an explicitly
# created 8x6 axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x="asked_price", y="final_price", ax=ax)
plt.show()

In [None]:
# Build predictive model
# Define features and target variable
# `pourcentage_difference` is the gap between asked and final price (e.g.
# 6,475,000 -> 6,950,000 is listed as 7.0), so together with `asked_price` it
# reveals the target. `price_per_area` is presumably final_price / area —
# TODO confirm. Both are excluded so the model cannot read the answer from
# its own inputs; errors="ignore" keeps the cell working if a column is absent.
leaky_columns = ["pourcentage_difference", "price_per_area"]
X = df.drop(columns="final_price").drop(columns=leaky_columns, errors="ignore")
y = df["final_price"]

In [None]:

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear regression is the estimator used for this task (intercept fitted,
# which is the library default spelled out explicitly)
lr = LinearRegression(fit_intercept=True)


In [None]:
# Fit the regression on the training split
lr.fit(X=X_train, y=y_train)

In [None]:
# Validation
# Generate predictions for the held-out validation features
y_pred = lr.predict(X=X_val)


In [None]:
# Score the validation predictions: mean squared error and R-squared
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
r2 = r2_score(y_true=y_val, y_pred=y_pred)
print(
    f"Mean squared error: {mse:.2f}",
    f"R-squared score: {r2:.2f}",
    sep="\n",
)

In [None]:
# Compare the model with a baseline model (mean value)
# The baseline always predicts the mean of the *training* targets. Taking the
# mean from the validation targets would leak validation information and pin
# the baseline R-squared to exactly 0 by construction, which makes the
# comparison with the fitted model uninformative.
y_mean = np.mean(y_train)
y_base = np.full(len(y_val), y_mean)
mse_base = mean_squared_error(y_val, y_base)
r2_base = r2_score(y_val, y_base)
print(f"Mean squared error of baseline model: {mse_base:.2f}")
print(f"R-squared score of baseline model: {r2_base:.2f}")

In [None]:
# Plot the actual vs predicted values
# The dashed red diagonal marks perfect predictions (predicted == actual). Its
# end points span the combined range of the actual and predicted prices; they
# were previously referenced without ever being defined, which raised NameError.
y_min = min(np.min(y_val), np.min(y_pred))
y_max = max(np.max(y_val), np.max(y_pred))
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_val, y=y_pred)
plt.plot([y_min, y_max], [y_min, y_max], color="red", linestyle="--")
plt.xlabel("Actual final price")
plt.ylabel("Predicted final price")
plt.show()