In [1]:
# dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Load the housing dataset; the first CSV column becomes the row index.
CSV_PATH = "../sample_data/housing.csv"

df = pd.read_csv(filepath_or_buffer=CSV_PATH, index_col=0)

In [None]:
# Pairwise column correlations (Pearson is the pandas default, spelled out here).
# Left as the cell's last expression so the notebook renders the table.
df.corr(method="pearson")

In [None]:
# Correlation heatmap helper

def heatmap(df):
    """Draw the correlation matrix of ``df`` as an annotated heatmap.

    Parameters
    ----------
    df : object exposing ``.corr()``
        In this notebook, the housing DataFrame. Its correlation matrix is
        what gets plotted, not the raw values.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    # Explicit figure/axes pair; seaborn is told which axes to draw on.
    _, axes = plt.subplots(figsize=(12, 8))
    correlations = df.corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=axes)
    plt.show()

# Render the heatmap for the loaded dataset.
heatmap(df=df)

In [None]:
# Simple linear regression on the single feature most correlated with the target.
X = df.loc[:, ['LSTAT']]
Y = df.loc[:, "PRICE"]

# fit() returns the estimator itself, so creation and fitting are chained.
regressor = LinearRegression().fit(X, Y)

# Predictions on the same rows the model was fitted on.
Y_pred = regressor.predict(X)

# plot MSE and R2

def plot_mse_r2(Y, Y_pred, X=None, degree=1):
    """Show a figure annotated with the MSE and R2 of ``Y_pred`` against ``Y``.

    Parameters
    ----------
    Y : array-like
        Observed target values.
    Y_pred : array-like
        Predicted target values, aligned row by row with ``Y``.
    X : array-like, optional
        Feature values. When given, the fit is drawn in red over a light-blue
        scatter of the observations. When omitted, only the metrics are shown.
    degree : int, default 1
        Only used for the title: values above 1 append the degree to it.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    mse = mean_squared_error(Y, Y_pred)
    r2 = r2_score(Y, Y_pred)
    metrics_label = f"MSE: {mse}\nR2: {r2}"

    if X is not None:
        x_values = np.asarray(X)
        fitted = np.asarray(Y_pred)
        single_feature = x_values.ndim == 1 or (
            x_values.ndim == 2 and x_values.shape[1] == 1
        )
        if single_feature:
            # Draw the fit left to right. Connecting points in row order makes
            # any non-linear fit (degree > 1) zig-zag across the plot.
            order = np.argsort(x_values.ravel(), kind="stable")
            plt.plot(x_values.ravel()[order], fitted[order], c="red")
        else:
            # Several feature columns: keep matplotlib's column-wise plotting.
            plt.plot(X, Y_pred, c="red")
        plt.scatter(X, Y, color="lightblue", s=10)

    if degree > 1:
        plt.title(f"Predicted prices - Degree {degree}")
    else:
        plt.title("Predicted prices")

    if X is not None:
        # Data coordinates.
        # NOTE(review): (23, 45) presumably suits the LSTAT/PRICE ranges used in
        # this notebook - confirm before reusing with other features.
        plt.text(23, 45, metrics_label)
    else:
        # Nothing was plotted, so the axes keep their default 0..1 limits and a
        # data-space anchor of (23, 45) would fall far outside them. Anchor the
        # label in axes coordinates instead.
        plt.text(
            0.5, 0.5, metrics_label,
            transform=plt.gca().transAxes, ha="center", va="center",
        )
    plt.show()

# Plot the single-feature fit together with its error metrics.
plot_mse_r2(Y=Y, Y_pred=Y_pred, X=X)

In [None]:
# polynomial regression with graph

def polinomial_regression(X, Y, degree=5):
    """Fit a single-feature polynomial regression and plot the fitted curve.

    The feature is expanded with ``PolynomialFeatures(degree=degree)`` and the
    notebook-level ``regressor`` is refitted on the expanded matrix, so the
    shared estimator is left fitted on polynomial features after the call.

    Parameters
    ----------
    X : array-like
        The single feature, either as one column (shape ``(n, 1)``) or as a
        flat sequence of length ``n``. ndarrays, DataFrames and Series are
        accepted.
    Y : array-like
        Target values, one per row of ``X``.
    degree : int, default 5
        Degree of the polynomial expansion.

    Raises
    ------
    ValueError
        If ``X`` has more than one feature column; the curve is drawn against
        one x-axis only.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    # Normalise the input first: ``.flatten()`` exists on ndarrays only, so a
    # DataFrame/Series would otherwise fail after the model was already refit.
    feature = np.asarray(X)
    if feature.ndim == 1:
        feature = feature.reshape(-1, 1)
    if feature.ndim != 2 or feature.shape[1] != 1:
        raise ValueError(
            "polinomial_regression expects exactly one feature column, "
            f"got input of shape {feature.shape}"
        )

    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(feature)

    regressor.fit(X_poly, Y)
    Y_pred = np.asarray(regressor.predict(X_poly))

    # Metrics are computed once, before any plotting.
    mse = mean_squared_error(Y, Y_pred)
    r2 = r2_score(Y, Y_pred)

    # Order the points by feature value so the curve is drawn left to right.
    x_flat = feature.ravel()
    sorted_indices = np.argsort(x_flat)

    plt.plot(x_flat[sorted_indices], Y_pred[sorted_indices], c="red")
    plt.scatter(x_flat, Y, color="lightblue", s=10)
    plt.title(f"Predicted prices - Degree {degree}")
    # Data coordinates.
    # NOTE(review): (23, 45) presumably suits the LSTAT/PRICE ranges - confirm
    # before reusing with other features.
    plt.text(23, 45, f"MSE: {mse}\nR2: {r2}")
    plt.show()

# Hand plain NumPy arrays to the helper: one feature column and the target.
X = df.loc[:, ['LSTAT']].values
Y = df.loc[:, "PRICE"].values

polinomial_regression(X=X, Y=Y, degree=7)

In [None]:
# Multiple linear regression on the two features most correlated with the target.
X = df.loc[:, ['LSTAT', 'RM']]
Y = df.loc[:, "PRICE"]

# Refit the shared estimator, then predict on the training rows in one chain.
Y_pred = regressor.fit(X, Y).predict(X)

In [None]:
# Polynomial regression with the three most correlated features.
X_cols = df.loc[:, ["LSTAT", "RM", "PTRATIO"]]
Y = df.loc[:, "PRICE"]

degrees = list(range(2, 6))

for degree in degrees:
    # Expand the three columns into polynomial terms of the current degree.
    poly = PolynomialFeatures(degree=degree)
    X = poly.fit_transform(X_cols)

    # NOTE(review): X and Y_pred are overwritten on every pass and nothing is
    # reported, so only the last degree's fit is left once the loop ends.
    Y_pred = regressor.fit(X, Y).predict(X)

In [None]:
# Polynomial regression without the bias column in the expanded features.
X_cols = df.loc[:, ["LSTAT", "RM", "PTRATIO"]]
Y = df.loc[:, "PRICE"]

degrees = list(range(2, 6))

for degree in degrees:
    # include_bias=False is the only difference from the default expansion.
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X = poly.fit_transform(X_cols)

    # NOTE(review): X and Y_pred are overwritten on every pass and nothing is
    # reported, so only the last degree's fit is left once the loop ends.
    Y_pred = regressor.fit(X, Y).predict(X)

In [None]:
# Linear regression using every column except the target as a feature.
X = df.drop(columns="PRICE")
Y = df.loc[:, "PRICE"]

# Refit the shared estimator and predict on the training rows.
Y_pred = regressor.fit(X, Y).predict(X)

In [None]:
# Min-max normalization: rescale every feature column to the [0, 1] range.

X_cols = df.drop("PRICE", axis=1)
Y = df["PRICE"]

# NOTE: a constant column (max == min) divides 0 by 0 and yields NaN here.
X = (X_cols - X_cols.min()) / (X_cols.max() - X_cols.min())

regressor.fit(X, Y)

# Stored as `Y_pred`, like every other cell. The previous lowercase `y_pred`
# left `Y_pred` holding the predictions of the preceding cell's model.
Y_pred = regressor.predict(X)

In [None]:
# Standardize the features with StandardScaler before fitting.
X_cols = df.drop(columns="PRICE")
Y = df.loc[:, "PRICE"]

# Learn the scaling parameters first, then apply them to the same columns.
scaler = StandardScaler().fit(X_cols)
X = scaler.transform(X_cols)

# Refit the shared estimator on the scaled features and predict on them.
Y_pred = regressor.fit(X, Y).predict(X)

In [None]:
# Predict prices for new rows with the currently fitted scaler and regressor.
# NOTE(review): unlike the training CSV, this file is read without index_col -
# confirm it has no leading index column.
X_pred = pd.read_csv(filepath_or_buffer="../sample_data/housing_predict.csv")

# OWNER is dropped before scaling; the remaining columns go through the
# already-fitted scaler.
X = scaler.transform(X_pred.drop(columns="OWNER"))

Y_pred = regressor.predict(X)

# Last expression of the cell, so the notebook displays the predictions.
Y_pred

In [None]:
# Write one row per owner with the predicted price to an Excel workbook.
to_save = pd.DataFrame({"OWNER": X_pred["OWNER"], "PRICE": Y_pred})

# index=False keeps the DataFrame index out of the spreadsheet.
to_save.to_excel("housing_predictions.xlsx", index=False)