# ML per previsione di dati immobiliari

## Dipendenze

In [1]:
# dipendenze
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer #per applicare le trasformazioni alle colonne
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler #per normalizzare i dati
from sklearn.preprocessing import LabelEncoder #per trasformare le variabili categoriche in numeriche

## Data Preprocessing

In [2]:
# Remote location of the raw ("dirty") housing dataset used throughout the notebook.
CSV = "https://raw.githubusercontent.com/ProfAI/machine-learning-fondamenti/main/datasets/housing_dirty.csv"

# The first CSV column is used as the DataFrame index rather than as a feature.
df = pd.read_csv(CSV, index_col=0)


In [3]:
# Name of the target column; it is defined first so the literal appears only once.
y_name = "PRICE"

# Every column other than the target is a candidate feature.
x_names = df.columns.drop(y_name)

In [4]:
# Inspect the data types: print the value distribution of every text (object) column

object_columns = [name for name in df.columns if df[name].dtype == "object"]

for column in object_columns:
    print(df[column].value_counts())

CRIM
HIGH         130
LOW          127
VERY HIGH    127
MODERATE     122
Name: count, dtype: int64
CHAS
NO     471
YES     35
Name: count, dtype: int64


In [5]:
# Check for missing data: number of null values in each column

df.isna().sum()

CRIM         0
ZN           2
INDUS        3
CHAS         0
NOX          7
RM           5
AGE          4
DIS          5
RAD          3
TAX          2
PTRATIO      5
B            3
LSTAT      199
PRICE        4
dtype: int64

In [6]:
# Drop the rows whose target value is missing (the target itself is never imputed)

df = df.dropna(subset=[y_name])

Rimozione di colonne e righe con valori mancanti > 50%

In [7]:
# Drop the columns, then the rows, in which more than 50% of the values are missing.
# `thresh` is the minimum number of non-null values needed for a column/row to be kept.

min_non_null_per_column = df.shape[0] * 0.5
df = df.dropna(axis="columns", thresh=min_non_null_per_column)

# Computed after the column drop, so it is based on the columns that remain.
min_non_null_per_row = df.shape[1] * 0.5
df = df.dropna(axis="index", thresh=min_non_null_per_row)

df.isna().sum()

CRIM         0
ZN           2
INDUS        3
CHAS         0
NOX          7
RM           4
AGE          3
DIS          4
RAD          2
TAX          1
PTRATIO      4
B            2
LSTAT      197
PRICE        0
dtype: int64

## Definizione delle pipelines di trasformazione

In [8]:
# Replace null feature values with the column mean (numeric columns) or mode (all other columns)

def replace_nulls(df):
    """Fill the missing feature values of ``df`` in place.

    Numeric columns are filled with their mean rounded to one decimal;
    every other column is filled with its most frequent value. The target
    column (module-level ``y_name``) is skipped.

    Args:
        df: DataFrame to impute. It is modified in place.

    Returns:
        None.
    """
    for column in df.columns:
        if column == y_name:
            continue

        values = df[column]
        if pd.api.types.is_numeric_dtype(values):
            # One decimal was chosen to match the precision of the other values in the dataset.
            replace_with = round(values.mean(), 1)
        else:
            modes = values.mode()
            if modes.empty:
                # Column holds only nulls: there is no mode to fill with, leave it as is.
                continue
            replace_with = modes[0]

        df[column] = values.fillna(replace_with)

In [9]:
# Split the features into ordinal-categorical, nominal-categorical and numerical columns

def split_features(df):
    """Classify the columns of ``df`` by the encoding they need.

    The categorical columns are curated by hand: "CRIM" is ordinal (label
    encoding) and "CHAS" is nominal (one-hot encoding). Each one is reported
    only if it is actually present in ``df``, independently of the other.
    Numerical columns are those with dtype int64 or float64, excluding the
    target column (module-level ``y_name``).

    Args:
        df: DataFrame whose columns are classified. It is not modified.

    Returns:
        A tuple ``(categorical_ordinal_columns, categorical_nominal_columns,
        numerical_columns)`` of three lists of column names.
    """
    # Manually curated lists of the known categorical columns.
    known_ordinal = ["CRIM"]  # encoded with label encoding (.map())
    known_nominal = ["CHAS"]  # encoded with one-hot encoding (pd.get_dummies())

    # Check each column on its own so a missing one does not disable the other.
    categorical_ordinal_columns = [col for col in known_ordinal if col in df.columns]
    categorical_nominal_columns = [col for col in known_nominal if col in df.columns]

    numerical_columns = [
        col
        for col in df.columns
        if df[col].dtype in ["int64", "float64"] and col != y_name
    ]

    return categorical_ordinal_columns, categorical_nominal_columns, numerical_columns

# Classify the columns of the current `df` and print the three groups as a sanity check.
categorical_ordinal_columns, categorical_nominal_columns, numerical_columns = split_features(df)

print(categorical_ordinal_columns, categorical_nominal_columns, numerical_columns)

['CRIM'] ['CHAS'] ['ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [10]:
# Label encoding of the ordinal categorical columns

def label_encoding(df, columns, map):
    """Replace the labels of the given columns with their mapped codes, in place.

    Args:
        df: DataFrame whose columns are re-encoded. It is modified in place.
        columns: Names of the columns to encode.
        map: Mapping from original label to code, handed to ``Series.map``.

    Returns:
        None.
    """
    for name in columns:
        encoded = df[name].map(map)
        df[name] = encoded

In [11]:
def scale_data(df):
    """Standardize ``df`` column by column: subtract the mean, divide by the standard deviation.

    The mean and standard deviation are computed from ``df`` itself
    (``DataFrame.mean`` / ``DataFrame.std``).

    Args:
        df: Numeric DataFrame to standardize (a Series is accepted as well).
            It is not modified.

    Returns:
        A new object of the same shape holding ``(df - mean) / std``.
        Constant columns come back as all zeros instead of NaN.
    """
    centered = df - df.mean()
    spread = df.std()

    # A constant column has a standard deviation of exactly 0; dividing by it
    # would turn the whole column into NaN (0 / 0). Divide by 1 instead so the
    # centered values (all 0) are kept.
    if isinstance(spread, pd.Series):
        spread = spread.replace(0, 1)
    elif spread == 0:
        spread = 1

    return centered / spread

In [12]:
def one_hot_encoding(df):
    """One-hot encode every column of ``df``.

    Each column is replaced by one indicator column per distinct value,
    named ``<column>_<value>``.

    Args:
        df: DataFrame containing only the columns to encode. It is not modified.

    Returns:
        A new DataFrame with the indicator columns. If ``df`` has no columns,
        an empty copy with the same index is returned.
    """
    if df.shape[1] == 0:
        # Nothing to encode: return an empty frame so the result can still be concatenated.
        return df.copy()

    # Encode all columns in one call; returning from inside a per-column loop
    # would stop after the first column.
    return pd.get_dummies(df, columns=list(df.columns))

In [13]:
# Data transformation

def transform_data(df):
    """Impute, encode and standardize a feature frame.

    Steps, in order:
      1. fill missing values (``replace_nulls``);
      2. map the ordinal levels LOW / MODERATE / HIGH / VERY HIGH to 0-3;
      3. one-hot encode the nominal columns, appending the indicator columns
         and dropping the originals;
      4. standardize the numerical columns and the encoded ordinal columns.

    All statistics (fill values, means, standard deviations) are computed
    from the frame that is passed in.

    Args:
        df: Feature DataFrame. It is copied first, so the caller's frame is
            left untouched.

    Returns:
        A new, transformed DataFrame.
    """
    # Work on a private copy: replace_nulls and label_encoding write in place,
    # which would otherwise modify the caller's frame (or a slice of another frame).
    df = df.copy()

    replace_nulls(df)

    categorical_ordinal_columns, categorical_nominal_columns, numerical_columns = split_features(df)

    crime_levels = {"LOW":0, "MODERATE":1, "HIGH":2, "VERY HIGH":3}
    label_encoding(df, categorical_ordinal_columns, crime_levels)

    nominal_dummies = one_hot_encoding(df[categorical_nominal_columns])
    df = pd.concat([df, nominal_dummies], axis=1).drop(categorical_nominal_columns, axis=1)

    df[numerical_columns] = scale_data(df[numerical_columns])

    df[categorical_ordinal_columns] = scale_data(df[categorical_ordinal_columns])

    return df

In [14]:
# Train/test split

# Fixed seed so that the split, and every result derived from it, is the same on each run.
RANDOM_STATE = 42

X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(y_name, axis=1),
    df[y_name],
    test_size=0.3,
    random_state=RANDOM_STATE,
)

# NOTE(review): transform_data computes its fill values and scaling statistics
# from the frame it receives, so the test set is transformed with its own
# statistics rather than the training ones. Confirm this is intended.
X_train = transform_data(X_train)

X_test = transform_data(X_test)

X_train

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,CHAS_NO,CHAS_YES
92.0,-1.340391,0.789344,0.524071,-0.022465,0.437373,-0.572603,-0.289963,-0.660591,-0.844754,-0.120048,0.446617,-0.732271,True,False
19.0,0.429937,-0.468329,-0.474833,0.306409,0.094215,-0.001621,-0.289857,-0.660591,-0.627166,1.177049,0.406115,0.000906,True,False
51.0,-1.340391,0.474926,-0.836755,-0.133571,0.280432,-0.235041,-0.287407,-0.660591,-1.003534,-0.768596,0.436242,0.000906,True,False
389.0,1.315101,-0.468329,0.967063,-2.081479,-2.651824,1.054157,-0.291536,1.604366,1.484021,0.806450,0.465471,0.000906,True,False
377.0,1.315101,-0.468329,0.967063,0.897492,0.606312,1.050566,0.809458,1.604366,1.484021,0.806450,0.465471,0.000906,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159.0,0.429937,-0.468329,1.181321,1.786339,-2.651286,1.093659,-0.291506,-0.547343,-0.062615,-1.741418,0.140358,-0.859421,True,False
63.0,1.315101,-0.001194,-0.002887,0.000201,0.000002,-0.001621,0.000032,-0.003753,-0.762424,0.574826,0.452303,-0.510997,True,False
168.0,0.429937,-0.468329,1.181321,0.604173,0.378340,0.953607,-0.291235,-0.547343,-0.062615,-1.741418,-0.530218,-0.246789,True,False
388.0,1.315101,-0.468329,0.967063,-2.081479,-2.652069,1.093659,-0.291649,1.604366,1.484021,0.806450,0.226250,0.000906,True,False


In [15]:
# Creazione del modello