# ML per previsione di dati immobiliari

## Dipendenze

In [None]:
# dipendenze
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer #per applicare le trasformazioni alle colonne
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler #per normalizzare i dati
from sklearn.preprocessing import LabelEncoder #per trasformare le variabili categoriche in numeriche

## Data Preprocessing

In [None]:
# Location of the housing_dirty.csv dataset
CSV = "https://raw.githubusercontent.com/ProfAI/machine-learning-fondamenti/main/datasets/housing_dirty.csv"

# Load the file, using its first column as the DataFrame index
df = pd.read_csv(filepath_or_buffer=CSV, index_col=0)


In [None]:
# Name of the target column
y_name = "PRICE"

# Feature columns: every column except the target
x_names = df.columns.drop(y_name)

In [None]:
# Check the data types: print the value distribution of every object-typed column

for column in df.columns:
    if df[column].dtype != "object":
        continue
    print(df[column].value_counts())

In [None]:
# Check for missing data: number of missing entries in each column

df.isna().sum(axis=0)

In [None]:
# Remove the rows whose target value is missing

df = df.dropna(subset=[y_name])

Rimozione di colonne e righe con valori mancanti > 50%

In [None]:
# Drop the columns, then the rows, in which more than 50% of the values are missing.
# `thresh` is the minimum number of non-NA values required to keep a column/row and
# is documented as an int, so the 50% cut-off is computed with ceiling integer
# division instead of passing a float (count >= n * 0.5 <=> count >= ceil(n / 2)).

min_non_na_per_column = (df.shape[0] + 1) // 2
df = df.dropna(axis=1, thresh=min_non_na_per_column)

# Computed after the column drop so that it reflects the remaining column count
min_non_na_per_row = (df.shape[1] + 1) // 2
df = df.dropna(thresh=min_non_na_per_row)

df.isna().sum()

## Definizione delle pipelines di trasformazione

In [None]:
# Fill missing feature values: mode for non-numeric columns, rounded mean for numeric ones

def replace_nulls(df, target_column=None):
    """Fill the missing values of every feature column of ``df`` in place.

    Non-numeric columns are filled with their most frequent value and
    numeric columns with their mean rounded to one decimal. The target
    column is left untouched, and so are columns that have no missing
    value or no observed value to derive a replacement from.

    Args:
        df: DataFrame to clean; it is modified in place.
        target_column: Name of the column to skip. Defaults to the
            module-level ``y_name``.

    Returns:
        None.
    """
    target = y_name if target_column is None else target_column

    for column in df.columns:
        if column == target:
            continue

        missing = df[column].isna()
        # Nothing to fill, or nothing to compute a replacement from
        # (``mode()`` would be empty and ``mode()[0]`` would raise).
        if not missing.any() or missing.all():
            continue

        # Decide on "is numeric" rather than "is object" so that string and
        # categorical dtypes are filled with the mode instead of reaching mean().
        if pd.api.types.is_numeric_dtype(df[column]):
            # One-decimal rounding chosen to match the other values in the dataset
            replace_with = round(df[column].mean(), 1)
        else:
            replace_with = df[column].mode()[0]

        df[column] = df[column].fillna(replace_with)

In [None]:
# Split the features into ordinal-categorical, nominal-categorical and numerical columns

def split_features(df, target_column=None):
    """Classify the feature columns of ``df`` by how they are to be encoded.

    Args:
        df: DataFrame whose columns are classified.
        target_column: Name of the target column, which is never reported
            as a feature. Defaults to the module-level ``y_name``.

    Returns:
        A tuple ``(categorical_ordinal_columns, categorical_nominal_columns,
        numerical_columns)`` of lists of column names.
    """
    target = y_name if target_column is None else target_column

    # Manually declared categorical features. Each one is checked on its own
    # so that a column is still classified when the other one is absent.
    categorical_ordinal_columns = [col for col in ["CRIM"] if col in df.columns]  # label encoding via .map()
    categorical_nominal_columns = [col for col in ["CHAS"] if col in df.columns]  # one-hot encoding
    declared_categorical = set(categorical_ordinal_columns + categorical_nominal_columns)

    # Any numeric dtype counts (not only int64/float64); booleans, the target
    # and the declared categorical columns are excluded so that no column is
    # reported in two groups.
    numerical_columns = [
        col
        for col in df.columns
        if col != target
        and col not in declared_categorical
        and pd.api.types.is_numeric_dtype(df[col])
        and not pd.api.types.is_bool_dtype(df[col])
    ]

    return categorical_ordinal_columns, categorical_nominal_columns, numerical_columns

# Classify the columns of the loaded dataset and show the three resulting groups
(
    categorical_ordinal_columns,
    categorical_nominal_columns,
    numerical_columns,
) = split_features(df)

print(categorical_ordinal_columns, categorical_nominal_columns, numerical_columns)

In [None]:
# Label encoding

def label_encoding(df, columns, map):
    """Replace the values of the given columns of ``df`` in place using ``map``.

    Args:
        df: DataFrame to modify in place.
        columns: Names of the columns to encode.
        map: Mapping passed to ``Series.map`` for every listed column.

    Returns:
        None.
    """
    value_mapping = map
    for name in columns:
        encoded = df[name].map(value_mapping)
        df[name] = encoded

In [None]:
# Handling of the nominal (unordered) categorical variables: one-hot encoding

categorical_transformer = Pipeline([("ohe", OneHotEncoder())])

In [None]:
# Handling of the numerical variables: standard scaling

numerical_transformer = Pipeline([("scaler", StandardScaler())])

In [None]:
# Applying the transformations: the nominal columns go through the categorical
# transformer, every other column is passed through

preprocessor_cat = ColumnTransformer(
    [("cat", categorical_transformer, categorical_nominal_columns)],
    remainder="passthrough",
)

In [None]:
# The numerical and the ordinal columns go through the numerical transformer,
# every other column is passed through
preprocessor_num = ColumnTransformer(
    [("num", numerical_transformer, [*numerical_columns, *categorical_ordinal_columns])],
    remainder="passthrough",
)

In [None]:
# Data transformation

def transform_data(preprocessors, df):
    """Clean ``df`` and run it through the given preprocessors in order.

    The frame is modified in place: missing feature values are filled and
    the ordinal categorical columns are label-encoded. Each preprocessor
    then receives the output of the previous one.

    Args:
        preprocessors: Iterable of objects exposing ``transform(X)``, applied
            in the given order.
            NOTE(review): presumably already fitted by the caller - confirm.
        df: Feature DataFrame to transform; it is mutated in place.

    Returns:
        The output of the last preprocessor, or the cleaned ``df`` itself
        when ``preprocessors`` is empty.
    """
    replace_nulls(df)

    categorical_ordinal_columns, _, _ = split_features(df)

    # Ordered scale used to label-encode the ordinal columns
    ordinal_scale = {"LOW": 0, "MODERATE": 1, "HIGH": 2, "VERY HIGH": 3}
    label_encoding(df, categorical_ordinal_columns, ordinal_scale)

    # Start from the cleaned frame and feed every step the previous step's
    # output; transform() needs the data as its argument.
    X = df
    for preprocessor in preprocessors:
        X = preprocessor.transform(X)

    return X

In [None]:
# Train/test split: 30% of the rows are held out as the test set.
# NOTE(review): no random_state is passed, so the split presumably differs between runs - confirm whether reproducibility is wanted.

X_train, X_test, Y_train, Y_test = train_test_split(df.drop(y_name, axis=1), df[y_name], test_size=0.3)

# Fit the categorical preprocessor on the training features only
preprocessor_cat.fit(X_train)

# Apply the transformations of preprocessor_cat before those of preprocessor_num

X_train = transform_data([preprocessor_cat, preprocessor_num], X_train)


# NOTE(review): preprocessor_num is passed to transform_data above while its fit call below is still commented out - confirm the intended order.
# preprocessor_num.fit(X_train)

# X_test = transform_data([preprocessor_cat, preprocessor_num], X_test)

# Show the transformed training features as the cell output
X_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
453.0,VERY HIGH,0.0,18.10,NO,713.00,7393.00,99.3,2.4527,24.0,666.0,20.2,375.87,
176.0,LOW,0.0,4.05,NO,0.51,6.02,47.2,3.5549,5.0,296.0,16.6,393.23,10.11
411.0,VERY HIGH,0.0,18.10,NO,597.00,6657.00,100.0,1.5275,24.0,666.0,20.2,35.05,21.22
162.0,HIGH,0.0,19.58,YES,605.00,7802.00,98.2,2.0407,5.0,403.0,14.7,389.61,1.92
469.0,VERY HIGH,0.0,18.10,NO,0.58,5713.00,56.7,2.8237,24.0,666.0,20.2,396.90,14.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51.0,LOW,21.0,5.64,NO,439.00,6115.00,63.0,6.8147,4.0,243.0,16.8,393.97,
491.0,MODERATE,0.0,27.74,NO,609.00,5983.00,98.8,1.8681,4.0,711.0,20.1,390.11,
183.0,MODERATE,0.0,2.46,NO,488.00,6563.00,95.6,2847.0000,3.0,193.0,17.8,396.90,5.68
85.0,LOW,0.0,4.49,NO,449.00,6.63,56.1,4.4377,3.0,247.0,18.5,392.30,6.53
