# Preprocessing

## Gestione delle dipendenze

Importiamo pandas e NumPy per il preprocessing; da scikit-learn (`sklearn`) importiamo gli scaler e gli encoder utilizzabili nelle trasformazioni.

In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder

## Caricamento dei dati in un `pandas.DataFrame`

In [4]:
# Location of the raw ("dirty") housing dataset.
CSV = "https://raw.githubusercontent.com/ProfAI/machine-learning-fondamenti/main/datasets/housing_dirty.csv"

# Read the file, using its first column as the row index.
df = pd.read_csv(filepath_or_buffer=CSV, index_col=0)

In [5]:
# Size of the freshly loaded frame as (rows, columns).
df.shape

(506, 14)

In [8]:
# Show the category frequencies of every non-numeric column.
# The blank-line separator is printed only for columns that are actually
# reported, so numeric columns no longer add empty lines to the output.
for column in df.columns:
    # is_numeric_dtype (rather than comparing against "object") also treats
    # pandas' dedicated string dtype as categorical.
    if not pd.api.types.is_numeric_dtype(df[column]):
        print("\n")
        print(df[column].value_counts())
    



CRIM
HIGH         130
LOW          127
VERY HIGH    127
MODERATE     122
Name: count, dtype: int64






CHAS
NO     471
YES     35
Name: count, dtype: int64






















In [8]:
# Number of missing values in each column of the raw frame.
missing_per_column = df.isna().sum()
print(missing_per_column)

CRIM         0
ZN           2
INDUS        3
CHAS         0
NOX          7
RM           5
AGE          4
DIS          5
RAD          3
TAX          2
PTRATIO      5
B            3
LSTAT      199
PRICE        4
dtype: int64


In [9]:
# Drop data that is too incomplete to impute, in three steps:
#   1. columns with fewer than 70% non-null values,
#   2. rows with fewer than 75% non-null values (over the remaining columns),
#   3. rows whose target PRICE is missing.
# NOTE: the cell reassigns `df`, so re-running it works on the already
# reduced frame; restart from the loading cell for a clean run.
MIN_COLUMN_FILL = 0.70  # minimum share of non-null values for a column to be kept
MIN_ROW_FILL = 0.75     # minimum share of non-null values for a row to be kept

# `thresh` is documented as an integer count of required non-null values.
# Rounding the fractional threshold up keeps exactly the same rows/columns
# as comparing the (integer) counts against the raw float.
min_non_null_per_column = int(np.ceil(df.shape[0] * MIN_COLUMN_FILL))
df = df.dropna(axis=1, thresh=min_non_null_per_column)

# Computed after the column drop, so it refers to the surviving columns.
min_non_null_per_row = int(np.ceil(df.shape[1] * MIN_ROW_FILL))
df = df.dropna(thresh=min_non_null_per_row)

# The target cannot be imputed: discard rows where PRICE is missing.
df = df.dropna(subset=["PRICE"])


print(df.isna().sum())

CRIM       0
ZN         0
INDUS      1
CHAS       0
NOX        4
RM         0
AGE        0
DIS        1
RAD        0
TAX        0
PTRATIO    3
B          1
PRICE      0
dtype: int64


In [10]:
# Impute the remaining missing values column by column:
#   - numeric columns     -> column mean, rounded to one decimal
#   - non-numeric columns -> most frequent value (mode)
# The target PRICE is skipped and never imputed.
for column in df.columns:
    if column == "PRICE":
        continue
    if pd.api.types.is_numeric_dtype(df[column]):
        replace_with = round(df[column].mean(), 1)
    else:
        # mode() returns a Series; its first entry is the most frequent value.
        replace_with = df[column].mode()[0]
    # Must run for BOTH branches: previously this line sat inside the numeric
    # branch, so the mode computed for categorical columns was never applied.
    df[column] = df[column].fillna(replace_with)

print(df.isna().sum())


CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
PRICE      0
dtype: int64


In [12]:
# Ordinal encoding of CRIM: replace each category label with an integer
# that preserves the natural order LOW < MODERATE < HIGH < VERY HIGH.
mapper = {"LOW": 0, "MODERATE": 1, "HIGH": 2, "VERY HIGH": 3}

# Skip the encoding when CRIM is already numeric, so re-running the cell is
# a no-op instead of failing with KeyError on the already-encoded integers.
if not pd.api.types.is_numeric_dtype(df["CRIM"]):
    # Fail loudly on labels missing from the mapping; Series.map alone would
    # silently turn them into NaN.
    unknown = set(df["CRIM"].unique()) - set(mapper)
    if unknown:
        raise KeyError(f"Unexpected CRIM categories: {sorted(unknown, key=str)}")

    # Series.map is the vectorised pandas idiom for dict lookups; it keeps
    # the index and also works on an empty column.
    df["CRIM"] = df["CRIM"].map(mapper)

In [13]:
# Final transformation with ColumnTransformer, then export to CSV.
# ColumnTransformer, OneHotEncoder and MinMaxScaler come from the import cell
# at the top of the notebook.

# Numeric features rescaled to [0, 1]. The target PRICE is deliberately left
# out: the variable to predict must not be scaled here.
scaled_columns = ["ZN", "INDUS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B"]

column_transformer = ColumnTransformer(
    [
        # NOTE(review): CHAS only has the values NO/YES, so the two one-hot
        # columns are complementary; consider OneHotEncoder(drop="if_binary")
        # if a single indicator column is preferred.
        ("onehot", OneHotEncoder(), ["CHAS"]),
        ("scaler", MinMaxScaler(), scaled_columns),
    ],
    remainder="passthrough",          # columns not listed above are appended unchanged
    sparse_threshold=0,               # always return a dense array
    verbose_feature_names_out=False,  # plain names ("ZN"), not "scaler__ZN"
)

transformed = column_transformer.fit_transform(df)

# Rebuild the frame with the transformer's output column names and the
# original row index. Wrapping the bare array would label the columns 0..N
# and hide that the transformer reorders them (one-hot, scaled, remainder).
df = pd.DataFrame(
    transformed,
    columns=column_transformer.get_feature_names_out(),
    index=df.index,
)

df.to_csv("housing_clean.csv")