#Preprocessing

## Gestione delle dipendenze

Import Pandas per il preprocessing. Eventualmente utilizzabile SKLearn (scikit-learn).

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder

## Import dei contenuti in Py (pandas.DataFrame)

In [2]:
CSV = "https://raw.githubusercontent.com/ProfAI/machine-learning-fondamenti/main/datasets/housing_dirty.csv"

df = pd.read_csv(CSV, index_col=0)

In [3]:
df.shape

(506, 14)

In [4]:
for column in df.columns:
    print("\n")
    if(df[column].dtype == "object"):
        print(df[column].value_counts())
    



CRIM
HIGH         130
LOW          127
VERY HIGH    127
MODERATE     122
Name: count, dtype: int64






CHAS
NO     471
YES     35
Name: count, dtype: int64






















In [5]:
print(df.isna().sum())

CRIM         0
ZN           2
INDUS        3
CHAS         0
NOX          7
RM           5
AGE          4
DIS          5
RAD          3
TAX          2
PTRATIO      5
B            3
LSTAT      199
PRICE        4
dtype: int64


In [6]:
#remove columns with isna().sum() > 30%

df = df.dropna(axis=1, thresh=df.shape[0]*0.7) #Almeno il 70% dei valori delle colonne deve essere non nulli

df = df.dropna(thresh=df.shape[1]*0.75) #Almeno il 75% dei valori delle righe deve essere non nulli

df = df.dropna(subset=["PRICE"]) #Rimuoviamo le righe con valori nulli nella colonna PRICE


print(df.isna().sum())

CRIM       0
ZN         0
INDUS      1
CHAS       0
NOX        4
RM         0
AGE        0
DIS        1
RAD        0
TAX        0
PTRATIO    3
B          1
PRICE      0
dtype: int64


In [7]:
for column in df.columns: # Verifica se ci sono colonne con valori nulli e sostituisci i valori nulli con la moda o la media
    if(column == "PRICE"):
        continue
    if df[column].dtype == "object": #verifica tipo di dato
        replace_with = df[column].mode()[0]
        df[column] = df[column].fillna(replace_with)
    else:
        replace_with = round(df[column].mean(), 1) #arrotondamento scelto in base ad altri valori del dataset
        df[column] = df[column].fillna(replace_with)

print(df.isna().sum())


CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
PRICE      0
dtype: int64


In [8]:
#Column CRIM -> ordering #LabelEncoding

mapper = {"LOW":0, "MODERATE":1, "HIGH":2, "VERY HIGH":3}

fmap = np.vectorize(lambda x: mapper[x])

df["CRIM"] = fmap(df["CRIM"])

In [9]:
#trasformazione con ColumnTransformer:

from sklearn.compose import ColumnTransformer #per applicare le trasformazioni alle colonne
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler #per normalizzare i dati
from sklearn.preprocessing import LabelEncoder #per trasformare le variabili categoriche in numeriche

column_transformer = ColumnTransformer(
    [
        ("onehot", OneHotEncoder(), ["CHAS"]),
        ("scaler", StandardScaler(), ["ZN", "INDUS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B"]) #non applicare sulla variabile da prevedere, da applicare solo sulle variabili continue
    ],
    remainder="passthrough"
)

df_test = column_transformer.fit_transform(df)

df.shape


(497, 13)

In [None]:
# Determina il numero di colonne generate

onehot_columns = column_transformer.named_transformers_['onehot'].get_feature_names_out(["CHAS"])
scaler_columns = ["ZN", "INDUS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B"]
all_columns = list(onehot_columns) + ["CRIM"] + scaler_columns + ["PRICE"] 

print(all_columns)

['CHAS_NO', 'CHAS_YES', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'PRICE']


In [11]:
df_new = pd.DataFrame(df_test, columns=all_columns)

df_new.head(15)

ValueError: Shape of passed values is (497, 14), indices imply (497, 13)