# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from  sklearn  import  set_config
set_config(display='diagram')

# Custom functions

In [3]:
def save_dataset(nome_file, file):
    with open(nome_file +'.pkl', 'wb') as f:
        pickle.dump(file, f)
    return

In [4]:
def dataset_parameters(df):
    data_type_numerical = ['int64', 'float64']
    data_type_object = ['object', 'bool', 'category']
    data_type_date = ['datetime64']
    categorical_features  = list(filter(lambda x: df[x].dtype in data_type_object, df.columns))
    numerical_features = list(filter(lambda x: df[x].dtype in data_type_numerical, df.columns))
    date_features = list(filter(lambda x: df[x].dtype in data_type_date, df.columns))
    all_features  = numerical_features  + categorical_features + date_features 
    return categorical_features, numerical_features, date_features, all_features

In [5]:
def numeric_summary_parameters(df, column_names):
    result = {}
    for col in column_names:
        min_value = df[col].min()
        max_value = df[col].max()
        mean_value = df[col].mean()
        result[col] = [min_value, max_value, mean_value]
    return result

In [6]:
def dataframe_with_null(df):
    percentuale = round(df[df.isnull().any(axis=1)].shape[0]/df.shape[0]*100, 2)
    print('The dataset has {}% of records with at least one NaN value'.format(percentuale))

In [7]:
def unique_values_dataframe(df, categorical_features):
    result = {}
    for col in categorical_features:
        unique_values = df[col].unique()
        result[col] = unique_values.tolist()        
    unique_df = pd.DataFrame.from_dict(result, orient='index')
    unique_df = unique_df.transpose()
    return unique_df   

# Import dataset and visualize properties

In [8]:
df = pd.read_csv('wine.csv')

In [9]:
df.sample(10, random_state=13)

Unnamed: 0,Alcohol,Malic_Acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280/OD315_of_diluted_wines,Proline
145,13.16,3.57,2.15,21.0,102.0,1.5,0.55,0.43,1.3,4.0,0.6,1.68,830.0
7,14.06,2.15,2.61,17.6,121.0,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295.0
150,13.5,3.12,2.62,24.0,123.0,1.4,1.57,0.22,1.25,8.6,0.59,1.3,500.0
110,11.46,3.74,1.82,19.5,107.0,3.18,2.58,0.24,3.58,2.9,0.75,2.81,562.0
104,12.51,1.73,1.98,20.5,85.0,2.2,1.92,0.32,1.48,2.94,1.04,3.57,672.0
87,11.65,1.67,2.62,26.0,88.0,1.92,1.61,0.4,1.34,2.6,1.36,3.21,562.0
159,13.48,1.67,2.64,22.5,89.0,2.6,1.1,0.52,2.29,11.75,0.57,1.78,620.0
44,13.05,1.77,2.1,17.0,107.0,3.0,3.0,0.28,2.03,5.04,0.88,3.35,885.0
89,12.08,1.33,2.3,23.6,70.0,2.2,1.59,0.42,1.38,1.74,1.07,3.21,625.0
64,12.17,1.45,2.53,19.0,104.0,1.89,1.75,0.45,1.03,2.95,1.45,2.23,355.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Alcohol                       178 non-null    float64
 1   Malic_Acid                    178 non-null    float64
 2   Ash                           178 non-null    float64
 3   Alcalinity_of_ash             178 non-null    float64
 4   Magnesium                     178 non-null    float64
 5   Total_phenols                 178 non-null    float64
 6   Flavanoids                    178 non-null    float64
 7   Nonflavanoid_phenols          178 non-null    float64
 8   Proanthocyanins               178 non-null    float64
 9   Color_intensity               178 non-null    float64
 10  Hue                           178 non-null    float64
 11  OD280/OD315_of_diluted_wines  178 non-null    float64
 12  Proline                       178 non-null    float64
dtypes: fl

# Remove unnecessary records

In [11]:
categorical_features, numerical_features, date_features, all_features = dataset_parameters(df)

In [12]:
dataframe_with_null(df)

The dataset has 0.0% of records with at least one NaN value


In [13]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Alcohol,Malic_Acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280/OD315_of_diluted_wines,Proline


In [14]:
#df = df.dropna()

In [15]:
unique_values_dataframe(df, categorical_features).fillna('')

In [16]:
df.describe()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280/OD315_of_diluted_wines,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


# Save DataFrame

In [17]:
save_dataset('wine_df', df)

# Transform  categorical and numerical  features

In [18]:
transformers = [    
    ('scale', StandardScaler(), numerical_features)
]

In [19]:
ct = ColumnTransformer(transformers)

In [20]:
ct

In [21]:
df_transformed = ct.fit_transform(df)

# Save X and y dataset

In [22]:
dataset_X = df_transformed

In [23]:
save_dataset('wine_dataset', dataset_X)

In [24]:
dataset_X

array([[ 1.51861254, -0.5622498 ,  0.23205254, ...,  0.36217728,
         1.84791957,  1.01300893],
       [ 0.24628963, -0.49941338, -0.82799632, ...,  0.40605066,
         1.1134493 ,  0.96524152],
       [ 0.19687903,  0.02123125,  1.10933436, ...,  0.31830389,
         0.78858745,  1.39514818],
       ...,
       [ 0.33275817,  1.74474449, -0.38935541, ..., -1.61212515,
        -1.48544548,  0.28057537],
       [ 0.20923168,  0.22769377,  0.01273209, ..., -1.56825176,
        -1.40069891,  0.29649784],
       [ 1.39508604,  1.58316512,  1.36520822, ..., -1.52437837,
        -1.42894777, -0.59516041]])

# Save processed DataFrame

In [25]:
df_processed = pd.DataFrame(ct.fit_transform(df), columns=ct.get_feature_names_out())

In [26]:
df_processed.columns = [col.split('__')[1] for col in df_processed.columns]

In [27]:
save_dataset('wine_df_processed', df_processed)

In [28]:
df_processed.sample(10)

Unnamed: 0,Alcohol,Malic_Acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280/OD315_of_diluted_wines,Proline
110,-1.903071,1.260006,-1.997705,0.001518,0.509638,1.417883,0.552915,-0.981875,3.485073,-0.933525,-0.910151,0.280108,-0.588791
39,1.50626,1.484422,0.52448,-1.890277,1.984119,1.129464,1.01474,-1.304188,0.856946,0.018129,-0.295924,1.297067,0.041738
151,-0.260169,0.299507,0.41482,0.752231,0.860705,-1.30608,-0.671924,-0.981875,-0.579763,2.483778,-2.094732,-1.612565,-0.84992
111,-0.593691,0.084068,-0.718336,0.451946,-0.824415,0.408415,0.241685,-0.820719,-0.649846,-1.322838,-0.25205,0.237735,-1.343516
108,-0.96427,-0.939268,-1.559065,-0.148624,-0.543562,0.103972,0.010773,0.226796,0.856946,-1.020039,-0.427544,0.576721,-1.384915
147,-0.161348,2.040973,0.41482,0.602088,-0.964842,-0.953567,-1.384741,0.87142,-1.280596,1.121183,-1.831492,-1.061713,-0.388168
164,0.962743,0.380297,-0.243142,0.752231,-0.683988,-1.514383,-1.354622,0.387952,-0.982742,1.956043,-1.129518,-1.315952,-0.420013
86,-1.038386,-0.652016,-0.206588,0.992459,-0.683988,-0.825381,-0.340615,0.549108,-0.054137,-1.128181,1.634506,-0.496736,-0.802153
61,-0.445459,-0.876432,-1.266637,-0.809251,0.018145,-0.440821,-0.621726,1.354888,-1.701097,0.2993,0.098937,-1.443072,-0.945455
131,-0.148995,0.586759,0.122392,0.151661,0.298998,-1.5945,-0.81248,-0.981875,-1.333159,0.1479,-0.954024,-1.683187,-0.690695
