# Tarea de selección de variables

## Importamos las librerías necesarias

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
import math

## Cargamos los datos

In [5]:
np.random.seed(1)
wine_df = pd.read_csv('wine/winequality-white.csv', sep=';')
wine_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [7]:
def CFS(df, attributes, class_v):
    """Realiza el método CFS
    
        :param df: Dataframe de Pandas con el dataset a tratar
        :param attributes: Lista de los nombres de las columnas a tratar
        :param class_v: Nombre de la columna a tratar como clase
        :type df: <class 'pandas.core.frame.DataFrame'>
        :type attributes: str list
        :type class_v: str
        :return: Retorna el valor de calidad de selección los atributos con respecto a la clase
        :rtype: double

    """
    rca = df[attributes + [class_v]].corr().abs()[class_v].iloc[:-1].mean()
    raa = df[attributes].corr().abs()[attributes].mean().mean()
    k = len(attributes)
    return (k*rca)/math.sqrt(k+(k*(k-1)*raa))

In [19]:
def MIFS(df, attributes, class_v, beta=1):
    """Realiza el método MIFS
    
        :param df: Dataframe de Pandas con el dataset a tratar
        :param attributes: Lista de los nombres de las columnas a tratar
        :param class_v: Nombre de la columna a tratar como clase
        :type df: <class 'pandas.core.frame.DataFrame'>
        :type attributes: str list
        :type class_v: str
        :return: Retorna el valor de calidad de selección los atributos con respecto a la clase
        :rtype: double

    """
    df2 = df.copy()
    I = mutual_info_classif(df2[attributes], df2[class_v].astype(str))[-1]
    if len(attributes) == 1:
        return I
    return I - beta * np.sum(mutual_info_regression(df2.loc[:, attributes[:-1]], df2.loc[:, attributes[-1]]))


In [20]:
def SFS(dataframe, features=4, metric='cfs'):
    """Realiza el método de búsqueda SFS
    
        :param dataframe: Dataframe de Pandas con el dataset a tratar
        :param features: Número de variables que se desean obtener
        :param metric: Nombre de la métrica a usar para la evaluación de calidad
        :type dataframe: <class 'pandas.core.frame.DataFrame'>
        :type features: int
        :type metric: str ('cfs' o 'mifs')
        :return: Retorna una lista con los nombres de las variables seleccionadas
        :rtype: str list

    """
    df = dataframe.copy()
    xk = []
    k = 0

    while k < features:
        k = k+1
        values = []
        unexplored_set = np.setdiff1d(df.columns[:-1], xk)
        for x in unexplored_set:
            criterion = CFS(df, xk + [x], 'quality') if metric == 'cfs' else MIFS(df, xk + [x], 'quality')
            values.append(criterion)

        xk.append(unexplored_set[values.index(max(values))])
    
    print(f'End SFS with {features} variables')
    return xk

In [21]:
cfs_results = {}
for i in range(1, wine_df.shape[1]-1):
    cfs_results[i] = SFS(wine_df, i, "cfs")

End SFS with 1 variables
End SFS with 2 variables
End SFS with 3 variables
End SFS with 4 variables
End SFS with 5 variables
End SFS with 6 variables
End SFS with 7 variables
End SFS with 8 variables
End SFS with 9 variables
End SFS with 10 variables


In [22]:
mifs_results = {}
for i in range(1, wine_df.shape[1]-1):
    mifs_results[i] = SFS(wine_df, i, "mifs")

End SFS with 1 variables
End SFS with 2 variables
End SFS with 3 variables
End SFS with 4 variables
End SFS with 5 variables
End SFS with 6 variables
End SFS with 7 variables
End SFS with 8 variables
End SFS with 9 variables
End SFS with 10 variables


In [23]:
for i in range(1, wine_df.shape[1]-1):
    print(f'Nº de variables:\n{i}\n')
    print(f'Resultado en CFS:\n{cfs_results[i]}\n')
    print(f'Resultado en MIFS:\n{mifs_results[i]}')
    print('----------------------------------------------------------------------------------------------------------')

Nº de variables:
1

Resultado en CFS:
['alcohol']

Resultado en MIFS:
['density']
----------------------------------------------------------------------------------------------------------
Nº de variables:
2

Resultado en CFS:
['alcohol', 'density']

Resultado en MIFS:
['density', 'volatile acidity']
----------------------------------------------------------------------------------------------------------
Nº de variables:
3

Resultado en CFS:
['alcohol', 'density', 'volatile acidity']

Resultado en MIFS:
['density', 'volatile acidity', 'fixed acidity']
----------------------------------------------------------------------------------------------------------
Nº de variables:
4

Resultado en CFS:
['alcohol', 'density', 'volatile acidity', 'chlorides']

Resultado en MIFS:
['density', 'volatile acidity', 'fixed acidity', 'sulphates']
----------------------------------------------------------------------------------------------------------
Nº de variables:
5

Resultado en CFS:
['alcohol', '