# Notebook para testear las funciones de la librería `toolbox_ML`.

## Importar librerías

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.datasets import load_diabetes, load_iris

import toolbox_ML as tb

## Cargar datasets

In [2]:
# Non-DataFrame value, used below to exercise the input validation of the toolbox functions
variable_list = [1, 2, 3]

# Load the local CSV datasets (the Boston file is pipe-separated)
df_ads = pd.read_csv('./data/advertising_ml.csv', index_col=None)
df_boston = pd.read_csv('./data/boston_houses.csv', sep='|')
df_housing = pd.read_csv('./data/ejemplo_housing.csv')
df_lifesat = pd.read_csv('./data/lifesat_full.csv')
df_titanic = pd.read_csv('./data/titanic.csv')

# Build DataFrames from the sklearn.datasets loaders.
# Each loader is called once and its result reused for both the data matrix and the
# column names, instead of loading the same dataset twice per DataFrame.
iris_bunch = load_iris()
df_iris = pd.DataFrame(iris_bunch['data'], columns=iris_bunch['feature_names'])

diabetes_bunch = load_diabetes()
df_diabetes = pd.DataFrame(diabetes_bunch['data'], columns=diabetes_bunch['feature_names'])

## Función "describe_df".

In [3]:
# Reference run of describe_df on the Titanic data: the output below reports, per column,
# the dtype, the percentage of missing values, the number of unique values and the cardinality (%).
tb.describe_df(df_titanic)

COL_N,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
DATA_TYPE,int64,int64,object,float64,int64,int64,float64,object,object,object,bool,object,object,object,bool
MISSING (%),0.0,0.0,0.0,19.87,0.0,0.0,0.0,0.22,0.0,0.0,0.0,77.22,0.22,0.0,0.0
UNIQUE_VALUES,2,3,2,88,7,7,248,3,3,3,2,7,3,2,2
CARDIN (%),0.22,0.34,0.22,9.88,0.79,0.79,27.83,0.34,0.34,0.34,0.22,0.79,0.34,0.22,0.22


In [4]:
# Does NOT count the missing values correctly (they come out DOUBLED), nor the unique values.
# Cross-checked against "get_cardinality()".
tb.describe_df_JUANMA(df_titanic)

COL_N,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
DATA_TYPE,int64,int64,object,float64,int64,int64,float64,object,object,object,bool,object,object,object,bool
MISSINGS (%),0.0,0.0,0.0,39.73,0.0,0.0,0.0,0.45,0.0,0.0,0.0,154.43,0.45,0.0,0.0
UNIQUE_VALUES,2,3,2,89,7,7,248,4,3,3,2,8,4,2,2
CARDIN (%),0.22,0.34,0.22,9.99,0.79,0.79,27.83,0.45,0.34,0.34,0.22,0.9,0.45,0.22,0.22


In [5]:
# Cardinality report for the Titanic data, transposed (.T) so that each dataset column
# becomes a column of the report; used as the reference for the two describe_df variants above.
tb.get_cardinality(df_titanic).T

pandas.DataFrame shape:  (891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
Card,2,3,2,88,7,7,248,3,3,3,2,7,3,2,2
%_Card,0.224467,0.3367,0.224467,9.876543,0.785634,0.785634,27.833895,0.3367,0.3367,0.3367,0.224467,0.785634,0.3367,0.224467,0.224467
NaN_Values,0,0,0,177,0,0,0,2,0,0,0,688,2,0,0
Type,int64,int64,object,float64,int64,int64,float64,object,object,object,bool,object,object,object,bool
Class,Binary,Categoric,Binary,Numeric - Discrete,Categoric,Categoric,Numeric - Discrete,Categoric,Categoric,Categoric,Binary,Categoric,Categoric,Binary,Binary


In [6]:
# Same summary on the iris features: every column is float64 and has no missing values (see output).
tb.describe_df(df_iris)

COL_N,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
DATA_TYPE,float64,float64,float64,float64
MISSING (%),0.0,0.0,0.0,0.0
UNIQUE_VALUES,35,23,43,22
CARDIN (%),23.33,15.33,28.67,14.67


In [7]:
# variable_list is a plain list, so describe_df is expected to reject it with a TypeError.
# The expected exception is caught and printed so that "Restart & Run All" does not stop here.
try:
    tb.describe_df(variable_list)
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Expected a pandas DataFrame

## Función "typify_variables".

In [None]:
# Suggest a variable type for every column of df_boston. No thresholds are passed here,
# so whatever defaults typify_variables defines are the ones applied.
tb.typify_variables(df_boston)

Unnamed: 0,nombre_variable,tipo_sugerido
0,CRIM,Numérica Continua
1,ZN,Numérica Discreta
2,INDUS,Numérica Discreta
3,CHAS,Binaria
4,NOX,Numérica Discreta
5,RM,Numérica Continua
6,AGE,Numérica Continua
7,DIS,Numérica Continua
8,RAD,Categórica
9,TAX,Numérica Discreta


In [None]:
# JUANMA variant of the same function, for comparison with the output of the previous cell.
# NOTE(review): 10 and 30 are presumably the category / continuous thresholds -- confirm against its signature.
tb.typify_variables_JUANMA(df_boston, 10, 30)

Unnamed: 0_level_0,TIPO_SUGERIDO
COL_N,Unnamed: 1_level_1
CRIM,NUMERICA CONTINUA
ZN,NUMERICA DISCRETA
INDUS,NUMERICA CONTINUA
CHAS,BINARIO
NOX,NUMERICA CONTINUA
RM,NUMERICA CONTINUA
AGE,NUMERICA CONTINUA
DIS,NUMERICA CONTINUA
RAD,CATEGORICA
TAX,NUMERICA CONTINUA


In [None]:
# Deliberately invalid thresholds (a float for umbral_categoria, an int for umbral_continua):
# typify_variables is expected to reject them with a TypeError.
# The expected exception is caught and printed so that "Restart & Run All" does not stop here.
try:
    tb.typify_variables(df_boston, umbral_categoria=12.6, umbral_continua=30)
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Thresholds must be an integer and a float, respectively

In [None]:
# Passing a plain list instead of a DataFrame: typify_variables is expected to raise a TypeError.
# The argument was previously misspelled as `varible_list`, which on a fresh kernel raises
# NameError before the function is ever called, so the validation was not actually exercised.
# The expected exception is caught and printed so that "Restart & Run All" does not stop here.
try:
    tb.typify_variables(variable_list, umbral_categoria=12.6, umbral_continua=30)
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Expected a pandas DataFrame

## Función "get_features_num_regression".

In [None]:
# Passing a plain list instead of a DataFrame: the output below shows the error message
# the function reports for an invalid DataFrame argument.
tb.get_features_num_regression(variable_list, 'survived', umbral_corr=0.07, pvalue=0.40)

Error: No se ha introducido un DataFrame válido.


In [None]:
# Titanic survival is a classification example, so the target column has low cardinality
# and the function is expected to reject it as a regression target (see the message below).
tb.get_features_num_regression(df_titanic, 'survived', umbral_corr=0.07, pvalue=0.40)

Error: survived es una columna discreta con baja cardinalidad.


In [None]:
# Valid numeric target: the output below is the list of feature names the function
# returns for umbral_corr=0.07 and pvalue=0.40.
tb.get_features_num_regression(df_housing, 'median_house_value', umbral_corr=0.07, pvalue=0.40)

['housing_median_age', 'total_rooms', 'median_income']

In [None]:
# LUIS variant of the selector, called with the same arguments as the
# get_features_num_regression cell above so that both outputs can be compared.
tb.get_features_num_regression_LUIS(
    df_housing,
    'median_house_value',
    umbral_corr=0.07,
    pvalue=0.40,
)

['housing_median_age', 'total_rooms', 'median_income']

In [None]:
# Non-numeric target column: the function is expected to reject it (see the message below).
tb.get_features_num_regression(df_housing, 'ocean_proximity', umbral_corr=0.07, pvalue=0.40)

Error: ocean_proximity no es una columna numérica.


In [None]:
# Target name that is not a column of df_housing: the function is expected to reject it (see the message below).
tb.get_features_num_regression(df_housing, 'xxxx', umbral_corr=0.07, pvalue=0.40)

Error: xxxx no es una columna del DataFrame.
