In [2]:
import numpy as np
import pandas as pd

In [6]:
bankruptcy = pd.read_csv("https://raw.githubusercontent.com/christianolivamoya/MIAX11-ML/main/data/bankruptcy.csv", index_col='Company')
bankruptcy.sample(5)

Unnamed: 0_level_0,WC/TA,RE/TA,EBIT/TA,S/TA,BVE/BVL,Bankrupt
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Advanced Radio Telecom,42.6,-60.1,-10.1,0.3,4.13,yes
Net2000 Communications,19.1,-66.3,-25.5,22.3,0.46,yes
Global Telesystems,24.6,-29.0,-2.0,21.3,1.968,yes
Equant,8.2,-15.6,0.3,87.7,5.444,no
Qwest,-6.1,0.0,9.4,22.6,2.123,no


In [10]:
features = bankruptcy.drop("Bankrupt", axis=1)
target = bankruptcy["Bankrupt"]

Recordamos la media y varianza de las características

In [12]:
features.describe()

Unnamed: 0,WC/TA,RE/TA,EBIT/TA,S/TA,BVE/BVL
count,50.0,50.0,50.0,50.0,50.0
mean,4.818,-52.936,-7.622,39.794,3.00424
std,34.269012,90.073681,26.611454,30.862939,5.425038
min,-127.5,-433.1,-98.7,0.3,0.028
25%,-5.7,-75.225,-12.275,21.075,0.566
50%,5.35,-38.55,-2.45,33.55,1.4055
75%,25.05,0.0,9.025,51.025,3.31725
max,74.7,54.6,27.9,127.8,35.178


Normalizamos los datos:

In [13]:
def normalizar(x):
    return (x - x.mean()) / x.std()

In [15]:
features = features.apply(normalizar)
features.sample(5)

Unnamed: 0_level_0,WC/TA,RE/TA,EBIT/TA,S/TA,BVE/BVL
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WebLink Wireless,-3.861156,-0.758979,0.526916,0.839389,-0.508059
Openwave Systems,0.451778,-0.092857,0.357816,-0.414543,5.930605
Equant,0.09869,0.414505,0.297691,1.552218,0.449722
Aether Systems,0.752342,0.427828,0.102287,-1.218095,0.088066
Garmin,2.039218,1.193867,1.334839,1.12776,0.131936


In [16]:
features.describe()

Unnamed: 0,WC/TA,RE/TA,EBIT/TA,S/TA,BVE/BVL
count,50.0,50.0,50.0,50.0,50.0
mean,-2.8310690000000003e-17,6.716849e-17,-8.881784e-18,1.287859e-16,-6.661338000000001e-17
std,1.0,1.0,1.0,1.0,1.0
min,-3.861156,-4.220589,-3.422511,-1.279658,-0.5486118
25%,-0.3069245,-0.247453,-0.1748495,-0.6065203,-0.449442
50%,0.01552423,0.1597137,0.1943524,-0.2023139,-0.2946966
75%,0.5903876,0.5876966,0.6255577,0.3638992,0.05769729
max,2.039218,1.193867,1.334839,2.851511,5.930605


Como alternativa más adecuada podemos utilizar los "Scalers" de sklearn que nos permiten
guardar la transformación para aplicarlos sobre otros datasets, por ejemplo el de test.

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
features = bankruptcy.drop('Bankrupt', axis=1)
target = bankruptcy['Bankrupt']

In [19]:
scaler = StandardScaler()

In [22]:
ft_z_array = scaler.fit_transform(features)
features_z = pd.DataFrame(ft_z_array, columns=features.columns)
features_z.head()

Unnamed: 0,WC/TA,RE/TA,EBIT/TA,S/TA,BVE/BVL
0,0.132117,0.50731,0.350061,-1.004623,0.134393
1,1.113705,-0.080342,-0.094063,-1.29265,0.209619
2,-0.990963,-1.685171,-1.646599,-0.821334,-0.538727
3,-0.068328,-4.263439,0.06157,-0.343472,-0.196488
4,0.627333,-0.050063,-0.602718,0.471512,-0.400193


In [23]:
features_z.describe()

Unnamed: 0,WC/TA,RE/TA,EBIT/TA,S/TA,BVE/BVL
count,50.0,50.0,50.0,50.0,50.0
mean,-3.608225e-18,9.325873e-17,2.220446e-18,9.325873e-17,-6.661338000000001e-17
std,1.010153,1.010153,1.010153,1.010153,1.010153
min,-3.900357,-4.263439,-3.457258,-1.29265,-0.5541816
25%,-0.3100406,-0.2499652,-0.1766247,-0.6126781,-0.454005
50%,0.01568184,0.1613352,0.1963256,-0.2043679,-0.2976885
75%,0.5963815,0.5936633,0.6319087,0.3675937,0.05828307
max,2.059922,1.205988,1.348391,2.880461,5.990816


La normalización de rango funciona de forma similar con el *MinMaxScaler*

In [24]:
from sklearn.preprocessing import MinMaxScaler

In [25]:
ft_norm_arr = MinMaxScaler().fit_transform(features)
pd.DataFrame(ft_norm_arr, columns=features.columns).describe()

Unnamed: 0,WC/TA,RE/TA,EBIT/TA,S/TA,BVE/BVL
count,50.0,50.0,50.0,50.0,50.0
mean,0.654392,0.779504,0.719415,0.309757,0.084673
std,0.169481,0.184691,0.210201,0.242062,0.15434
min,0.0,0.0,0.0,0.0,0.0
25%,0.602374,0.733802,0.682662,0.162941,0.015306
50%,0.657023,0.809001,0.760269,0.260784,0.039189
75%,0.754451,0.888046,0.850908,0.397843,0.093578
max,1.0,1.0,1.0,1.0,1.0
