In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.metrics.pairwise import nan_euclidean_distances
from sklearn.preprocessing import MinMaxScaler

# Drop NA

In [2]:
# Small demo frame: "name" is complete, while "toy" and "born" each
# contain missing entries (np.nan and pd.NaT respectively).
df = pd.DataFrame(
    dict(
        name=["Alfred", "Batman", "Catwoman"],
        toy=[np.nan, "Batmobile", "Bullwhip"],
        born=[pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
    )
)
df

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [3]:
# Defaults spelled out: drop every row that has at least one missing value.
df.dropna(axis="index", how="any")

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [4]:
# Column-wise: drop every column that has at least one missing value.
df.dropna(axis=1)

Unnamed: 0,name
0,Alfred
1,Batman
2,Catwoman


In [5]:
# Row-wise drop (axis 0, the default): rows with any missing value are removed.
# "index" is the label the pandas documentation gives for axis 0.
df.dropna(axis="index")

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [6]:
# Second demo frame: adds a row with no data at all (index 3) and a column
# with no data at all ("info"). The np.nan placed in the datetime column
# "born" is displayed as NaT.
df = pd.DataFrame(
    dict(
        name=["Alfred", "Batman", "Catwoman", np.nan],
        toy=[np.nan, "Batmobile", "Bullwhip", np.nan],
        born=[pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT, np.nan],
        info=[np.nan] * 4,
    )
)
df

Unnamed: 0,name,toy,born,info
0,Alfred,,NaT,
1,Batman,Batmobile,1940-04-25,
2,Catwoman,Bullwhip,NaT,
3,,,NaT,


In [7]:
# Drop only the rows in which every single column is missing.
df.dropna(axis="index", how="all")

Unnamed: 0,name,toy,born,info
0,Alfred,,NaT,
1,Batman,Batmobile,1940-04-25,
2,Catwoman,Bullwhip,NaT,


In [8]:
# thresh=2: a row survives only if it holds two or more non-missing values.
df.dropna(axis="index", thresh=2)

Unnamed: 0,name,toy,born,info
1,Batman,Batmobile,1940-04-25,
2,Catwoman,Bullwhip,NaT,


In [9]:
# Only "name" and "toy" are inspected: rows missing either of them are dropped,
# missing values in the other columns are ignored.
df.dropna(axis="index", subset=["name", "toy"])

Unnamed: 0,name,toy,born,info
1,Batman,Batmobile,1940-04-25,
2,Catwoman,Bullwhip,NaT,


# KNNImputer

## Criando a base de dados 1

In [28]:
# Toy numeric table (four columns, five rows); each column has exactly one
# np.nan, which is the value the imputer will have to fill in.
aux_dict1 = dict(
    Barbarian=[80, 90, np.nan, 95, 65],
    Prey=[60, 65, 56, np.nan, 60],
    Mandy=[np.nan, 57, 80, 78, 68],
    Upgrade=[78, 83, 67, np.nan, 55],
)

df_orig_1 = pd.DataFrame(data=aux_dict1)
df_orig_1

Unnamed: 0,Barbarian,Prey,Mandy,Upgrade
0,80.0,60.0,,78.0
1,90.0,65.0,57.0,83.0
2,,56.0,80.0,67.0
3,95.0,,78.0,
4,65.0,60.0,68.0,55.0


In [13]:
# Pairwise distances between the rows of df_orig_1 using scikit-learn's
# NaN-aware Euclidean metric (coordinates missing in either row are skipped and
# the result is rescaled). The call is kept live so the matrix shown below is
# reproduced on a fresh Restart & Run All.
nan_euclidean_distances(df_orig_1)

array([[ 0.        , 14.14213562, 16.55294536, 30.        , 31.70699187],
       [14.14213562,  0.        , 33.9803865 , 30.52867504, 39.43348831],
       [16.55294536, 33.9803865 ,  0.        ,  4.        , 20.13289183],
       [30.        , 30.52867504,  4.        ,  0.        , 44.72135955],
       [31.70699187, 39.43348831, 20.13289183, 44.72135955,  0.        ]])

## Aplicando KNNImputer

In [19]:
# Fill each missing entry from the 3 nearest rows that have a value
# for that column.
imputer = KNNImputer(n_neighbors=3)

# The imputed result is wrapped in a DataFrame for display; the labels are
# taken from the source frame instead of being retyped by hand, so they
# cannot drift from the data.
df_imputation1 = imputer.fit_transform(df_orig_1)
pd.DataFrame(df_imputation1, columns=df_orig_1.columns, index=df_orig_1.index)

Unnamed: 0,Barbarian,Prey,Mandy,Upgrade
0,80.0,60.0,71.666667,78.0
1,90.0,65.0,57.0,83.0
2,80.0,56.0,80.0,67.0
3,95.0,60.333333,78.0,76.0
4,65.0,60.0,68.0,55.0


In [21]:
# Re-display the input frame for comparison with the imputed table.
df_orig_1

Unnamed: 0,Barbarian,Prey,Mandy,Upgrade
0,80.0,60.0,,78.0
1,90.0,65.0,57.0,83.0
2,,56.0,80.0,67.0
3,95.0,,78.0,
4,65.0,60.0,68.0,55.0


In [22]:
# Hand check for the missing "Upgrade" value of row 3: mean of the "Upgrade"
# values of rows 2 and 0 (67 and 78), the two rows closest to row 3 in the
# distance matrix shown earlier (distances 4.0 and 30.0).
# NOTE(review): this is the value for two neighbours; the imputer in this
# notebook is built with n_neighbors=3 and its table shows 76.0 for that
# cell — confirm which k this check is meant to illustrate.
(67+78)/2

72.5

## Criando a base de dados 2

In [23]:
# Second toy table: four numeric columns plus a Yes/No column "Premiado";
# one value is missing in "Barbarian" and one in "Premiado".
aux_dict2 = dict(
    Barbarian=[80, 90, 50, np.nan, 50, 60],
    Prey=[90, 85, 56, 65, 50, 60],
    Mandy=[95, 97, 60, 78, 55, 55],
    Upgrade=[85, 93, 47, 55, 45, 40],
    Premiado=["Yes", "Yes", "No", "No", "No", np.nan],
)

# Encode the labels numerically (Yes -> 1, No -> 0); the missing label stays NaN.
df_orig_2 = pd.DataFrame(data=aux_dict2).assign(
    Premiado=lambda frame: frame["Premiado"].map({"Yes": 1, "No": 0})
)
df_orig_2

Unnamed: 0,Barbarian,Prey,Mandy,Upgrade,Premiado
0,80.0,90,95,85,1.0
1,90.0,85,97,93,1.0
2,50.0,56,60,47,0.0
3,,65,78,55,0.0
4,50.0,50,55,45,0.0
5,60.0,60,55,40,


## Normalizando (escala Min-Max)

In [30]:
# Min-max scale every column so that they share a comparable range before
# distances are computed; the missing entries remain NaN in the scaled array.
scaler = MinMaxScaler().fit(df_orig_2)
df_orig_pad_2 = scaler.transform(df_orig_2)
df_orig_pad_2

array([[0.75      , 1.        , 0.95238095, 0.8490566 , 1.        ],
       [1.        , 0.875     , 1.        , 1.        , 1.        ],
       [0.        , 0.15      , 0.11904762, 0.13207547, 0.        ],
       [       nan, 0.375     , 0.54761905, 0.28301887, 0.        ],
       [0.        , 0.        , 0.        , 0.09433962, 0.        ],
       [0.25      , 0.25      , 0.        , 0.        ,        nan]])

In [31]:
# Impute the scaled data using the 2 nearest rows for each missing entry.
imputer = KNNImputer(n_neighbors=2).fit(df_orig_pad_2)
df_imputation2 = imputer.transform(df_orig_pad_2)
df_imputation2

array([[0.75      , 1.        , 0.95238095, 0.8490566 , 1.        ],
       [1.        , 0.875     , 1.        , 1.        , 1.        ],
       [0.        , 0.15      , 0.11904762, 0.13207547, 0.        ],
       [0.        , 0.375     , 0.54761905, 0.28301887, 0.        ],
       [0.        , 0.        , 0.        , 0.09433962, 0.        ],
       [0.25      , 0.25      , 0.        , 0.        , 0.        ]])

In [32]:
# Map the imputed values back to the original units and restore the labels.
# Columns and index come from df_orig_2 rather than a hand-typed list, so the
# labels cannot drift from the data that was scaled.
# NOTE: this cell overwrites its own input. Re-run the imputation cell before
# re-running this one, otherwise the inverse transform is applied twice.
df_imputation2 = pd.DataFrame(
    scaler.inverse_transform(df_imputation2),
    columns=df_orig_2.columns,
    index=df_orig_2.index,
)
df_imputation2

Unnamed: 0,Barbarian,Prey,Mandy,Upgrade,Premiado
0,80.0,90.0,95.0,85.0,1.0
1,90.0,85.0,97.0,93.0,1.0
2,50.0,56.0,60.0,47.0,0.0
3,50.0,65.0,78.0,55.0,0.0
4,50.0,50.0,55.0,45.0,0.0
5,60.0,60.0,55.0,40.0,0.0


In [37]:
# Same 2-neighbour imputation, this time applied directly to the unscaled frame.
# Note that this rebinds both `imputer` and `df_imputation2`.
imputer = KNNImputer(n_neighbors=2).fit(df_orig_2)
df_imputation2 = imputer.transform(df_orig_2)
df_imputation2

array([[80., 90., 95., 85.,  1.],
       [90., 85., 97., 93.,  1.],
       [50., 56., 60., 47.,  0.],
       [50., 65., 78., 55.,  0.],
       [50., 50., 55., 45.,  0.],
       [60., 60., 55., 40.,  0.]])

In [38]:
# Re-display the input frame for comparison with the imputed array.
df_orig_2

Unnamed: 0,Barbarian,Prey,Mandy,Upgrade,Premiado
0,80.0,90,95,85,1.0
1,90.0,85,97,93,1.0
2,50.0,56,60,47,0.0
3,,65,78,55,0.0
4,50.0,50,55,45,0.0
5,60.0,60,55,40,
