___

<a href='https://oxiane-institut.com/'> <img src='../oxiane.jpg' /></a>\n
___

# Missing values

It's a common problem in data analysis to encounter missing values in a dataset.

Let's see how to handle them with pandas.

In [3]:
import pandas as pd
import numpy as np


## NaN

Means "Not a Number"

In [25]:
np.nan

nan

In [26]:
np.nan == np.nan

False

## Filling NaN

In [4]:
df = pd.DataFrame({'col1':[1,2,3,np.nan],
                   'col2':[np.nan,555,666,np.nan],
                   'col3':['abc','def','ghi','xyz']})
df.head()

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,,,xyz


In [5]:
df.isnull()

Unnamed: 0,col1,col2,col3
0,False,True,False
1,False,False,False
2,False,False,False
3,True,True,False


In [6]:
# Drop rows with NaN Values
df.dropna()

Unnamed: 0,col1,col2,col3
1,2.0,555.0,def
2,3.0,666.0,ghi


In [7]:
df.dropna(axis=1)

Unnamed: 0,col3
0,abc
1,def
2,ghi
3,xyz


In [8]:
df.dropna(thresh=2)

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi


In [9]:
df.fillna('FILL')

Unnamed: 0,col1,col2,col3
0,1.0,FILL,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,FILL,FILL,xyz


## Cleaning

In [10]:
df = pd.read_csv("data/heart.csv")

### fill df with NaN

In [11]:
import random

np.random.seed(42)

df_with_nan = pd.read_csv("data/heart.csv")

nb_cells = df_with_nan.shape[0] * df_with_nan.shape[1]
nb_nan = int(nb_cells * 0.05)

print(f"{nb_nan = }")
for _ in range(nb_nan):
    index = random.randint(0, df_with_nan.shape[0] - 1)
    column = random.randint(0, df_with_nan.shape[1] - 1)
    # print(f"Setting NaN on {index = } & {column = }")
    df_with_nan.iloc[index, column] = np.nan

df_with_nan.to_csv("./data/heart_with_nan.csv", index=False)
df_with_nan

nb_nan = 175


Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70.0,masculin,D,130.0,322.0,A,C,109.0,non,24.0,2.0,D,presence
1,67.0,feminin,C,115.0,564.0,A,C,160.0,,16.0,2.0,A,absence
2,,masculin,B,124.0,261.0,A,A,141.0,non,3.0,1.0,A,presence
3,64.0,masculin,D,128.0,263.0,,A,105.0,oui,2.0,,B,absence
4,74.0,,B,120.0,269.0,A,C,121.0,oui,2.0,1.0,B,absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52.0,masculin,C,172.0,199.0,B,A,162.0,non,5.0,1.0,A,absence
266,44.0,masculin,B,120.0,263.0,A,A,173.0,non,0.0,1.0,A,absence
267,56.0,feminin,B,140.0,294.0,A,C,153.0,non,,2.0,A,
268,57.0,masculin,D,140.0,192.0,A,A,148.0,non,4.0,2.0,A,absence


In [12]:
df_to_clean = pd.read_csv("data/heart_with_nan.csv")
df_to_clean

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70.0,masculin,D,130.0,322.0,A,C,109.0,non,24.0,2.0,D,presence
1,67.0,feminin,C,115.0,564.0,A,C,160.0,,16.0,2.0,A,absence
2,,masculin,B,124.0,261.0,A,A,141.0,non,3.0,1.0,A,presence
3,64.0,masculin,D,128.0,263.0,,A,105.0,oui,2.0,,B,absence
4,74.0,,B,120.0,269.0,A,C,121.0,oui,2.0,1.0,B,absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52.0,masculin,C,172.0,199.0,B,A,162.0,non,5.0,1.0,A,absence
266,44.0,masculin,B,120.0,263.0,A,A,173.0,non,0.0,1.0,A,absence
267,56.0,feminin,B,140.0,294.0,A,C,153.0,non,,2.0,A,
268,57.0,masculin,D,140.0,192.0,A,A,148.0,non,4.0,2.0,A,absence


### Drop NaN

In [13]:
df_to_clean.isna().sum()

age             13
sexe            17
type_douleur     9
pression         9
cholester       12
sucre           12
electro         13
taux_max        12
angine          13
depression      16
pic             20
vaisseau         7
coeur           16
dtype: int64

In [14]:
df_droped = df_to_clean.dropna(
    axis=0,
    how="any",          # other option is "all"
)
df_droped

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70.0,masculin,D,130.0,322.0,A,C,109.0,non,24.0,2.0,D,presence
5,65.0,masculin,D,120.0,177.0,A,A,140.0,non,4.0,1.0,A,absence
6,56.0,masculin,C,130.0,256.0,B,C,142.0,oui,6.0,2.0,B,presence
8,60.0,masculin,D,140.0,293.0,A,C,170.0,non,12.0,2.0,C,presence
12,44.0,masculin,C,140.0,235.0,A,C,180.0,non,0.0,1.0,A,absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,60.0,masculin,D,130.0,206.0,A,C,132.0,oui,24.0,2.0,C,presence
262,58.0,masculin,B,120.0,284.0,A,C,160.0,non,18.0,2.0,A,presence
265,52.0,masculin,C,172.0,199.0,B,A,162.0,non,5.0,1.0,A,absence
266,44.0,masculin,B,120.0,263.0,A,A,173.0,non,0.0,1.0,A,absence


In [15]:
df_droped.isna().sum()

age             0
sexe            0
type_douleur    0
pression        0
cholester       0
sucre           0
electro         0
taux_max        0
angine          0
depression      0
pic             0
vaisseau        0
coeur           0
dtype: int64

In [16]:
df_droped.shape

(138, 13)

In [17]:
df_to_clean.shape

(270, 13)

### Fill NaN

In [18]:
df_static_filled = df_to_clean.fillna(
    value=0,
)

In [19]:
df_static_filled.isna().sum()

age             0
sexe            0
type_douleur    0
pression        0
cholester       0
sucre           0
electro         0
taux_max        0
angine          0
depression      0
pic             0
vaisseau        0
coeur           0
dtype: int64

In [20]:
df_static_filled.head(10)

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70.0,masculin,D,130.0,322.0,A,C,109.0,non,24.0,2.0,D,presence
1,67.0,feminin,C,115.0,564.0,A,C,160.0,0,16.0,2.0,A,absence
2,0.0,masculin,B,124.0,261.0,A,A,141.0,non,3.0,1.0,A,presence
3,64.0,masculin,D,128.0,263.0,0,A,105.0,oui,2.0,0.0,B,absence
4,74.0,0,B,120.0,269.0,A,C,121.0,oui,2.0,1.0,B,absence
5,65.0,masculin,D,120.0,177.0,A,A,140.0,non,4.0,1.0,A,absence
6,56.0,masculin,C,130.0,256.0,B,C,142.0,oui,6.0,2.0,B,presence
7,59.0,masculin,D,110.0,239.0,A,C,142.0,oui,0.0,2.0,B,presence
8,60.0,masculin,D,140.0,293.0,A,C,170.0,non,12.0,2.0,C,presence
9,0.0,feminin,D,150.0,407.0,A,C,154.0,non,40.0,2.0,D,presence


In [21]:
df_to_clean.dtypes

age             float64
sexe             object
type_douleur     object
pression        float64
cholester       float64
sucre            object
electro          object
taux_max        float64
angine           object
depression      float64
pic             float64
vaisseau         object
coeur            object
dtype: object

In [22]:
colname_and_dtype = list(zip(df_to_clean.columns, df_to_clean.dtypes))
# print(f"{colname_and_dtype = }")

num_cols = []
str_cols = []
for colname, dtype in colname_and_dtype:
    if dtype == "object":
        str_cols.append(colname)
    else:
        num_cols.append(colname)

print(f"{num_cols = }")
print(f"{str_cols = }")


num_cols = ['age', 'pression', 'cholester', 'taux_max', 'depression', 'pic']
str_cols = ['sexe', 'type_douleur', 'sucre', 'electro', 'angine', 'vaisseau', 'coeur']


In [23]:
df_dynamic_filled = df_to_clean.copy()

df_dynamic_filled[num_cols] = df_dynamic_filled[num_cols].fillna(
    value=df_to_clean[num_cols].median(),
)
df_dynamic_filled.head(10)

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70.0,masculin,D,130.0,322.0,A,C,109.0,non,24.0,2.0,D,presence
1,67.0,feminin,C,115.0,564.0,A,C,160.0,,16.0,2.0,A,absence
2,55.0,masculin,B,124.0,261.0,A,A,141.0,non,3.0,1.0,A,presence
3,64.0,masculin,D,128.0,263.0,,A,105.0,oui,2.0,2.0,B,absence
4,74.0,,B,120.0,269.0,A,C,121.0,oui,2.0,1.0,B,absence
5,65.0,masculin,D,120.0,177.0,A,A,140.0,non,4.0,1.0,A,absence
6,56.0,masculin,C,130.0,256.0,B,C,142.0,oui,6.0,2.0,B,presence
7,59.0,masculin,D,110.0,239.0,A,C,142.0,oui,8.0,2.0,B,presence
8,60.0,masculin,D,140.0,293.0,A,C,170.0,non,12.0,2.0,C,presence
9,55.0,feminin,D,150.0,407.0,A,C,154.0,non,40.0,2.0,D,presence


In [24]:
for str_col in str_cols:
    df_dynamic_filled[str_col] = df_dynamic_filled[str_col].fillna(
        value=df_to_clean[str_col].mode()[0],
    )
df_dynamic_filled.head(10)

Unnamed: 0,age,sexe,type_douleur,pression,cholester,sucre,electro,taux_max,angine,depression,pic,vaisseau,coeur
0,70.0,masculin,D,130.0,322.0,A,C,109.0,non,24.0,2.0,D,presence
1,67.0,feminin,C,115.0,564.0,A,C,160.0,non,16.0,2.0,A,absence
2,55.0,masculin,B,124.0,261.0,A,A,141.0,non,3.0,1.0,A,presence
3,64.0,masculin,D,128.0,263.0,A,A,105.0,oui,2.0,2.0,B,absence
4,74.0,masculin,B,120.0,269.0,A,C,121.0,oui,2.0,1.0,B,absence
5,65.0,masculin,D,120.0,177.0,A,A,140.0,non,4.0,1.0,A,absence
6,56.0,masculin,C,130.0,256.0,B,C,142.0,oui,6.0,2.0,B,presence
7,59.0,masculin,D,110.0,239.0,A,C,142.0,oui,8.0,2.0,B,presence
8,60.0,masculin,D,140.0,293.0,A,C,170.0,non,12.0,2.0,C,presence
9,55.0,feminin,D,150.0,407.0,A,C,154.0,non,40.0,2.0,D,presence
