# Missing Values
![](title_pict/missing2.png)

We will use the following dataset throughout this chapter.

In [1]:
import pandas as pd

# Stock price spreadsheet from the datasmp/datasets GitHub repository.
# index_col=0 turns the first spreadsheet column into the row index.
STOCK_XLSX_URL = 'https://raw.githubusercontent.com/datasmp/datasets/main/stock.xlsx'
df_stock = pd.read_excel(STOCK_XLSX_URL, index_col=0)
df_stock.head()

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,189.66,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,1902.88,187.74,3246.28
2020-01-07,73.85,93.81,1906.86,187.24,3237.18
2020-01-08,75.04,98.43,1891.97,190.45,3253.05


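Next, we randomly replace some of the values in the DataFrame with missing values (NaN) so that we have something to impute. Only the last three columns (AMAZON, VISA, SP500) are affected; APPLE and TESLA stay complete.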

In [2]:
import random
import numpy as np

# Seed Python's `random` module so the same cells are blanked out on every run.
random.seed(0)

# Fraction of the frame's cells used to size the number of random draws below.
r = 0.4
# a = number of rows, b = number of columns.
a,b = df_stock.shape
# Number of draws, NOT the final NaN count: the same (row, column) pair can be
# drawn more than once, so fewer than `missing` distinct cells may end up NaN.
missing = int(a*b*r)
for i in range( missing ):
    # randint is inclusive on both ends: column positions 2 .. b-1 only,
    # so the first two columns (positions 0 and 1) are never modified.
    k = random.randint(2,b-1)
    # Any row position 0 .. a-1.
    p = random.randint(0,a-1)
    # Positional assignment; this mutates df_stock in place.
    df_stock.iloc[p,k] = np.nan
df_stock.head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,,187.74,3246.28
2020-01-07,73.85,93.81,,187.24,
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,,191.77,
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,
2020-01-14,77.39,107.58,,,3283.15
2020-01-15,77.05,103.7,,,


In [3]:
# Print a summary of the frame: index range, column dtypes and the non-null
# count per column (a count below the number of rows means the column has NaNs).
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252 entries, 2020-01-02 to 2020-12-30
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   APPLE   252 non-null    float64
 1   TESLA   252 non-null    float64
 2   AMAZON  126 non-null    float64
 3   VISA    134 non-null    float64
 4   SP500   139 non-null    float64
dtypes: float64(5)
memory usage: 11.8+ KB


In [4]:
# Boolean mask with the same shape as df_stock: True where a value is missing.
# `isna` is the canonical method name; `isnull` is an alias for it.
df_stock.isna().head()

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,False,False,False,True,False
2020-01-03,False,False,False,False,False
2020-01-06,False,False,True,False,False
2020-01-07,False,False,True,False,True
2020-01-08,False,False,False,False,False


In [5]:
# Complement of the missing-value mask: True where a value is present.
# `notna` is the canonical method name; `notnull` is an alias for it.
df_stock.notna().head()

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,True,True,True,False,True
2020-01-03,True,True,True,True,True
2020-01-06,True,True,False,True,True
2020-01-07,True,True,False,True,False
2020-01-08,True,True,True,True,True


In [6]:
# Missing values per column: summing the boolean mask down the rows (axis=0)
# counts each True as 1.
df_stock.isna().sum(axis=0)

APPLE       0
TESLA       0
AMAZON    126
VISA      118
SP500     113
dtype: int64

In [7]:
# Total number of missing values in the whole frame: the first sum() gives the
# per-column counts, the second sum() adds those counts together.
df_stock.isnull().sum().sum()

357

# Imputation Methods

## Filling with a constant number

In [8]:
# Replace every NaN with the same constant (999 here). fillna returns a new
# frame, so df_stock itself still contains the NaNs afterwards.
df_stock.fillna(999).head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,999.0,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,999.0,187.74,3246.28
2020-01-07,73.85,93.81,999.0,187.24,999.0
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,999.0,191.77,999.0
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,999.0
2020-01-14,77.39,107.58,999.0,999.0,3283.15
2020-01-15,77.05,103.7,999.0,999.0,999.0


## Filling with a string

In [9]:
# The fill value does not have to be numeric: here every NaN becomes the string 'EMPTY'.
# NOTE(review): columns that receive the string now hold a mix of floats and text,
# so numeric operations on the result may no longer work — confirm before reusing it.
df_stock.fillna('EMPTY').head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,EMPTY,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,EMPTY,187.74,3246.28
2020-01-07,73.85,93.81,EMPTY,187.24,EMPTY
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,EMPTY,191.77,EMPTY
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,EMPTY
2020-01-14,77.39,107.58,EMPTY,EMPTY,3283.15
2020-01-15,77.05,103.7,EMPTY,EMPTY,EMPTY


## Filling with forward fill

In [10]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# A NaN in the first row has nothing above it and therefore stays NaN.
df_stock.ffill().head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,1874.97,187.74,3246.28
2020-01-07,73.85,93.81,1874.97,187.24,3246.28
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,1891.97,191.77,3253.05
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,3265.35
2020-01-14,77.39,107.58,1891.3,193.83,3283.15
2020-01-15,77.05,103.7,1891.3,193.83,3283.15


## Filling with backward fill

In [11]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# NOTE(review): a NaN with no valid value below it (e.g. at the end of a column)
# would remain NaN — check the tail of the result if that matters.
df_stock.bfill().head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,188.15,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,1891.97,187.74,3246.28
2020-01-07,73.85,93.81,1891.97,187.24,3253.05
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,1883.16,191.77,3265.35
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,3283.15
2020-01-14,77.39,107.58,1864.72,203.13,3283.15
2020-01-15,77.05,103.7,1864.72,203.13,3316.81


## Filling with mean

In [12]:
# Column-wise mean. NaN entries are skipped (pandas' default skipna=True),
# so each mean is computed from the observed values only.
df_stock.mean()

APPLE       94.753056
TESLA      288.347579
AMAZON    2672.675635
VISA       191.660373
SP500     3222.287482
dtype: float64

In [13]:
# Passing a Series to fillna fills each column with the entry whose label
# matches the column name, so every column is filled with its own mean.
column_means = df_stock.mean()
df_stock.fillna(value=column_means).head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,191.660373,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,2672.675635,187.74,3246.28
2020-01-07,73.85,93.81,2672.675635,187.24,3222.287482
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,2672.675635,191.77,3222.287482
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,3222.287482
2020-01-14,77.39,107.58,2672.675635,191.660373,3283.15
2020-01-15,77.05,103.7,2672.675635,191.660373,3222.287482


## Filling with median

In [14]:
# Column-wise median. As with mean(), NaN entries are skipped by default,
# so each median is computed from the observed values only.
df_stock.median()

APPLE       90.960
TESLA      232.830
AMAZON    2821.555
VISA       194.565
SP500     3276.020
dtype: float64

In [15]:
# Passing a Series to fillna fills each column with the entry whose label
# matches the column name, so every column is filled with its own median.
column_medians = df_stock.median()
df_stock.fillna(value=column_medians).head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,194.565,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,2821.555,187.74,3246.28
2020-01-07,73.85,93.81,2821.555,187.24,3276.02
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,2821.555,191.77,3276.02
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,3276.02
2020-01-14,77.39,107.58,2821.555,194.565,3283.15
2020-01-15,77.05,103.7,2821.555,194.565,3276.02


## Dropping Missing Values
- Drop the rows that contain at least one missing (NaN) value.

In [16]:
# Keep only the complete rows: a row is removed as soon as any of its values
# is NaN. axis=0 and how='any' are the defaults, spelled out here for clarity.
df_stock.dropna(axis=0, how='any').head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-22,78.63,113.91,1887.46,206.31,3321.75
2020-02-06,80.49,149.79,2050.23,201.49,3345.78
2020-02-19,80.28,183.48,2170.22,211.98,3386.15
2020-02-28,67.81,133.6,1883.75,180.63,2954.22
2020-03-03,71.77,149.1,1908.99,184.57,3003.37
2020-03-06,71.7,140.7,1901.09,183.21,2972.37
2020-03-10,70.79,129.07,1891.82,181.46,2882.23


In [17]:
# Drop every column that contains at least one NaN;
# only the fully observed columns remain.
df_stock.dropna(axis='columns').head(10)

Unnamed: 0_level_0,APPLE,TESLA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,74.33,86.05
2020-01-03,73.61,88.6
2020-01-06,74.2,90.31
2020-01-07,73.85,93.81
2020-01-08,75.04,98.43
2020-01-09,76.63,96.27
2020-01-10,76.8,95.63
2020-01-13,78.44,104.97
2020-01-14,77.39,107.58
2020-01-15,77.05,103.7


In [18]:
# Only the APPLE column is inspected when deciding which rows to drop.
# APPLE has no missing values in this dataset, so no row is removed here.
df_stock.dropna(subset=['APPLE']).head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,,187.74,3246.28
2020-01-07,73.85,93.81,,187.24,
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,,191.77,
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,
2020-01-14,77.39,107.58,,,3283.15
2020-01-15,77.05,103.7,,,


In [19]:
# how='any': drop a row if at least one of the listed columns (APPLE, AMAZON)
# is NaN in that row. NaNs in the other columns are ignored.
df_stock.dropna(subset=['APPLE', 'AMAZON'], how='any').head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,
2020-01-17,78.88,102.1,1864.72,203.13,
2020-01-21,78.35,109.44,1892.0,,3320.79
2020-01-22,78.63,113.91,1887.46,206.31,3321.75
2020-01-23,79.01,114.44,1884.58,,
2020-01-28,78.62,113.38,1853.25,,3276.24


In [20]:
# how='all': drop a row only if every listed column (APPLE and AMAZON) is NaN
# in that row. APPLE is never missing in this dataset, so no row is removed here.
df_stock.dropna(subset=['APPLE', 'AMAZON'], how='all').head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,,187.74,3246.28
2020-01-07,73.85,93.81,,187.24,
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,,191.77,
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,
2020-01-14,77.39,107.58,,,3283.15
2020-01-15,77.05,103.7,,,


# Imputer Methods

## Simple Imputer

In [21]:
from sklearn.impute import SimpleImputer
# SimpleImputer replaces missing values column by column; `strategy` selects
# the replacement rule (here: the column mean).
imputer = SimpleImputer(strategy='mean')   # available strategies: 'mean', 'median', 'most_frequent', 'constant'

In [22]:
# Learn the per-column means and fill the NaNs in a single step:
# fit_transform(X) is shorthand for fit(X) followed by transform(X), and the
# imputer stays fitted afterwards. The result is a plain NumPy array, so the
# row and column labels of df_stock are not carried over.
imputer.fit_transform(df_stock)

array([[  74.33      ,   86.05      , 1898.01      ,  191.66037313,
        3257.85      ],
       [  73.61      ,   88.6       , 1874.97      ,  188.15      ,
        3234.85      ],
       [  74.2       ,   90.31      , 2672.67563492,  187.74      ,
        3246.28      ],
       ...,
       [ 136.49      ,  663.69      , 3283.96      ,  212.3       ,
        3735.36      ],
       [ 134.67      ,  665.99      , 2672.67563492,  214.04      ,
        3222.28748201],
       [ 133.52      ,  694.78      , 3285.85      ,  218.02      ,
        3222.28748201]])

In [23]:
# transform returns a bare NumPy array, so re-attach the original row and
# column labels to get a DataFrame that lines up with df_stock.
imputed_values = imputer.transform(df_stock)
df_imputed = pd.DataFrame(imputed_values, index=df_stock.index, columns=df_stock.columns)
df_imputed.head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,191.660373,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,2672.675635,187.74,3246.28
2020-01-07,73.85,93.81,2672.675635,187.24,3222.287482
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,2672.675635,191.77,3222.287482
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,3222.287482
2020-01-14,77.39,107.58,2672.675635,191.660373,3283.15
2020-01-15,77.05,103.7,2672.675635,191.660373,3222.287482


## KNN Imputer

Each missing value is imputed using the mean value from the `n_neighbors` nearest neighbors found in the training set.
- https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html


In [24]:
from sklearn.impute import KNNImputer
# With n_neighbors=2, each NaN is replaced by the mean of that column over the
# 2 nearest rows (scikit-learn's default distance is a NaN-aware Euclidean metric).
imputer_knn = KNNImputer(n_neighbors=2)
# Fit on df_stock and impute it in one call; the result is a NumPy array
# without the row and column labels.
imputer_knn.fit_transform(df_stock)

array([[  74.33,   86.05, 1898.01,  187.49, 3257.85],
       [  73.61,   88.6 , 1874.97,  188.15, 3234.85],
       [  74.2 ,   90.31, 1883.47,  187.74, 3246.28],
       ...,
       [ 136.49,  663.69, 3283.96,  212.3 , 3735.36],
       [ 134.67,  665.99, 3260.02,  214.04, 3719.21],
       [ 133.52,  694.78, 3285.85,  218.02, 3719.21]])

In [25]:
# imputer_knn was already fitted on df_stock when it was created, so only
# transform is needed here; calling fit_transform again would repeat the fit
# for the same result. This mirrors how the SimpleImputer output is wrapped.
# The NumPy result is given back the original row and column labels.
pd.DataFrame(imputer_knn.transform(df_stock), columns=df_stock.columns, index=df_stock.index).head(10)

Unnamed: 0_level_0,APPLE,TESLA,AMAZON,VISA,SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,74.33,86.05,1898.01,187.49,3257.85
2020-01-03,73.61,88.6,1874.97,188.15,3234.85
2020-01-06,74.2,90.31,1883.47,187.74,3246.28
2020-01-07,73.85,93.81,1883.47,187.24,3240.565
2020-01-08,75.04,98.43,1891.97,190.45,3253.05
2020-01-09,76.63,96.27,1887.565,191.77,3259.2
2020-01-10,76.8,95.63,1883.16,192.29,3265.35
2020-01-13,78.44,104.97,1891.3,193.83,3299.98
2020-01-14,77.39,107.58,1878.01,198.48,3283.15
2020-01-15,77.05,103.7,1878.01,198.48,3299.98
