# Missing Values

In [1]:
import pandas as pd
import numpy as np

In [28]:
# Read the raw CSV into a DataFrame and display it.
CSV_PATH = "Economy_of_US_na.csv"
df = pd.read_csv(CSV_PATH)
df

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,,
4,1984.0,4037.7,0.072
5,1985.0,4339.0,
6,1986.0,,
7,1987.0,4855.3,
8,1988.0,5236.4,0.042
9,1989.0,,


## Detecting and reporting missing values

In [3]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,False,False,True
1,False,False,False
2,False,False,False
3,False,True,True
4,False,False,False
5,False,False,True
6,False,True,True
7,False,False,True
8,False,False,False
9,False,True,True


In [5]:
# Report, for every column, how many rows hold a missing value.
# isnull() yields a boolean Series; summing it counts the True entries,
# i.e. the number of rows (not columns) where this column is NaN.
for column in df.columns:
    n_missing = df[column].isnull().sum()
    print(f"{column} has {n_missing} rows with NaN")

Year have 1 columns with NaN
GDP_Nominal have 4 columns with NaN
GDP_Growth have 7 columns with NaN


## Dropping missing values

In [6]:
# dropna() returns a new DataFrame; the result is bound to df2 and df is not modified.
# Defaults spelled out: drop every row (axis=0) that contains any NaN.

df2 = df.dropna(axis=0, how="any")
df2

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
4,1984.0,4037.7,0.072
8,1988.0,5236.4,0.042
10,1990.0,5963.1,0.019
12,1992.0,6520.3,0.035


In [7]:
# Display df again: the dropna() result above was assigned to df2, so df still has its NaNs.
df

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,,
4,1984.0,4037.7,0.072
5,1985.0,4339.0,
6,1986.0,,
7,1987.0,4855.3,
8,1988.0,5236.4,0.042
9,1989.0,,


In [8]:
# Drop every column that contains at least one NaN.
# Every column of df has missing values, so only the index is left.
df.dropna(axis="columns")

0
1
2
3
4
5
6
7
8
9
10


In [9]:
# thresh is the minimum number of non-NaN values a row needs to be kept:
# here a row survives only if all 3 columns are present.
df.dropna(axis=0, thresh=3)

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
4,1984.0,4037.7,0.072
8,1988.0,5236.4,0.042
10,1990.0,5963.1,0.019
12,1992.0,6520.3,0.035


In [10]:
# Keep the rows that have at least 2 non-NaN values (at most one gap per row).
df.dropna(axis=0, thresh=2)

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
4,1984.0,4037.7,0.072
5,1985.0,4339.0,
7,1987.0,4855.3,
8,1988.0,5236.4,0.042
10,1990.0,5963.1,0.019
12,1992.0,6520.3,0.035


In [11]:
# Keep the rows that have at least 1 non-NaN value; only fully empty rows are dropped.
df.dropna(axis=0, thresh=1)

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,,
4,1984.0,4037.7,0.072
5,1985.0,4339.0,
6,1986.0,,
7,1987.0,4855.3,
8,1988.0,5236.4,0.042
9,1989.0,,


# Filling the data

### Filling with a constant

In [12]:
# Replace every NaN with the string placeholder "NA" (returns a new frame).
# NOTE(review): a string filler in numeric columns presumably turns them into
# object dtype - confirm before doing arithmetic on the result.
df.fillna(value="NA")

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,,
4,1984.0,4037.7,0.072
5,1985.0,4339.0,
6,1986.0,,
7,1987.0,4855.3,
8,1988.0,5236.4,0.042
9,1989.0,,


In [14]:
# Replace every NaN with the numeric constant 0 (returns a new frame).
df.fillna(value=0)

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,0.0
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,0.0,0.0
4,1984.0,4037.7,0.072
5,1985.0,4339.0,0.0
6,1986.0,0.0,0.0
7,1987.0,4855.3,0.0
8,1988.0,5236.4,0.042
9,1989.0,0.0,0.0


### Filling with ffill (forward fill) or bfill (backward fill)

In [17]:
# Forward fill: copy the last valid observation down each column.
# A leading NaN has no earlier value to copy, so it stays NaN.
# DataFrame.ffill() is used instead of fillna(method="ffill") because the
# `method` argument of fillna is deprecated in pandas 2.x.
df.ffill()

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,3343.8,-0.018
4,1984.0,4037.7,0.072
5,1985.0,4339.0,0.072
6,1986.0,4339.0,0.072
7,1987.0,4855.3,0.072
8,1988.0,5236.4,0.042
9,1989.0,5236.4,0.042


In [18]:
# Backward fill: copy the next valid observation up each column.
# A trailing NaN has no later value to copy, so it would stay NaN.
# DataFrame.bfill() is used instead of fillna(method="bfill") because the
# `method` argument of fillna is deprecated in pandas 2.x.
df.bfill()

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,0.025
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,4037.7,0.072
4,1984.0,4037.7,0.072
5,1985.0,4339.0,0.042
6,1986.0,4855.3,0.042
7,1987.0,4855.3,0.042
8,1988.0,5236.4,0.042
9,1989.0,5963.1,0.019


### Filling with mean

In [23]:
# Fill the gaps in GDP_Nominal with the column mean (mean() ignores NaN).
# assign() builds a new frame instead of overwriting df, so this cell can be
# re-run safely and later cells still see the raw missing values.
df_mean_nominal = df.assign(
    GDP_Nominal=df["GDP_Nominal"].fillna(df["GDP_Nominal"].mean())
)
df_mean_nominal

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,4484.433333,
4,1984.0,4037.7,0.072
5,1985.0,4339.0,
6,1986.0,4484.433333,
7,1987.0,4855.3,
8,1988.0,5236.4,0.042
9,1989.0,4484.433333,


In [26]:
# Fill GDP_Growth (and any gaps still present in GDP_Nominal) with each
# column's own mean. Passing a {column: value} dict to fillna() fills each
# listed column with its value and returns a new frame, so df itself keeps
# its NaNs for the cells that follow.
column_means = {
    "GDP_Nominal": df["GDP_Nominal"].mean(),
    "GDP_Growth": df["GDP_Growth"].mean(),
}
df_mean_filled = df.fillna(value=column_means)
df_mean_filled

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,0.029167
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,4484.433333,0.029167
4,1984.0,4037.7,0.072
5,1985.0,4339.0,0.029167
6,1986.0,4484.433333,0.029167
7,1987.0,4855.3,0.029167
8,1988.0,5236.4,0.042
9,1989.0,4484.433333,0.029167


### Filling with mode

In [30]:
# Fill the gaps in GDP_Nominal with the column's mode.
# mode() returns a Series of the most frequent value(s); [0] takes the first.
# NOTE(review): this overwrites df in place; if an earlier cell has already
# filled this column there is nothing left to fill here - confirm run order.
nominal_mode = df["GDP_Nominal"].mode()[0]
df["GDP_Nominal"] = df["GDP_Nominal"].fillna(nominal_mode)
df

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,4037.7,
4,1984.0,4037.7,0.072
5,1985.0,4339.0,
6,1986.0,5236.4,
7,1987.0,4855.3,
8,1988.0,5236.4,0.042
9,1989.0,4037.7,


In [32]:
# Fill the gaps in GDP_Growth with the column's mode.
# mode() returns a Series of the most frequent value(s); [0] takes the first.
# NOTE(review): this overwrites df in place; if an earlier cell has already
# filled this column there is nothing left to fill here - confirm run order.
growth_mode = df["GDP_Growth"].mode()[0]
df["GDP_Growth"] = df["GDP_Growth"].fillna(growth_mode)
df

Unnamed: 0,Year,GDP_Nominal,GDP_Growth
0,1980.0,2857.3,-0.018
1,1981.0,3207.0,0.025
2,1982.0,3343.8,-0.018
3,1983.0,4037.7,-0.018
4,1984.0,4037.7,0.072
5,1985.0,4339.0,-0.018
6,1986.0,5236.4,-0.018
7,1987.0,4855.3,-0.018
8,1988.0,5236.4,0.042
9,1989.0,4037.7,-0.018
