# Dealing with nulls in pandas

In [1]:
import pandas as pd
import numpy as np

# Location of the sample workbook used throughout this notebook.
# NOTE(review): this is an absolute path on one user's machine; it must be
# adjusted before the notebook can be run elsewhere.
workbook_path = "/Users/danielcorcoran/PycharmProjects/daniels_mac_proj/datasets/for_testing/test_null.xlsx"

# Read the workbook into the dataframe that every later cell works on.
data = pd.read_excel(workbook_path)

## Original Data


In [2]:
# Display the data that was imported.
data

Unnamed: 0,col1,col2,col3,col4,col5
0,,1.0,1.0,1.0,
1,1.0,1.0,,,
2,1.0,,,,
3,,,,,
4,,1.0,,1.0,
5,,,,,
6,,,1.0,,
7,1.0,,,,
8,1.0,1.0,1.0,,


## Check every cell for a null value (as a table)

In [7]:
# Return a dataframe of the same shape holding True where a cell is null
# and False otherwise (isna is the same check as isnull).
data.isna()

Unnamed: 0,col1,col2,col3,col4,col5
0,True,False,False,False,True
1,False,False,True,True,True
2,False,True,True,True,True
3,True,True,True,True,True
4,True,False,True,False,True
5,True,True,True,True,True
6,True,True,False,True,True
7,False,True,True,True,True
8,False,False,False,True,True


## Check count of nulls for each column

In [11]:
# Summing the True/False null mask column by column gives the number of
# nulls in each column.
data.isna().sum()

col1    5
col2    5
col3    6
col4    7
col5    9
dtype: int64

## Check percentage of nulls for each column

In [12]:
# Percentage of nulls per column = nulls in the column / number of rows * 100
#   data.isnull().sum() -> counts nulls per column
#   len(data)           -> counts the number of rows in the dataframe
data.isnull().sum().div(len(data)).mul(100)

col1     55.555556
col2     55.555556
col3     66.666667
col4     77.777778
col5    100.000000
dtype: float64

## Drop columns containing 100% nulls

In [6]:
# Drop every column in which all values are null; the result is a new
# dataframe and `data` itself is left unchanged.
# note: for this data only col5 is dropped
data.dropna(axis="columns", how="all")

Unnamed: 0,col1,col2,col3,col4
0,,1.0,1.0,1.0
1,1.0,1.0,,
2,1.0,,,
3,,,,
4,,1.0,,1.0
5,,,,
6,,,1.0,
7,1.0,,,
8,1.0,1.0,1.0,


## Drop rows containing 100% nulls

In [7]:
# Drop every row in which all values are null; the result is a new
# dataframe and `data` itself is left unchanged.
# note: in the output shown below, the rows indexed 3 and 5 are the ones removed
data.dropna(axis="index", how="all")

Unnamed: 0,col1,col2,col3,col4,col5
0,,1.0,1.0,1.0,
1,1.0,1.0,,,
2,1.0,,,,
4,,1.0,,1.0,
6,,,1.0,,
7,1.0,,,,
8,1.0,1.0,1.0,,


## Drop rows where a particular column contains nulls

In [8]:
# Keep only the rows whose col2 value is present. Rows are the default axis,
# and because the subset names a single column, the default how="any" removes
# exactly the same rows as how="all".
data.dropna(subset=["col2"])

Unnamed: 0,col1,col2,col3,col4,col5
0,,1.0,1.0,1.0,
1,1.0,1.0,,,
4,,1.0,,1.0,
8,1.0,1.0,1.0,,


## Replace all Nulls with 5

In [13]:
# Build a new dataframe in which every null, in every column, is replaced
# by the value 5; `data` itself is not modified.
data2 = data.fillna(value=5)

data2

Unnamed: 0,col1,col2,col3,col4,col5
0,5.0,1.0,1.0,1.0,5.0
1,1.0,1.0,5.0,5.0,5.0
2,1.0,5.0,5.0,5.0,5.0
3,5.0,5.0,5.0,5.0,5.0
4,5.0,1.0,5.0,1.0,5.0
5,5.0,5.0,5.0,5.0,5.0
6,5.0,5.0,1.0,5.0,5.0
7,1.0,5.0,5.0,5.0,5.0
8,1.0,1.0,1.0,5.0,5.0


## Replace Nulls in a particular column

In [10]:
# Fill the nulls of col1 only; every other column keeps its nulls.
# The result is bound to a new name rather than written back into `data`,
# so the original frame stays intact and re-running the notebook (in any
# cell order) keeps producing the same outputs for the cells that read `data`.
data_col1_filled = data.assign(col1=data["col1"].fillna(0.2976))

data_col1_filled

Unnamed: 0,col1,col2,col3,col4,col5
0,0.2976,1.0,1.0,1.0,
1,1.0,1.0,,,
2,1.0,,,,
3,0.2976,,,,
4,0.2976,1.0,,1.0,
5,0.2976,,,,
6,0.2976,,1.0,,
7,1.0,,,,
8,1.0,1.0,1.0,,


## Replace Nulls with preceding values (forward fill)

In [15]:
# Forward fill: each null takes the nearest non-null value above it in the
# same column. DataFrame.ffill() is used instead of fillna(method="ffill")
# because the `method` argument of fillna is deprecated in recent pandas.
# note: a null with no non-null value above it stays null, e.g. every cell of
# col5 (the column is entirely null) and any null in the first row.
data.ffill()

Unnamed: 0,col1,col2,col3,col4,col5
0,,1.0,1.0,1.0,
1,1.0,1.0,1.0,1.0,
2,1.0,1.0,1.0,1.0,
3,1.0,1.0,1.0,1.0,
4,1.0,1.0,1.0,1.0,
5,1.0,1.0,1.0,1.0,
6,1.0,1.0,1.0,1.0,
7,1.0,1.0,1.0,1.0,
8,1.0,1.0,1.0,1.0,
