# Missing Data in Pandas

In [1]:
import pandas as pd
import numpy as np



In [2]:
# Small Series of names; the entry at position 1 is deliberately missing (np.nan)
s = pd.Series(
    data=["Sam", np.nan, "Tim", "Kim"],
)
s

0    Sam
1    NaN
2    Tim
3    Kim
dtype: object

In [3]:
# Check for null/missing values: returns True where the value is null/missing
s.isnull()

0    False
1     True
2    False
3    False
dtype: bool

In [4]:
# Opposite check: returns True where the value is present (not missing / not null)
s.notnull()

0     True
1    False
2     True
3     True
dtype: bool

In [5]:
s[3]=None   # assign None to the entry at index 3; isnull() below reports it as missing, just like NaN
print(s)
s.isnull()

0     Sam
1     NaN
2     Tim
3    None
dtype: object


0    False
1     True
2    False
3     True
dtype: bool

In [6]:
# Drop the missing values (both NaN and None); only the entries at index 0 and 2 remain
s.dropna()

0    Sam
2    Tim
dtype: object

In [7]:
from numpy import nan as NA

In [8]:
# 3x3 frame with missing values: row 1 has a single gap, row 2 is entirely missing
rows = [
    [1, 2, 3],
    [4, NA, 5],
    [NA, NA, NA],
]
df = pd.DataFrame(rows)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [9]:
# Drop every row that contains at least one null value; only row 0 is complete
df.dropna()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [10]:
df.dropna(how="all") # Drop a row only if all the values in it are null

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0


In [11]:
# Display df again: the dropna() calls above were not assigned back, so df is unchanged
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [12]:
df[0]=NA    # set every value in the column labelled 0 (the first column) to null
df

Unnamed: 0,0,1,2
0,,2.0,3.0
1,,,5.0
2,,,


In [13]:
df.dropna(axis=1,how="all") # axis=1 targets columns: drop a column only if all the values in it are null

Unnamed: 0,1,2
0,2.0,3.0
1,,5.0
2,,


In [None]:
# Display df; this cell has no recorded execution count or output
df

In [14]:
# Sample frame in which the rows hold different numbers of missing cells
columns = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, np.nan],
}
df = pd.DataFrame(columns)

print("Original DataFrame:", df, sep="\n")

# thresh=2 keeps a row only when it holds at least two non-NA values,
# so row 2 (a single non-NA value) is the one that gets dropped
df_thresh = df.dropna(axis=0, thresh=2)

print("\nDataFrame after dropna(thresh=2):", df_thresh, sep="\n")


Original DataFrame:
     A    B     C
0  1.0  5.0   9.0
1  2.0  NaN  10.0
2  NaN  NaN  11.0
3  4.0  8.0   NaN

DataFrame after dropna(thresh=2):
     A    B     C
0  1.0  5.0   9.0
1  2.0  NaN  10.0
3  4.0  8.0   NaN


In [15]:
df.fillna(0)   # fill all null values with zeros; the result is a new frame, df itself is not modified

Unnamed: 0,A,B,C
0,1.0,5.0,9.0
1,2.0,0.0,10.0
2,0.0,0.0,11.0
3,4.0,8.0,0.0


In [16]:
df.fillna({'A':15,'B':25,'C':35})  # fill missing values per column: the dictionary maps column label -> fill value

Unnamed: 0,A,B,C
0,1.0,5.0,9.0
1,2.0,25.0,10.0
2,15.0,25.0,11.0
3,4.0,8.0,35.0


In [17]:
# df still contains its NaNs: the fillna() calls above did not modify it
print(df)

     A    B     C
0  1.0  5.0   9.0
1  2.0  NaN  10.0
2  NaN  NaN  11.0
3  4.0  8.0   NaN


In [18]:
df.fillna(0,inplace=True) # inplace=True makes the change in the original dataframe itself instead of producing a new one

In [19]:
# The in-place fill changed df itself: every former NaN now reads 0.0
print(df)

     A    B     C
0  1.0  5.0   9.0
1  2.0  0.0  10.0
2  0.0  0.0  11.0
3  4.0  8.0   0.0


In [20]:
# Fresh 3x3 frame for the fill demos: one gap in row 1, two gaps in row 2
df = pd.DataFrame(
    [
        [1, 2, 3],
        [4, NA, 5],
        [NA, 6, NA],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,6.0,


In [24]:
# Backward fill, column by column: each null takes the next valid value found
# below it in the same column. A null with no valid value beneath it (the last
# row of columns 0 and 2 here) stays null.
# DataFrame.bfill() is used because fillna(method="bfill") is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,6.0,5.0
2,,6.0,


In [25]:
# Forward fill, column by column: each null takes the last valid value found
# above it in the same column. limit=2 fills at most two consecutive nulls
# per gap.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,4.0,6.0,5.0


In [26]:
# Numeric Series with a single missing entry at position 2
values = [1, 0, NA, 5]
data = pd.Series(values)
data

0    1.0
1    0.0
2    NaN
3    5.0
dtype: float64

In [27]:
# Fill the missing value with the mean of the values that are present
# (mean() skips NaN, so this is (1 + 0 + 5) / 3 = 2.0)
observed_mean = data.mean()
data.fillna(observed_mean)

0    1.0
1    0.0
2    2.0
3    5.0
dtype: float64

In [28]:
# Display df again: the backward/forward fill calls above were not assigned back, so df still has its gaps
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,6.0,


In [29]:
# Column-wise means; missing entries are skipped (e.g. column 0: (1 + 4) / 2 = 2.5)
df.mean()

0    2.5
1    4.0
2    4.0
dtype: float64

In [30]:
# Fill each column's gaps with that column's own mean
column_means = df.mean()
df.fillna(column_means)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,4.0,5.0
2,2.5,6.0,4.0
