In [1]:
import pandas as pd
import numpy as np

In [6]:
# Sample records containing several flavours of "missing" data:
# np.nan, None, and the placeholder strings "NA" / "Missing".
people = dict(
    first=["Shankar", "Niru", "Akshara", np.nan, None, "NA"],
    last=["Rajappa", "Shankar", "Shankar", np.nan, np.nan, "Missing"],
    email=["shankar@email.com", "niru@email.com", "akshara@email.com", None, np.nan, np.nan],
    age=["41", "40", "5", None, None, "Missing"],
)

In [7]:
# Build the frame from the column-oriented dict: each key becomes a column.
df = pd.DataFrame(data=people)
df

Unnamed: 0,first,last,email,age
0,Shankar,Rajappa,shankar@email.com,41
1,Niru,Shankar,niru@email.com,40
2,Akshara,Shankar,akshara@email.com,5
3,,,,
4,,,,
5,,Missing,,Missing


In [9]:
# With no arguments, dropna() uses its defaults: df.dropna(axis="index", how="any").
# axis="index" drops rows that contain NA values; axis="columns" drops columns instead.
# how="any" drops the row/column if at least one of its values is NA,
# while how="all" drops it only when every value in that row/column is NA.
df.dropna()

Unnamed: 0,first,last,email,age
0,Shankar,Rajappa,shankar@email.com,41
1,Niru,Shankar,niru@email.com,40
2,Akshara,Shankar,akshara@email.com,5


In [10]:
# how="all" keeps a row unless every one of its values is NA
# (in contrast to how="any", which drops a row on a single NA).
df.dropna(
    how="all",
    axis="index",
)

Unnamed: 0,first,last,email,age
0,Shankar,Rajappa,shankar@email.com,41
1,Niru,Shankar,niru@email.com,40
2,Akshara,Shankar,akshara@email.com,5
5,,Missing,,Missing


In [11]:
# No column is entirely NA, so dropping all-NA columns returns the data unchanged.
df.dropna(
    how="all",
    axis="columns",
)

Unnamed: 0,first,last,email,age
0,Shankar,Rajappa,shankar@email.com,41
1,Niru,Shankar,niru@email.com,40
2,Akshara,Shankar,akshara@email.com,5
3,,,,
4,,,,
5,,Missing,,Missing


In [13]:
# With how="any", every column is dropped because each one contains at least one NA;
# only the index is left in the result.
df.dropna(
    how="any",
    axis="columns",
)

0
1
2
3
4
5


In [15]:
# Drop a row only when its email is missing.
# With a single column in subset, how="any" and how="all" behave the same.
df.dropna(
    subset=["email"],
    axis="index",
    how="any",
)

Unnamed: 0,first,last,email,age
0,Shankar,Rajappa,shankar@email.com,41
1,Niru,Shankar,niru@email.com,40
2,Akshara,Shankar,akshara@email.com,5


In [17]:
# Drop a row when either its email or its last name is missing.
# (Passing inplace=True would modify df itself instead of returning a new frame.)
df.dropna(
    subset=["email", "last"],
    axis="index",
    how="any",
)

Unnamed: 0,first,last,email,age
0,Shankar,Rajappa,shankar@email.com,41
1,Niru,Shankar,niru@email.com,40
2,Akshara,Shankar,akshara@email.com,5


In [23]:
# Placeholder strings such as "NA" or "Missing" are ordinary strings to pandas,
# so they are not treated as missing by dropna()/isna()/fillna().
# Convert them to np.nan first.
people2 = {
    "first": ["Shankar", "Niru", "Akshara", np.nan, None, "NA"],
    "last": ["Rajappa", "Shankar", "Shankar", np.nan, np.nan, "Missing"],
    "email": ["shankar@email.com", "niru@email.com", "akshara@email.com", None, np.nan, np.nan],
    "age": ["41", "40", "5", None, None, "Missing"]
}
# A single replace() call handles every placeholder. It returns a new frame,
# so no inplace mutation is needed and re-running the cell gives the same result.
df2 = pd.DataFrame(people2).replace(["NA", "Missing"], np.nan)
df2

Unnamed: 0,first,last,email,age
0,Shankar,Rajappa,shankar@email.com,41.0
1,Niru,Shankar,niru@email.com,40.0
2,Akshara,Shankar,akshara@email.com,5.0
3,,,,
4,,,,
5,,,,


In [35]:
# With the placeholder strings replaced by NaN, row 5 is now dropped as well.
df2.dropna()

Unnamed: 0,first,last,email,age
0,Shankar,Rajappa,shankar@email.com,41
1,Niru,Shankar,niru@email.com,40
2,Akshara,Shankar,akshara@email.com,5


In [36]:
# Show, value by value, whether each entry is treated as missing (True) or not (False).
df2.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,True,True,True,True
4,True,True,True,True
5,True,True,True,True


In [37]:
# Substitute a chosen value for every missing entry.
# For numeric columns, a value such as 0 is sometimes preferred.
df2.fillna(value="MISSING")

Unnamed: 0,first,last,email,age
0,Shankar,Rajappa,shankar@email.com,41
1,Niru,Shankar,niru@email.com,40
2,Akshara,Shankar,akshara@email.com,5
3,MISSING,MISSING,MISSING,MISSING
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,MISSING,MISSING


In [38]:
# Inspect each column's dtype; every column, including "age", is still object here.
df2.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [33]:
# Goal: compute the average age of the people in df2.
# df2["age"].mean() will not work yet, because "age" holds strings (dtype object),
# so the column has to be converted to a numeric dtype first.
#
# Converting straight to int fails because the column contains missing values.
# Note that NaN is a float under the hood: type(np.nan) => float.
#
# The failure is the point of this cell, so it is caught and printed rather than
# left to abort a top-to-bottom run. df2 stays unchanged because the right-hand
# side raises before the assignment happens.
try:
    df2["age"] = df2["age"].astype(int)
except (TypeError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [43]:
# float can represent NaN, so this conversion succeeds where the int conversion failed.
df2["age"] = df2["age"].astype(float)
# Only the last expression of a cell is rendered, so show just the dtypes
# to confirm that "age" is now float64.
df2.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [44]:
# Average of the "age" column; the NaN entries are skipped, so this is (41 + 40 + 5) / 3.
df2["age"].mean()

28.666666666666668