In [4]:
import pandas as pd
import numpy as np

In [16]:
people = {
    "first_name": ["Corey", "Cate", "Dan", None, None, "Alex", "NA"],
    "last_name": ["Schafer", "Materazzi", "Scheineder", "Kamau", None, "Bilal", "Missing"],
    "age": [30, 42, 51, np.nan, np.nan, 19, np.nan],
    "email": ["corey@gmail.com", "cate@gmail.com", None, "kamaaa@gmail.com", None, "bilzalex@gmail.com", "anonymous@gmail.com"]
}

# Placeholder strings that this dataset uses instead of a real missing value.
MISSING_MARKERS = ["NA", "Missing"]

# Build the frame and turn every placeholder into NaN in a single pass.
# replace() returns a new frame (no inplace=True), so `df` is bound exactly once
# and re-running this cell always produces the same result.
df = pd.DataFrame(people).replace(MISSING_MARKERS, np.nan)
df

Unnamed: 0,first_name,last_name,age,email
0,Corey,Schafer,30.0,corey@gmail.com
1,Cate,Materazzi,42.0,cate@gmail.com
2,Dan,Scheineder,51.0,
3,,Kamau,,kamaaa@gmail.com
4,,,,
5,Alex,Bilal,19.0,bilzalex@gmail.com
6,,,,anonymous@gmail.com


In [17]:
# Strictest clean-up: discard every row that contains at least one missing
# value (NaN or None). axis="index" and how="any" are dropna()'s defaults and are
# written out here only for clarity. The result is displayed, not assigned, so
# `df` itself is left unchanged.
df.dropna(axis="index", how="any")

Unnamed: 0,first_name,last_name,age,email
0,Corey,Schafer,30.0,corey@gmail.com
1,Cate,Materazzi,42.0,cate@gmail.com
5,Alex,Bilal,19.0,bilzalex@gmail.com


In [18]:
"""
The df.dropna() method contains 2 default args, axis and how.
Args:
    (i) axis: This defines if we are dropping the rows or columns with NaN/None values. 
              For example: axis="index" drops rows with NaN values and axis="columns" drops columns with NaN values

    (ii) how: This defines if we are dropping rows/columns with all or any NaN values
              For example: how="all" drops rows/columns with all values == NaN and how="any" drops rows/columns with NaN values
"""
df.dropna(axis="index", how="all") # This drops rows which has all values as NaN 

Unnamed: 0,first_name,last_name,age,email
0,Corey,Schafer,30.0,corey@gmail.com
1,Cate,Materazzi,42.0,cate@gmail.com
2,Dan,Scheineder,51.0,
3,,Kamau,,kamaaa@gmail.com
5,Alex,Bilal,19.0,bilzalex@gmail.com
6,,,,anonymous@gmail.com


In [19]:
# Element-wise missing-value mask: a cell is True where the value is missing
# (NaN or None) and False where a real value is present.
df.isna()

Unnamed: 0,first_name,last_name,age,email
0,False,False,False,False
1,False,False,False,False
2,False,False,False,True
3,True,False,True,False
4,True,True,True,True
5,False,False,False,False
6,True,True,True,False


In [21]:
# Ensure the "age" column is stored as floating point using Series.astype().
# "float64" is the dtype that Python's built-in `float` resolves to.
df["age"] = df["age"].astype("float64")

In [23]:
# Inspect the dtype of every column, e.g. to confirm that "age" is float64
df.dtypes

first_name     object
last_name      object
age           float64
email          object
dtype: object

In [24]:
# Average age. skipna=True (the default) leaves NaN entries out of the
# calculation, so only the rows with a known age contribute to the mean.
df["age"].mean(skipna=True)

35.5