In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy contact records. Missing entries are encoded in several ways on purpose:
# real nulls (None / np.nan) and the placeholder strings 'NA' and 'MISSING'.
data = dict(
    first=['John', 'Bob', 'Eve', 'Charlie', None, np.nan, 'Sophia', 'NA', 'MISSING'],
    last=['Doe', 'Johnson', 'Anderson', 'Brown', 'Miller', 'Wilson', 'Davis', 'Martinez', 'OKAY'],
    email=[
        'john.doe@example.com', 'NA', None, 'charlie.brown@example.com',
        'grace.miller@example.com', 'will@gmail.com', np.nan, 'liam.martinez@example.com', 'MISSING',
    ],
    age=[25, np.nan, None, 40, 28, 45, np.nan, 37, 'MISSING'],
    dob=[
        '1998-03-15', '1980-05-10', '1975-09-07', '1995-08-18', '1995-08-18', '1989-06-25',
        'NA', 'NA', 'MISSING',
    ],
)

# One column per dict key, one row per list position.
df = pd.DataFrame(data)
df

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25,1998-03-15
1,Bob,Johnson,,,1980-05-10
2,Eve,Anderson,,,1975-09-07
3,Charlie,Brown,charlie.brown@example.com,40,1995-08-18
4,,Miller,grace.miller@example.com,28,1995-08-18
5,,Wilson,will@gmail.com,45,1989-06-25
6,Sophia,Davis,,,
7,,Martinez,liam.martinez@example.com,37,
8,MISSING,OKAY,MISSING,MISSING,MISSING


In [3]:
# Convert the 'NA' and 'MISSING' placeholder strings to real NaN values.
# infer_objects() then re-infers the dtype of every object column, so 'age'
# becomes float64 once its only string is gone. Without it the float dtype
# relies on replace()'s implicit downcasting, which pandas has deprecated.
df = df.replace(['NA', 'MISSING'], np.nan).infer_objects()
df

# The same replacement can also be written one placeholder at a time:
# df.replace('NA', np.nan).replace('MISSING', np.nan)

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
1,Bob,Johnson,,,1980-05-10
2,Eve,Anderson,,,1975-09-07
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18
4,,Miller,grace.miller@example.com,28.0,1995-08-18
5,,Wilson,will@gmail.com,45.0,1989-06-25
6,Sophia,Davis,,,
7,,Martinez,liam.martinez@example.com,37.0,
8,,OKAY,,,


In [4]:
# Boolean mask with the same shape as df: True wherever a value is missing.
# pd.isna(df) is the function form of df.isna().
pd.isna(df)

Unnamed: 0,first,last,email,age,dob
0,False,False,False,False,False
1,False,False,True,True,False
2,False,False,True,True,False
3,False,False,False,False,False
4,True,False,False,False,False
5,True,False,False,False,False
6,False,False,True,True,True
7,True,False,False,False,True
8,True,False,True,True,True


In [5]:
# Fill every missing cell with one constant placeholder value.
# Returns a new frame; df itself is not modified.
df.fillna(value="<MISSING>")

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
1,Bob,Johnson,<MISSING>,<MISSING>,1980-05-10
2,Eve,Anderson,<MISSING>,<MISSING>,1975-09-07
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18
4,<MISSING>,Miller,grace.miller@example.com,28.0,1995-08-18
5,<MISSING>,Wilson,will@gmail.com,45.0,1989-06-25
6,Sophia,Davis,<MISSING>,<MISSING>,<MISSING>
7,<MISSING>,Martinez,liam.martinez@example.com,37.0,<MISSING>
8,<MISSING>,OKAY,<MISSING>,<MISSING>,<MISSING>


In [6]:
# Drop every row that contains at least one missing value.
# axis=0 (rows, same as axis='index') is also what dropna() uses by default.
df.dropna(axis=0)

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18


In [7]:
# Drop every column that contains at least one missing value.
# axis=1 is the positional spelling of axis='columns'.
df.dropna(axis=1)

Unnamed: 0,last
0,Doe
1,Johnson
2,Anderson
3,Brown
4,Miller
5,Wilson
6,Davis
7,Martinez
8,OKAY


In [8]:
# Exercise: Drop all rows that contain at least one missing value.
# how='any' -> a single missing cell is enough to drop the row.
df.dropna(how='any', axis=0)

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18


In [9]:
# Exercise: Drop only the rows where column 'age' has missing values.
# `subset` limits the missing-value check to the listed columns; nulls in the
# other columns do not cause a row to be dropped.
df.dropna(subset=['age'])

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18
4,,Miller,grace.miller@example.com,28.0,1995-08-18
5,,Wilson,will@gmail.com,45.0,1989-06-25
7,,Martinez,liam.martinez@example.com,37.0,


In [10]:
# Re-display df: the dropna()/fillna() calls above returned new frames and were
# not assigned back, so the frame still holds all nine rows.
df

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
1,Bob,Johnson,,,1980-05-10
2,Eve,Anderson,,,1975-09-07
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18
4,,Miller,grace.miller@example.com,28.0,1995-08-18
5,,Wilson,will@gmail.com,45.0,1989-06-25
6,Sophia,Davis,,,
7,,Martinez,liam.martinez@example.com,37.0,
8,,OKAY,,,


In [11]:
# Exercise: Drop rows with more than one missing value.
# `thresh` is the minimum number of NON-missing values a row needs in order to
# be kept; rows with fewer valid values than that are dropped.
# With five columns, len(df.columns) - 1 == 4, i.e. at most one missing value.
df.dropna(thresh=len(df.columns) - 1)

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18
4,,Miller,grace.miller@example.com,28.0,1995-08-18
5,,Wilson,will@gmail.com,45.0,1989-06-25


In [12]:
# Print the current frame, then drop each row in which any element is missing.
print(df)
df.dropna(axis='index', how='any')

     first      last                      email   age         dob
0     John       Doe       john.doe@example.com  25.0  1998-03-15
1      Bob   Johnson                        NaN   NaN  1980-05-10
2      Eve  Anderson                       None   NaN  1975-09-07
3  Charlie     Brown  charlie.brown@example.com  40.0  1995-08-18
4     None    Miller   grace.miller@example.com  28.0  1995-08-18
5      NaN    Wilson             will@gmail.com  45.0  1989-06-25
6   Sophia     Davis                        NaN   NaN         NaN
7      NaN  Martinez  liam.martinez@example.com  37.0         NaN
8      NaN      OKAY                        NaN   NaN         NaN


Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18


In [13]:
# Turn 'OKAY' (the last remaining value in row 8) into NaN so that row becomes
# entirely missing. Rebind the result instead of mutating with inplace=True.
df = df.replace('OKAY', np.nan)
print(df)
df.dropna(how='all')  # Drop a row only if every one of its elements is missing.

     first      last                      email   age         dob
0     John       Doe       john.doe@example.com  25.0  1998-03-15
1      Bob   Johnson                        NaN   NaN  1980-05-10
2      Eve  Anderson                       None   NaN  1975-09-07
3  Charlie     Brown  charlie.brown@example.com  40.0  1995-08-18
4     None    Miller   grace.miller@example.com  28.0  1995-08-18
5      NaN    Wilson             will@gmail.com  45.0  1989-06-25
6   Sophia     Davis                        NaN   NaN         NaN
7      NaN  Martinez  liam.martinez@example.com  37.0         NaN
8      NaN       NaN                        NaN   NaN         NaN


Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
1,Bob,Johnson,,,1980-05-10
2,Eve,Anderson,,,1975-09-07
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18
4,,Miller,grace.miller@example.com,28.0,1995-08-18
5,,Wilson,will@gmail.com,45.0,1989-06-25
6,Sophia,Davis,,,
7,,Martinez,liam.martinez@example.com,37.0,


In [14]:
# Exercise: Drop the rows in which 'email' and 'age' are both missing.
# how='all' is evaluated over the `subset` columns only, so a row is dropped
# only when every listed column is null.
df.dropna(subset=['email', 'age'], how='all', axis=0)

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18
4,,Miller,grace.miller@example.com,28.0,1995-08-18
5,,Wilson,will@gmail.com,45.0,1989-06-25
7,,Martinez,liam.martinez@example.com,37.0,


In [15]:
# Exercise: Fill missing values in 'age' column with the mean age

# Work on a copy so df keeps its missing ages.
df_filled_age = df.copy()
# Coerce anything non-numeric to NaN so the mean is taken over real numbers only.
df_filled_age['age'] = pd.to_numeric(df_filled_age['age'], errors='coerce')
mean_age = df_filled_age['age'].mean()  # NaN entries are skipped by default
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and does not
# update the frame when pandas Copy-on-Write is enabled.
df_filled_age['age'] = df_filled_age['age'].fillna(mean_age)

print("\nExercise 3: DataFrame after filling missing values in 'age' column with the mean age:")
df_filled_age



Exercise 3: DataFrame after filling missing values in 'age' column with the mean age:


Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
1,Bob,Johnson,,35.0,1980-05-10
2,Eve,Anderson,,35.0,1975-09-07
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18
4,,Miller,grace.miller@example.com,28.0,1995-08-18
5,,Wilson,will@gmail.com,45.0,1989-06-25
6,Sophia,Davis,,35.0,
7,,Martinez,liam.martinez@example.com,37.0,
8,,,,35.0,


In [16]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

first     object
last      object
email     object
age      float64
dob       object
dtype: object

In [17]:
# Exercise: Cast the 'age' column to a float data type.
# Build the converted column first, then store it back under the same name.
age_as_float = df['age'].astype('float64')
df['age'] = age_as_float
df

Unnamed: 0,first,last,email,age,dob
0,John,Doe,john.doe@example.com,25.0,1998-03-15
1,Bob,Johnson,,,1980-05-10
2,Eve,Anderson,,,1975-09-07
3,Charlie,Brown,charlie.brown@example.com,40.0,1995-08-18
4,,Miller,grace.miller@example.com,28.0,1995-08-18
5,,Wilson,will@gmail.com,45.0,1989-06-25
6,Sophia,Davis,,,
7,,Martinez,liam.martinez@example.com,37.0,
8,,,,,
