### Day 6 — Pandas Data Cleaning (Missing Values & Duplicates)

In [1]:
import pandas as pd
import numpy as np

##### 1. Creating a sample DataFrame with missing values

In [2]:
# Sample columns for the cleaning walkthrough; each column has exactly one
# None entry standing in for a missing value.
data = dict(
    Name=['Amit', 'Riya', 'John', 'Sara', None],
    Age=[25, None, 29, 24, 30],
    City=['Delhi', 'Mumbai', None, 'London', 'Delhi'],
    Score=[88, 92, None, 95, 88],
)

In [3]:
# Build the working frame from the column dictionary; its None entries are
# the missing values that the later cells detect and handle.
df = pd.DataFrame(data=data)
print("Original DataFrame with missing values:", df, sep="\n")

Original DataFrame with missing values:
   Name   Age    City  Score
0  Amit  25.0   Delhi   88.0
1  Riya   NaN  Mumbai   92.0
2  John  29.0    None    NaN
3  Sara  24.0  London   95.0
4  None  30.0   Delhi   88.0


##### 2. Checking missing values

In [4]:
# Count the missing entries in every column (isna is the same check as isnull).
print("\nMissing values in each column:", df.isna().sum(), sep="\n")


Missing values in each column:
Name     1
Age      1
City     1
Score    1
dtype: int64


In [5]:
# One boolean per column: True when that column has at least one missing entry.
print("\nCheck if any missing values exist:", df.isna().any(axis=0), sep="\n")


Check if any missing values exist:
Name     True
Age      True
City     True
Score    True
dtype: bool


##### 3. Handling missing values

In [6]:
# Keep only the rows that are complete in every column.
df_drop_any = df.dropna(axis=0, how='any')
print("\nDrop rows with ANY missing value:", df_drop_any, sep="\n")


Drop rows with ANY missing value:
   Name   Age    City  Score
0  Amit  25.0   Delhi   88.0
3  Sara  24.0  London   95.0


In [7]:
# Discard a row only when every one of its columns is missing.
df_drop_all = df.dropna(axis=0, how='all')
print("\nDrop rows where ALL values are missing:", df_drop_all, sep="\n")


Drop rows where ALL values are missing:
   Name   Age    City  Score
0  Amit  25.0   Delhi   88.0
1  Riya   NaN  Mumbai   92.0
2  John  29.0    None    NaN
3  Sara  24.0  London   95.0
4  None  30.0   Delhi   88.0


In [8]:
# Per-column replacements: a constant for Age and City, the column mean for
# Score. Columns not listed in the mapping (Name) are left as they are.
df_fill_value = df.fillna({
    'Age': 0,
    'City': 'Unknown',
    'Score': df['Score'].mean(),
})
print("\nFill missing values with specific values:", df_fill_value, sep="\n")


Fill missing values with specific values:
   Name   Age     City  Score
0  Amit  25.0    Delhi  88.00
1  Riya   0.0   Mumbai  92.00
2  John  29.0  Unknown  90.75
3  Sara  24.0   London  95.00
4  None  30.0    Delhi  88.00


In [9]:
# Impute the numeric columns with their own means. assign() returns a new
# frame, so df itself keeps its missing values for the later cells.
df_fill_mean = df.assign(
    Age=df['Age'].fillna(df['Age'].mean()),
    Score=df['Score'].fillna(df['Score'].mean()),
)

In [10]:
# Show the frame after mean imputation of the numeric columns.
print("\nFill numeric missing values with mean:", df_fill_mean, sep="\n")


Fill numeric missing values with mean:
   Name   Age    City  Score
0  Amit  25.0   Delhi  88.00
1  Riya  27.0  Mumbai  92.00
2  John  29.0    None  90.75
3  Sara  24.0  London  95.00
4  None  30.0   Delhi  88.00


##### 4. Handling Duplicates

In [11]:
# Append a copy of the first row so the frame contains a guaranteed exact
# duplicate. Taking the row from the data (rather than retyping its values
# under a hard-coded index label) keeps the demo correct if the sample data
# or its length changes, and can never overwrite an existing row.
# ignore_index=True renumbers the result from 0, so the new row gets the next label.
df_dup = pd.concat([df_fill_mean, df_fill_mean.iloc[[0]]], ignore_index=True)

In [12]:
# Show the frame that now carries a repeated row.
print("\nDataFrame with a duplicate row:", df_dup, sep="\n")


DataFrame with a duplicate row:
   Name   Age    City  Score
0  Amit  25.0   Delhi  88.00
1  Riya  27.0  Mumbai  92.00
2  John  29.0    None  90.75
3  Sara  24.0  London  95.00
4  None  30.0   Delhi  88.00
5  Amit  25.0   Delhi  88.00


In [13]:
# Flag every row that repeats an earlier one; the first occurrence stays False.
print("\nCheck duplicate rows:", df_dup.duplicated(keep='first'), sep="\n")


Check duplicate rows:
0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool


In [14]:
# Keep the first occurrence of each distinct row and discard later repeats.
df_no_dup = df_dup.drop_duplicates(keep='first')
print("\nDataFrame after removing duplicates:", df_no_dup, sep="\n")


DataFrame after removing duplicates:
   Name   Age    City  Score
0  Amit  25.0   Delhi  88.00
1  Riya  27.0  Mumbai  92.00
2  John  29.0    None  90.75
3  Sara  24.0  London  95.00
4  None  30.0   Delhi  88.00


##### 5. Replace values

In [15]:
# Rename one city value on a new frame built from the original data;
# assign() leaves df itself unchanged.
df_replace = df.assign(City=df['City'].replace({'Delhi': 'New Delhi'}))
print("\nReplace values in 'City' column:", df_replace, sep="\n")


Replace values in 'City' column:
   Name   Age       City  Score
0  Amit  25.0  New Delhi   88.0
1  Riya   NaN     Mumbai   92.0
2  John  29.0       None    NaN
3  Sara  24.0     London   95.0
4  None  30.0  New Delhi   88.0
