##  Data Cleaning and Transformation

In [2]:
import numpy as np
import pandas as pd

data = { 
'Name': ['  Alice  ', 'Bob', 'Charlie', 'david', 'Eve', 'Frank', 'Grace', 'Heidi', 'Alice'], 
'Age': [25, 30, 35, 28, np.nan, 40, 35, 28, 25], 
'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Los Angeles', 'Chicago', np.nan, 'New York'], 
'Email': ['alice@email.com', 'bob@email.com', 'charlie@email.com', 
'david@email.com', 'eve@email.com', 'frank@email.com', 'grace@email.com', 
'heidi@email.com', 'alice@email.com'] 
} 

df_clean = pd.DataFrame(data)
df_clean

Unnamed: 0,Name,Age,City,Email
0,Alice,25.0,New York,alice@email.com
1,Bob,30.0,Los Angeles,bob@email.com
2,Charlie,35.0,Chicago,charlie@email.com
3,david,28.0,Houston,david@email.com
4,Eve,,Phoenix,eve@email.com
5,Frank,40.0,Los Angeles,frank@email.com
6,Grace,35.0,Chicago,grace@email.com
7,Heidi,28.0,,heidi@email.com
8,Alice,25.0,New York,alice@email.com


In [3]:
# 37. Find and remove any duplicate rows from df_clean. 
df_clean.drop_duplicates()

Unnamed: 0,Name,Age,City,Email
0,Alice,25.0,New York,alice@email.com
1,Bob,30.0,Los Angeles,bob@email.com
2,Charlie,35.0,Chicago,charlie@email.com
3,david,28.0,Houston,david@email.com
4,Eve,,Phoenix,eve@email.com
5,Frank,40.0,Los Angeles,frank@email.com
6,Grace,35.0,Chicago,grace@email.com
7,Heidi,28.0,,heidi@email.com
8,Alice,25.0,New York,alice@email.com


In [4]:
# 38. Fill the missing 'Age' values with the mean age of the column.
df_clean['Age'] = df_clean['Age'].fillna(df_clean['Age'].mean())
df_clean

Unnamed: 0,Name,Age,City,Email
0,Alice,25.0,New York,alice@email.com
1,Bob,30.0,Los Angeles,bob@email.com
2,Charlie,35.0,Chicago,charlie@email.com
3,david,28.0,Houston,david@email.com
4,Eve,30.75,Phoenix,eve@email.com
5,Frank,40.0,Los Angeles,frank@email.com
6,Grace,35.0,Chicago,grace@email.com
7,Heidi,28.0,,heidi@email.com
8,Alice,25.0,New York,alice@email.com


In [5]:
# 39. Strip leading/trailing whitespace from the 'Name' column. 
df_clean['Name'] = df_clean['Name'].str.strip()
df_clean[['Name']]

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,david
4,Eve
5,Frank
6,Grace
7,Heidi
8,Alice


In [6]:
# 40. Capitalize the first letter of each name in the 'Name' column. 
df_clean['Name'] = df_clean['Name'].str.capitalize()
df_clean[['Name']]

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,David
4,Eve
5,Frank
6,Grace
7,Heidi
8,Alice


In [7]:
# 41. Create a new column 'Age_Group' that categorizes people into 'Young' (<30) and 'Adult' (>=30).
df_clean['Age_Group'] = df_clean['Age'].apply(lambda x: 'Young' if x < 30 else 'Adult')
df_clean[['Name','Age','Age_Group']]

Unnamed: 0,Name,Age,Age_Group
0,Alice,25.0,Young
1,Bob,30.0,Adult
2,Charlie,35.0,Adult
3,David,28.0,Young
4,Eve,30.75,Adult
5,Frank,40.0,Adult
6,Grace,35.0,Adult
7,Heidi,28.0,Young
8,Alice,25.0,Young


In [8]:
# 42. Extract the domain name (e.g., 'email.com') from the 'Email' column and create a new column 'Domain'. 
df_clean['Domain'] = df_clean['Email'].str.split('@').str[1]
df_clean[['Email','Domain']]

Unnamed: 0,Email,Domain
0,alice@email.com,email.com
1,bob@email.com,email.com
2,charlie@email.com,email.com
3,david@email.com,email.com
4,eve@email.com,email.com
5,frank@email.com,email.com
6,grace@email.com,email.com
7,heidi@email.com,email.com
8,alice@email.com,email.com


In [9]:
# 43. Drop all rows that have any missing values. 
df_clean.dropna()

Unnamed: 0,Name,Age,City,Email,Age_Group,Domain
0,Alice,25.0,New York,alice@email.com,Young,email.com
1,Bob,30.0,Los Angeles,bob@email.com,Adult,email.com
2,Charlie,35.0,Chicago,charlie@email.com,Adult,email.com
3,David,28.0,Houston,david@email.com,Young,email.com
4,Eve,30.75,Phoenix,eve@email.com,Adult,email.com
5,Frank,40.0,Los Angeles,frank@email.com,Adult,email.com
6,Grace,35.0,Chicago,grace@email.com,Adult,email.com
8,Alice,25.0,New York,alice@email.com,Young,email.com


In [10]:
# 44. Replace 'New York' with 'NY' and 'Los Angeles' with 'LA' in the 'City' column using map. 
df_clean['City'] = df_clean['City'].map({'New York':'NY','Los Angeles':'LA','Chicago':'Chicago','Houston':'Houston','Phoenix':'Phoenix'})
df_clean[['City']]

Unnamed: 0,City
0,NY
1,LA
2,Chicago
3,Houston
4,Phoenix
5,LA
6,Chicago
7,
8,NY


In [11]:
# 45. Convert the 'Age' column from a float to an integer data type. 
df_clean['Age'] = df_clean['Age'].astype(int)
df_clean[['Age']]

Unnamed: 0,Age
0,25
1,30
2,35
3,28
4,30
5,40
6,35
7,28
8,25


In [12]:
# 46. Create a boolean column 'Is_Chicago' which is True if the city is 'Chicago' and False otherwise. 
df_clean['Is_Chicago'] = df_clean['City'] == 'Chicago'
df_clean[['City','Is_Chicago']]

Unnamed: 0,City,Is_Chicago
0,NY,False
1,LA,False
2,Chicago,True
3,Houston,False
4,Phoenix,False
5,LA,False
6,Chicago,True
7,,False
8,NY,False


In [14]:
# 47. Apply a lambda function to the 'Age' column that adds 5 to each person's age. 
df_clean['New_age'] = df_clean['Age'].apply(lambda x: x+5)
df_clean[['Name','Age','New_age']]

Unnamed: 0,Name,Age,New_age
0,Alice,25,30
1,Bob,30,35
2,Charlie,35,40
3,David,28,33
4,Eve,30,35
5,Frank,40,45
6,Grace,35,40
7,Heidi,28,33
8,Alice,25,30


In [16]:
# 48. Find all rows where the 'Name' column contains the letter 'e'. 
df = df_clean[df_clean['Name'].str.contains('e', case=False, na=False)]
df

Unnamed: 0,Name,Age,City,Email,Age_Group,Domain,Is_Chicago,New_age
0,Alice,25,NY,alice@email.com,Young,email.com,False,30
2,Charlie,35,Chicago,charlie@email.com,Adult,email.com,True,40
4,Eve,30,Phoenix,eve@email.com,Adult,email.com,False,35
6,Grace,35,Chicago,grace@email.com,Adult,email.com,True,40
7,Heidi,28,,heidi@email.com,Young,email.com,False,33
8,Alice,25,NY,alice@email.com,Young,email.com,False,30
