In [4]:
# Performing the Data Cleaning

In [5]:
import numpy as np
import pandas as pd

data = {
    "Name" : ["Alice", "Bob", "Charlie", "David", "Eva", "Alice"],
    "Age" : [24, 30, np.nan, 35, 28, 24],
    "Department" : ["HR", "Finance", "IT", "IT", np.nan, "HR"],
    "Salary" : [50000, 60000, 55000, 70000, np.nan, 50000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,24.0,HR,50000.0
1,Bob,30.0,Finance,60000.0
2,Charlie,,IT,55000.0
3,David,35.0,IT,70000.0
4,Eva,28.0,,
5,Alice,24.0,HR,50000.0


In [6]:
# To add 'Promoted Salary' column with *10 of 'Salary' column
df['Promoted Salary'] = df['Salary'] * 10
df

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,24.0,HR,50000.0,500000.0
1,Bob,30.0,Finance,60000.0,600000.0
2,Charlie,,IT,55000.0,550000.0
3,David,35.0,IT,70000.0,700000.0
4,Eva,28.0,,,
5,Alice,24.0,HR,50000.0,500000.0


In [7]:
# To Check how many missing values are present in entire DataFrame and count them

df.isnull().sum()

Name               0
Age                1
Department         1
Salary             1
Promoted Salary    1
dtype: int64

In [8]:
# Without modifying the original DataFrame, we can drop rows with any missing values by using: df.dropna()
df.dropna()

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,24.0,HR,50000.0,500000.0
1,Bob,30.0,Finance,60000.0,600000.0
3,David,35.0,IT,70000.0,700000.0
5,Alice,24.0,HR,50000.0,500000.0


In [9]:
# As we not used the inplace=True parameter, the original DataFrame remains unchanged.
df

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,24.0,HR,50000.0,500000.0
1,Bob,30.0,Finance,60000.0,600000.0
2,Charlie,,IT,55000.0,550000.0
3,David,35.0,IT,70000.0,700000.0
4,Eva,28.0,,,
5,Alice,24.0,HR,50000.0,500000.0


In [10]:
# Performing the Data Cleaning by any and all methods
# 1. Dropping Missing Values by any 

df.dropna(how = 'any')

# Here 'any' only drops the rows where at least one null value is present

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,24.0,HR,50000.0,500000.0
1,Bob,30.0,Finance,60000.0,600000.0
3,David,35.0,IT,70000.0,700000.0
5,Alice,24.0,HR,50000.0,500000.0


In [11]:
# 2. Dropping Missing Values by all
df.dropna(how = 'all')
df
# Here all only drops the rows where all null values in that row are present otherwise keeps the row intact

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,24.0,HR,50000.0,500000.0
1,Bob,30.0,Finance,60000.0,600000.0
2,Charlie,,IT,55000.0,550000.0
3,David,35.0,IT,70000.0,700000.0
4,Eva,28.0,,,
5,Alice,24.0,HR,50000.0,500000.0


In [None]:
# To fill the null values with a specific value, we can use the fillna() method. 
# For example, to fill null values in the 'Age' column with the mean age:
df["Age"] = df["Age"].fillna(df["Age"].mean())

0    24.0
1    30.0
2    28.2
3    35.0
4    28.0
5    24.0
Name: Age, dtype: float64

In [13]:
# similarly, to fill null values in the 'Salary' column with the median salary:
df['Salary'].fillna(df['Salary'].median())

0    50000.0
1    60000.0
2    55000.0
3    70000.0
4    55000.0
5    50000.0
Name: Salary, dtype: float64

In [14]:
# We can also fill null values with the help of forward fill (ffill) or backward fill (bfill) methods.

# For Forward Fill(ffill) in 'Age' column:

df["Age"].fillna(method = "ffill")

  df["Age"].fillna(method = "ffill")


0    24.0
1    30.0
2    30.0
3    35.0
4    28.0
5    24.0
Name: Age, dtype: float64

In [15]:
# For Backward Fill(bfill) in 'Age' column:

df["Age"].fillna(method = "bfill")

  df["Age"].fillna(method = "bfill")


0    24.0
1    30.0
2    35.0
3    35.0
4    28.0
5    24.0
Name: Age, dtype: float64

In [16]:
# Performing Forward Fill(ffill) and Backward Fill(bfill) in 'Salary' column:

df['Salary'].fillna(method = "ffill")

  df['Salary'].fillna(method = "ffill")


0    50000.0
1    60000.0
2    55000.0
3    70000.0
4    70000.0
5    50000.0
Name: Salary, dtype: float64

In [17]:
# Performing Backward Fill(bfill) in 'Salary' column:
df["Salary"].fillna(method = "bfill")

  df["Salary"].fillna(method = "bfill")


0    50000.0
1    60000.0
2    55000.0
3    70000.0
4    50000.0
5    50000.0
Name: Salary, dtype: float64

In [18]:
# To Replace Specific Values From a Column in DataFrame

df["Name"]=df["Name"].replace("Charlie","Rose")
df

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,24.0,HR,50000.0,500000.0
1,Bob,30.0,Finance,60000.0,600000.0
2,Rose,,IT,55000.0,550000.0
3,David,35.0,IT,70000.0,700000.0
4,Eva,28.0,,,
5,Alice,24.0,HR,50000.0,500000.0


In [19]:
# Checking the Duplicate Rows in DataFrame.

df_duplicates = df[df.duplicated()]
df_duplicates

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
5,Alice,24.0,HR,50000.0,500000.0


In [20]:
# To drop the duplicate rows from DataFrame
df = df.drop_duplicates()
df

Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,24.0,HR,50000.0,500000.0
1,Bob,30.0,Finance,60000.0,600000.0
2,Rose,,IT,55000.0,550000.0
3,David,35.0,IT,70000.0,700000.0
4,Eva,28.0,,,


In [21]:
# Applying Lambda Function to Clean Data in a Column

df['Promoted Salary'] = df['Promoted Salary'].apply(lambda x : x/10 if x > 550000 else x)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Promoted Salary'] = df['Promoted Salary'].apply(lambda x : x/10 if x > 550000 else x)


Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,24.0,HR,50000.0,500000.0
1,Bob,30.0,Finance,60000.0,60000.0
2,Rose,,IT,55000.0,550000.0
3,David,35.0,IT,70000.0,70000.0
4,Eva,28.0,,,


In [22]:
# Using Lambda functions 

# to multiply 'Age' column values by 2

def multiplyAge(x):
    return x * 2

df['Age'] = df['Age'].apply(multiplyAge)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df['Age'].apply(multiplyAge)


Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,48.0,HR,50000.0,500000.0
1,Bob,60.0,Finance,60000.0,60000.0
2,Rose,,IT,55000.0,550000.0
3,David,70.0,IT,70000.0,70000.0
4,Eva,56.0,,,


In [23]:
# To get back to original 'Age' values, we can divide by 2

df['Age'] = df['Age'].apply(lambda x : x / 2)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df['Age'].apply(lambda x : x / 2)


Unnamed: 0,Name,Age,Department,Salary,Promoted Salary
0,Alice,24.0,HR,50000.0,500000.0
1,Bob,30.0,Finance,60000.0,60000.0
2,Rose,,IT,55000.0,550000.0
3,David,35.0,IT,70000.0,70000.0
4,Eva,28.0,,,
