# Data Cleaning in Pandas

## Data cleaning mainly means:

- Handling missing values
- Removing duplicates
- Fixing wrong data types

# 1. Detecting Missing Values

In [35]:
import pandas as pd

In [36]:
import numpy as np

In [37]:
# Small sample table: each column contains exactly one missing entry
# (None in Name, NaN in Age and Salary).
data = dict(
    Name=["Ali", "Sara", "John", None],
    Age=[22, np.nan, 25, 24],
    Salary=[50000, 60000, np.nan, 52000],
)

In [38]:
data

{'Name': ['Ali', 'Sara', 'John', None],
 'Age': [22, nan, 25, 24],
 'Salary': [50000, 60000, nan, 52000]}

In [39]:
# Build the working DataFrame from the column dictionary.
df = pd.DataFrame(data)

In [40]:
df

Unnamed: 0,Name,Age,Salary
0,Ali,22.0,50000.0
1,Sara,,60000.0
2,John,25.0,
3,,24.0,52000.0


## Note: `isnull()` and `isna()` are the same (`isnull` is an alias of `isna`)

In [41]:
# Boolean mask of the frame: True where a cell is missing (None or NaN).
df.isnull()

Unnamed: 0,Name,Age,Salary
0,False,False,False
1,False,True,False
2,False,False,True
3,True,False,False


In [42]:
# isna() produces the same True/False mask of missing cells.
df.isna()

Unnamed: 0,Name,Age,Salary
0,False,False,False
1,False,True,False
2,False,False,True
3,True,False,False


In [43]:
# Summing the mask column by column counts the missing cells in each column.
df.isnull().sum()

Name      1
Age       1
Salary    1
dtype: int64

In [44]:
# any() reports, per column, whether at least one cell is missing.
df.isnull().any()

Name      True
Age       True
Salary    True
dtype: bool

# 2. Removing Missing Values

## dropna()
## Removes rows (or columns) with missing values.

In [45]:
# Keep only the rows in which no column is missing (the defaults, spelled out).
df_dropped = df.dropna(axis=0, how="any")

In [46]:
df_dropped

Unnamed: 0,Name,Age,Salary
0,Ali,22.0,50000.0


In [47]:
# Drop every column that has at least one missing value. Each column of df
# has one, so all columns are removed and only the index remains.
df_drop = df.dropna(axis="columns")

In [48]:
df_drop

0
1
2
3


In [49]:
# how="all" drops a row only when every value in it is missing; no row of df
# is entirely empty, so the frame comes back unchanged.
df.dropna(how="all")

Unnamed: 0,Name,Age,Salary
0,Ali,22.0,50000.0
1,Sara,,60000.0
2,John,25.0,
3,,24.0,52000.0


# 3. Filling Missing Values

In [50]:
# Replace every missing cell with 0. This also puts the number 0 into the
# text column Name. The result is only displayed; df is not reassigned here.
df.fillna(0)

Unnamed: 0,Name,Age,Salary
0,Ali,22.0,50000.0
1,Sara,0.0,60000.0
2,John,25.0,0.0
3,0,24.0,52000.0


## Fill using MEAN / MEDIAN / MODE

In [51]:
# Fill the missing age with the mean of the known ages (22, 25, 24 -> 23.666667).
df["Age"] = df["Age"].fillna(df["Age"].mean())

In [52]:
# Fill the missing salary with the median of the known salaries
# (50000, 60000, 52000 -> 52000).
df["Salary"] = df["Salary"].fillna(df["Salary"].median())


In [53]:
df

Unnamed: 0,Name,Age,Salary
0,Ali,22.0,50000.0
1,Sara,23.666667,60000.0
2,John,25.0,52000.0
3,,24.0,52000.0


## Fill using mode

In [54]:
# mode() returns the most frequent value(s); [0] takes the first entry of that
# result, which is "Ali" for this data, and uses it for the missing name.
df["Name"] = df["Name"].fillna(df["Name"].mode()[0])

In [55]:
df

Unnamed: 0,Name,Age,Salary
0,Ali,22.0,50000.0
1,Sara,23.666667,60000.0
2,John,25.0,52000.0
3,Ali,24.0,52000.0


# 4. Duplicate Data

## Duplicates can harm ML models.

In [56]:
# Product table in which the records at positions 1 and 2 are identical.
data2 = dict(
    ID=[1, 2, 2, 3],
    Product=["Phone", "Laptop", "Laptop", "Tablet"],
    Price=[30000, 50000, 50000, 20000],
)

In [57]:
# Build the DataFrame used for the duplicate-handling examples.
df1 = pd.DataFrame(data2)

In [58]:
df1

Unnamed: 0,ID,Product,Price
0,1,Phone,30000
1,2,Laptop,50000
2,2,Laptop,50000
3,3,Tablet,20000


In [59]:
df.duplicated()

0    False
1    False
2    False
3    False
dtype: bool

In [60]:
df[df.duplicated()]

Unnamed: 0,Name,Age,Salary


In [62]:
df_clean=df.drop_duplicates()

In [63]:
df_clean

Unnamed: 0,Name,Age,Salary
0,Ali,22.0,50000.0
1,Sara,23.666667,60000.0
2,John,25.0,52000.0
3,Ali,24.0,52000.0
