<img src="pandas_logo.svg" alt="NumPy Logo" style="width:250px; height:100px;">
<h1>Handling Null Values and Duplicates</h1>

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

<h4>Handling Null Values:</h4>

In [82]:
df = pd.DataFrame(
    {"R.No.":[11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
     "Maths":[80, 40, np.nan, 50, 40, np.nan, 68, 30, np.nan, 63],
     "Social":[90, 80, np.nan, 60, np.nan, 85, 78, 56, np.nan, 73],
     "Science":[80, 40, np.nan, 50, 40, np.nan, 68, 30, np.nan, 63]
    }
)
df

Unnamed: 0,R.No.,Maths,Social,Science
0,11,80.0,90.0,80.0
1,12,40.0,80.0,40.0
2,13,,,
3,14,50.0,60.0,50.0
4,15,40.0,,40.0
5,16,,85.0,
6,17,68.0,78.0,68.0
7,18,30.0,56.0,30.0
8,19,,,
9,20,63.0,73.0,63.0


In [84]:
df.isnull()

Unnamed: 0,R.No.,Maths,Social,Science
0,False,False,False,False
1,False,False,False,False
2,False,True,True,True
3,False,False,False,False
4,False,False,True,False
5,False,True,False,True
6,False,False,False,False
7,False,False,False,False
8,False,True,True,True
9,False,False,False,False


In [86]:
df.isnull().sum()

R.No.      0
Maths      3
Social     3
Science    3
dtype: int64

In [88]:
df.dropna()

Unnamed: 0,R.No.,Maths,Social,Science
0,11,80.0,90.0,80.0
1,12,40.0,80.0,40.0
3,14,50.0,60.0,50.0
6,17,68.0,78.0,68.0
7,18,30.0,56.0,30.0
9,20,63.0,73.0,63.0


In [90]:
df

Unnamed: 0,R.No.,Maths,Social,Science
0,11,80.0,90.0,80.0
1,12,40.0,80.0,40.0
2,13,,,
3,14,50.0,60.0,50.0
4,15,40.0,,40.0
5,16,,85.0,
6,17,68.0,78.0,68.0
7,18,30.0,56.0,30.0
8,19,,,
9,20,63.0,73.0,63.0


In [92]:
df.dropna(inplace = True)

In [94]:
df

Unnamed: 0,R.No.,Maths,Social,Science
0,11,80.0,90.0,80.0
1,12,40.0,80.0,40.0
3,14,50.0,60.0,50.0
6,17,68.0,78.0,68.0
7,18,30.0,56.0,30.0
9,20,63.0,73.0,63.0


In [96]:
df1 = pd.DataFrame(
    {
        "R.No.":[11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        "Maths":[80, 40, np.nan, 50, 40, np.nan, 68, 30, np.nan, 63],
        "Social":[90, 80, np.nan, 60, np.nan, 85, 78, 56, np.nan, 73],
        "Science":[80, 40, np.nan, 50, 40, np.nan, 68, 30, np.nan, 63]
    }
)
df1

Unnamed: 0,R.No.,Maths,Social,Science
0,11,80.0,90.0,80.0
1,12,40.0,80.0,40.0
2,13,,,
3,14,50.0,60.0,50.0
4,15,40.0,,40.0
5,16,,85.0,
6,17,68.0,78.0,68.0
7,18,30.0,56.0,30.0
8,19,,,
9,20,63.0,73.0,63.0


In [98]:
df1["Maths"].dropna()

0    80.0
1    40.0
3    50.0
4    40.0
6    68.0
7    30.0
9    63.0
Name: Maths, dtype: float64

In [100]:
df1[["Maths"]].dropna().head()

Unnamed: 0,Maths
0,80.0
1,40.0
3,50.0
4,40.0
6,68.0


In [102]:
df1[["Maths", "Social"]].dropna()

Unnamed: 0,Maths,Social
0,80.0,90.0
1,40.0,80.0
3,50.0,60.0
6,68.0,78.0
7,30.0,56.0
9,63.0,73.0


In [104]:
df1.loc[:, ["Maths"]].dropna()

Unnamed: 0,Maths
0,80.0
1,40.0
3,50.0
4,40.0
6,68.0
7,30.0
9,63.0


In [106]:
df1.loc[:, ["Maths", "Social"]].dropna()

Unnamed: 0,Maths,Social
0,80.0,90.0
1,40.0,80.0
3,50.0,60.0
6,68.0,78.0
7,30.0,56.0
9,63.0,73.0


In [108]:
df1.fillna(45)

Unnamed: 0,R.No.,Maths,Social,Science
0,11,80.0,90.0,80.0
1,12,40.0,80.0,40.0
2,13,45.0,45.0,45.0
3,14,50.0,60.0,50.0
4,15,40.0,45.0,40.0
5,16,45.0,85.0,45.0
6,17,68.0,78.0,68.0
7,18,30.0,56.0,30.0
8,19,45.0,45.0,45.0
9,20,63.0,73.0,63.0


In [110]:
df1

Unnamed: 0,R.No.,Maths,Social,Science
0,11,80.0,90.0,80.0
1,12,40.0,80.0,40.0
2,13,,,
3,14,50.0,60.0,50.0
4,15,40.0,,40.0
5,16,,85.0,
6,17,68.0,78.0,68.0
7,18,30.0,56.0,30.0
8,19,,,
9,20,63.0,73.0,63.0


In [112]:
df1.fillna(45, inplace = True)

In [114]:
df1

Unnamed: 0,R.No.,Maths,Social,Science
0,11,80.0,90.0,80.0
1,12,40.0,80.0,40.0
2,13,45.0,45.0,45.0
3,14,50.0,60.0,50.0
4,15,40.0,45.0,40.0
5,16,45.0,85.0,45.0
6,17,68.0,78.0,68.0
7,18,30.0,56.0,30.0
8,19,45.0,45.0,45.0
9,20,63.0,73.0,63.0


In [116]:
df2 = pd.DataFrame(
    {
        "R.No.":[11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        "Maths":[80, 40, np.nan, 50, 40, np.nan, 68, 30, np.nan, 63],
        "Social":[90, 80, np.nan, 60, np.nan, 85, 78, 56, np.nan, 73],
        "Science":[80, 40, np.nan, 50, 40, np.nan, 68, 30, np.nan, 63]
    }
)
df2

Unnamed: 0,R.No.,Maths,Social,Science
0,11,80.0,90.0,80.0
1,12,40.0,80.0,40.0
2,13,,,
3,14,50.0,60.0,50.0
4,15,40.0,,40.0
5,16,,85.0,
6,17,68.0,78.0,68.0
7,18,30.0,56.0,30.0
8,19,,,
9,20,63.0,73.0,63.0


In [118]:
df2["Maths"].fillna(40).head()

0    80.0
1    40.0
2    40.0
3    50.0
4    40.0
Name: Maths, dtype: float64

In [120]:
df2[["Maths"]].fillna(40).head()

Unnamed: 0,Maths
0,80.0
1,40.0
2,40.0
3,50.0
4,40.0


In [122]:
df2[["Maths", "Social"]].fillna(40).head()

Unnamed: 0,Maths,Social
0,80.0,90.0
1,40.0,80.0
2,40.0,40.0
3,50.0,60.0
4,40.0,40.0


In [124]:
df2.loc[:, "Maths"].fillna(50).head()

0    80.0
1    40.0
2    50.0
3    50.0
4    40.0
Name: Maths, dtype: float64

In [126]:
df2.loc[:, ["Maths"]].fillna(50).head()

Unnamed: 0,Maths
0,80.0
1,40.0
2,50.0
3,50.0
4,40.0


In [128]:
df2.loc[:, ["Maths", "Social"]].fillna(50).head()

Unnamed: 0,Maths,Social
0,80.0,90.0
1,40.0,80.0
2,50.0,50.0
3,50.0,60.0
4,40.0,50.0


In [130]:
df2.loc[:, ["Maths"]].fillna(df["Maths"].mean())

Unnamed: 0,Maths
0,80.0
1,40.0
2,55.166667
3,50.0
4,40.0
5,55.166667
6,68.0
7,30.0
8,55.166667
9,63.0


In [132]:
import math
df2["Maths"].fillna(math.floor(df["Maths"].mean()))

0    80.0
1    40.0
2    55.0
3    50.0
4    40.0
5    55.0
6    68.0
7    30.0
8    55.0
9    63.0
Name: Maths, dtype: float64

<h4>Handling Duplicates:</h4>

In [135]:
df3 = pd.DataFrame(
    {
        "R.No.":[101, 102, 101, 104, 105, 106, 105, 108, 109, 110],
        "Names":["Matsya", "Kurma", "Matsya", "Narasimha", "Vamana", "Parashuram", "Vamana", "Krishna", "Venkata", "Kalki"],
        "Maths":[80, 40, 80, 50, 40, 59, 40, 30, 69, 63], 
        "Social":[90, 80, 90, 60, 89, 85, 89, 56, 98, 73],
        "Science":[80, 40, 80, 50, 40, 78, 40, 30, 70, 63]
    }
)
df3

Unnamed: 0,R.No.,Names,Maths,Social,Science
0,101,Matsya,80,90,80
1,102,Kurma,40,80,40
2,101,Matsya,80,90,80
3,104,Narasimha,50,60,50
4,105,Vamana,40,89,40
5,106,Parashuram,59,85,78
6,105,Vamana,40,89,40
7,108,Krishna,30,56,30
8,109,Venkata,69,98,70
9,110,Kalki,63,73,63


In [137]:
df3.duplicated()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
8    False
9    False
dtype: bool

In [139]:
df3.duplicated().sum()

2

In [141]:
df3.drop_duplicates()

Unnamed: 0,R.No.,Names,Maths,Social,Science
0,101,Matsya,80,90,80
1,102,Kurma,40,80,40
3,104,Narasimha,50,60,50
4,105,Vamana,40,89,40
5,106,Parashuram,59,85,78
7,108,Krishna,30,56,30
8,109,Venkata,69,98,70
9,110,Kalki,63,73,63


In [143]:
df3

Unnamed: 0,R.No.,Names,Maths,Social,Science
0,101,Matsya,80,90,80
1,102,Kurma,40,80,40
2,101,Matsya,80,90,80
3,104,Narasimha,50,60,50
4,105,Vamana,40,89,40
5,106,Parashuram,59,85,78
6,105,Vamana,40,89,40
7,108,Krishna,30,56,30
8,109,Venkata,69,98,70
9,110,Kalki,63,73,63


In [145]:
df3.drop_duplicates(inplace = True)

In [147]:
df3

Unnamed: 0,R.No.,Names,Maths,Social,Science
0,101,Matsya,80,90,80
1,102,Kurma,40,80,40
3,104,Narasimha,50,60,50
4,105,Vamana,40,89,40
5,106,Parashuram,59,85,78
7,108,Krishna,30,56,30
8,109,Venkata,69,98,70
9,110,Kalki,63,73,63
