# 25. How to find and remove duplicate rows in pandas?

In [1]:
import pandas as pd
# Column names of the users table, kept for reference. The list is not passed to
# read_csv below, so the column labels come from the CSV file's own header row.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# Load the users table and use the 'user_id' column as the row index.
users = pd.read_csv(
    "data/movieuser.csv",
    index_col='user_id',
)
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [2]:
# (rows, columns) of the table. 'user_id' was loaded as the index, so it is not
# counted among the columns.
users.shape

(943, 4)

In [3]:
# Calling duplicated() on a single column (a Series) flags every value that has
# already appeared earlier in that column; summing the boolean flags counts them.
users['zip_code'].duplicated().sum()

148

In [4]:
# Calling duplicated() on the DataFrame compares entire rows (all columns).
# With keep='first' (the default) the first occurrence of each row is NOT flagged,
# so the sum is the number of surplus copies rather than the number of rows
# involved in a duplicate group.
users.duplicated(keep='first').sum()

7

In [5]:
# duplicated() returns a boolean Series, which can be passed to .loc to select rows.
# The 'keep' parameter decides which occurrences are flagged as duplicates:
#   keep='first' -> every occurrence except the first is flagged (default)
#   keep='last'  -> every occurrence except the last is flagged
#   keep=False   -> all occurrences are flagged, so every duplicated row is shown
users.loc[users.duplicated(keep=False)]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402


In [6]:
# drop_duplicates() returns a new DataFrame with duplicate rows removed,
# optionally considering only certain columns; 'users' itself is not modified.
# The 'keep' parameter controls which row of each duplicate group is kept.
(
    users
    .drop_duplicates(keep='first')
    .shape
)

(936, 4)

In [7]:
# BONUS:
# What if you want to consider only certain columns when identifying duplicates?
# Pass a list of column names to the 'subset' parameter. Both 'duplicated' (used
# here to count the matches) and 'drop_duplicates' accept it.
users.duplicated(subset=['age', 'zip_code'], keep='first').sum()

16