In [64]:
import pandas as pd

In [65]:
# Build a small sample dataset: three flowers, each listed with two colours.
# No (Flower, Colour) pair repeats, but each column on its own has repeats.

data = dict(
    Flower=['Rose', 'Rose', 'Marigold', 'Marigold', 'Hibiscus', 'Hibiscus'],
    Colour=['Red', 'White', 'Yellow', 'Orange', 'Pink', 'Red'],
)

### We will now create a DataFrame from the dictionary and give it a custom index

In [66]:
# One row label per entry in the Flower list: Flower_0, Flower_1, ...
df = pd.DataFrame(
    data,
    index=[f"Flower_{pos}" for pos in range(len(data['Flower']))],
)

In [67]:
# Display the frame; its row labels are the custom strings Flower_0 to Flower_5.
df

Unnamed: 0,Flower,Colour
Flower_0,Rose,Red
Flower_1,Rose,White
Flower_2,Marigold,Yellow
Flower_3,Marigold,Orange
Flower_4,Hibiscus,Pink
Flower_5,Hibiscus,Red


### Note: Both columns contain repeated values (each flower appears twice, and the colour Red appears twice), but no complete row is repeated

### Now we can start exploring how to remove duplicates

### 1. Remove duplicates without mentioning the column name

In [68]:
# No `subset` is given, so a row counts as a duplicate only if it matches an
# earlier row in every column.
df1 = df.drop_duplicates()

In [69]:
# Display the result; all six rows are still present.
df1

Unnamed: 0,Flower,Colour
Flower_0,Rose,Red
Flower_1,Rose,White
Flower_2,Marigold,Yellow
Flower_3,Marigold,Orange
Flower_4,Hibiscus,Pink
Flower_5,Hibiscus,Red


In [70]:
# Sanity check: de-duplicating on all columns must not have dropped any row.
assert df.shape[0] == df1.shape[0]

### There is no change in the data frame. When no column name is given, pandas <u>considers the combination of all column values</u> when looking for duplicates. Here, every combination of Flower and Colour is unique, hence no rows were removed

In [71]:
# Deliberately add a duplicate row to the dataframe.

# The next free integer position equals the current number of rows.

print("New Data to be added at index: ", len(df))

New Data to be added at index:  6


In [72]:
# Append a second (Rose, Red) row. Its label is the integer len(df.index),
# not a 'Flower_N' string like the existing rows.
df.loc[len(df.index)] =  ['Rose', 'Red']

In [73]:
# Display the frame; the appended row carries the integer label 6.
df

Unnamed: 0,Flower,Colour
Flower_0,Rose,Red
Flower_1,Rose,White
Flower_2,Marigold,Yellow
Flower_3,Marigold,Orange
Flower_4,Hibiscus,Pink
Flower_5,Hibiscus,Red
6,Rose,Red


### Notice that the new row's index label is the integer 6, which does not follow the `Flower_N` labels we set earlier. We can add a row that follows the pattern by specifying the custom label explicitly

In [74]:
# Add the same (Rose, Red) values again, this time under an explicit
# 'Flower_6' label that matches the existing naming pattern.
df.loc['Flower_6'] = ['Rose','Red']

In [75]:
# Display the frame; it now ends with two (Rose, Red) rows labelled 6 and Flower_6.
df

Unnamed: 0,Flower,Colour
Flower_0,Rose,Red
Flower_1,Rose,White
Flower_2,Marigold,Yellow
Flower_3,Marigold,Orange
Flower_4,Hibiscus,Pink
Flower_5,Hibiscus,Red
6,Rose,Red
Flower_6,Rose,Red


### Now if we try to remove duplicates without mentioning any column, the last two rows should get removed

In [76]:
# Drop every row that repeats an earlier row in all columns.
# Note: this rebinds `df`, so the two appended rows are not available in later cells.
df = df.drop_duplicates()

In [77]:
# Display the frame; rows 6 and Flower_6 are gone, leaving Flower_0 to Flower_5.
df

Unnamed: 0,Flower,Colour
Flower_0,Rose,Red
Flower_1,Rose,White
Flower_2,Marigold,Yellow
Flower_3,Marigold,Orange
Flower_4,Hibiscus,Pink
Flower_5,Hibiscus,Red


In [78]:
# Compare the row count of `df` with that of `df1`. A mismatch is reported as a
# printed message instead of a traceback so the notebook keeps running.
# Here both frames have six rows, so nothing is printed.
try:
    assert len(df) == len(df1), "Some data has been removed"
except AssertionError as err:
    # Catch only the failed assertion; any other error (e.g. an undefined
    # name on a stale kernel) should surface instead of being printed.
    print(err)

## Yay!! Nothing was printed, so `df` is back to the same six rows as `df1`

###  2. Remove duplicates from particular columns using the subset parameter

In [79]:
# Compare rows on the Flower column only: a row is dropped when its flower
# name has already appeared in an earlier row.

df1 = df.drop_duplicates(subset=["Flower"])

In [80]:
# Display the result; one row per flower remains (Flower_0, Flower_2, Flower_4).
df1

Unnamed: 0,Flower,Colour
Flower_0,Rose,Red
Flower_2,Marigold,Yellow
Flower_4,Hibiscus,Pink


### Observe how the first occurrence of each duplicate is retained and the remaining occurrences are removed

In [81]:
# Compare the row count of `df` with that of `df1`. A mismatch is reported as a
# printed message instead of a traceback so the notebook keeps running.
try:
    assert len(df) == len(df1), "Some data has been removed"
except AssertionError as err:
    # Catch only the failed assertion; any other error (e.g. an undefined
    # name on a stale kernel) should surface instead of being printed.
    print(err)

Some data has been removed


In [82]:
# Compare rows on the Colour column only: a row is dropped when its colour
# has already appeared in an earlier row.

df1 = df.drop_duplicates(subset=["Colour"])

In [83]:
# Display the result; Flower_5 is dropped because Red already appears in Flower_0.
df1

Unnamed: 0,Flower,Colour
Flower_0,Rose,Red
Flower_1,Rose,White
Flower_2,Marigold,Yellow
Flower_3,Marigold,Orange
Flower_4,Hibiscus,Pink


In [84]:
# Compare the row count of `df` with that of `df1`. A mismatch is reported as a
# printed message instead of a traceback so the notebook keeps running.
try:
    assert len(df) == len(df1), "Some data has been removed"
except AssertionError as err:
    # Catch only the failed assertion; any other error (e.g. an undefined
    # name on a stale kernel) should surface instead of being printed.
    print(err)

Some data has been removed


### 3. Remove duplicates from particular columns using the keep parameter
### The keep parameter controls which of the duplicate rows are retained. It can have 3 possible values:
#### <li> first (default): Keep the first occurrence of the duplicate data and remove the rest </li><br> <li> last: Keep the last occurrence of the duplicate data and remove the rest </li><br> <li> False: Keep no occurrence if data is duplicate </li>

In [85]:
# keep="first": for each flower, retain the first occurrence and drop the later ones

df1 = df.drop_duplicates(subset="Flower", keep="first")

In [86]:
# Display the result; the first row of each flower is kept (Flower_0, Flower_2, Flower_4).
df1

Unnamed: 0,Flower,Colour
Flower_0,Rose,Red
Flower_2,Marigold,Yellow
Flower_4,Hibiscus,Pink


In [87]:
# keep="last": for each flower, retain the last occurrence and drop the earlier ones

df1 = df.drop_duplicates(subset="Flower", keep="last")

In [88]:
# Display the result; the last row of each flower is kept (Flower_1, Flower_3, Flower_5).
df1

Unnamed: 0,Flower,Colour
Flower_1,Rose,White
Flower_3,Marigold,Orange
Flower_5,Hibiscus,Red


In [None]:
# keep no occurence of the duplicates

d