### Finding duplicate rows

Finding duplicate rows is a common task (and a common interview question).

In [61]:
# Sample rows, some of which repeat.
# Each line below holds one duplicated pair followed by one unique row;
# a row is duplicated exactly when v1 == v2, which makes the repeats easy to spot.
entries = [
    [0, 0], [0, 0], [1, 0],
    [1, 1], [1, 1], [2, 1],
    [2, 2], [2, 2], [3, 2],
    [3, 3], [3, 3], [4, 3],
    [4, 4], [4, 4], [5, 4],
    [5, 5], [5, 5], [6, 5],
]

# column names for the two values in each row
headers = ["v1", "v2"]

In [62]:
import pandas as pd

In [63]:
# load the sample rows into a DataFrame whose columns are named by `headers`
df = pd.DataFrame(data=entries, columns=headers)

In [64]:
# display the frame so the repeated rows can be checked by eye
df

Unnamed: 0,v1,v2
0,0,0
1,0,0
2,1,0
3,1,1
4,1,1
5,2,1
6,2,2
7,2,2
8,3,2
9,3,3


In [65]:
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query string `q` with pandasql's `sqldf` and return its result.

    The notebook's global namespace is passed as the environment argument, so
    queries can refer to DataFrames defined in other cells (e.g. `df`) by their
    variable name.
    """
    # globals() is evaluated on every call, so names defined after this cell
    # was run are visible to later queries as well
    return sqldf(q, globals())

### Use RowID

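If you have a rowid (as in SQLite) or another unique sequential identifier for each row, you can use MIN or MAX over each group of identical values to identify rows that have duplicates: a group whose MIN and MAX identifiers differ contains more than one row.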

In [66]:
# a (v1, v2) group whose MIN and MAX rowid differ contains more than one row,
# i.e. that row value is duplicated
pysqldf(
    "SELECT v1, v2, min(rowid), max(rowid) "
    "FROM df "
    "GROUP BY v1, v2"
)

Unnamed: 0,v1,v2,min(rowid),max(rowid)
0,0,0,1,2
1,1,0,3,3
2,1,1,4,5
3,2,1,6,6
4,2,2,7,8
5,3,2,9,9
6,3,3,10,11
7,4,3,12,12
8,4,4,13,14
9,5,4,15,15


In [67]:
# Leverage this to list the redundant copies: every row whose rowid is NOT the
# MIN rowid of its (v1, v2) group. The row holding the smallest rowid in each
# group is excluded, so unique rows never appear and each duplicated value
# shows up once per extra copy.
pysqldf("""
SELECT 
    rowid, * 
FROM 
    df
WHERE 
    rowid 
NOT IN
    (SELECT 
        min(rowid) 
    FROM df 
        GROUP BY v1, v2
    )
""")

Unnamed: 0,rowid,v1,v2
0,2,0,0
1,5,1,1
2,8,2,2
3,11,3,3
4,14,4,4
5,17,5,5


### Without a rowid

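If you don't have a rowid (or your database doesn't auto-generate one for you), you can use a window function — `ROW_NUMBER()` with `PARTITION BY` — to number the rows within each group of identical values and pick out the duplicates.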

In [68]:
# Number the rows inside each (v1, v2) partition. ROW_NUMBER() restarts at 1 for
# every partition, so the first copy of a row value always gets row_id = 1 and
# any row with row_id > 1 is a redundant copy.
# Unlike a table-wide rowid, the per-partition minimum is always 1, so comparing
# against a MIN(row_id) subquery is unnecessary; filter on row_id directly.
pysqldf("""
WITH df_1 AS
(
    SELECT
        a.v1,
        a.v2,
        ROW_NUMBER() OVER (PARTITION BY a.v1, a.v2) AS row_id
    FROM
        df a
)

SELECT
    *
FROM
    df_1
WHERE
    row_id > 1
""")

Unnamed: 0,v1,v2,row_id
0,0,0,2
1,1,1,2
2,2,2,2
3,3,3,2
4,4,4,2
5,5,5,2
