In [2]:
'''
The following script is used to remove duplicate images in the data set
The way to detect duplicate image is to check if the hash value of two images are the same
Note that the changes are performed in place,
so please make sure you have a copy of the csv file and the image folder to "undo" the process
'''
# the csv file and the image folder to remove duplicate
TARGET_CSV_FILE = r"C:\Users\jdu12\Desktop\humpback\train.csv"
TARGET_IMG_PATH = r"C:\Users\jdu12\Desktop\humpback\train"

In [6]:
import numpy as np
import pandas as pd
from PIL import Image
import imagehash
import collections
import os

def getImageHash(imagePath):
    """
    Fuction to calculate the image's hash value given the image path
    
    Args:
        imagePath: the path to the image
    Returns:
        str: hash value of the image.
    """
    with Image.open(imagePath) as img:
        imgHash = imagehash.phash(img)
        return str(imgHash)

def get_train_input(targetCSVFile, targetImageFolder):
    """
    Fuction to load the CSV file and append the hash value of the corresponding images
    
    Args:
        targetCSVFile: the path to the CSV file
        targetImageFolder: the folder of the images

    Returns:
        pandas.DataFrame: data of the CSV with index (autoincrement); Image (filename); Id (label); Hash
    """
    train_input = pd.read_csv(targetCSVFile)
    
    imgHashes = train_input.Image.apply(lambda imageFile: getImageHash(os.path.join(targetImageFolder,imageFile)))
    train_input["Hash"] = [hashValue for hashValue in imgHashes]
    
    return train_input

def getHashWithDuplicate(dataWithHash):
    hashAndCounts = dataWithHash.Hash.value_counts()
    return hashAndCounts.loc[hashAndCounts>1]

def showDuplicateData(dataWithHash, ignoredNewWhale = True):
    """
    Fuction to show duplicate data in the dataset (duplicate images with different labels)
    
    Args:
        dataWithHash: data obtained from get_train_input (included hash value)
        ignoredNewWhale: the folder of the images
    """
    hashWithDuplicate = getHashWithDuplicate(dataWithHash)
    
    newWhaleCount = 0
    numOfDuplicate = 0
    _numOfDuplicate = 0
    for hashValue in hashWithDuplicate.index:
        duplicatedInfo = train_input[train_input.Hash==hashValue]
        shownId = set(duplicatedInfo.Id)
        
        numOfDuplicate = numOfDuplicate + (len(duplicatedInfo)-len(shownId))
        _numOfDuplicate = _numOfDuplicate + len(duplicatedInfo)
        if 'new_whale' in shownId:
            newWhaleCount += 1
            numOfDuplicate -= 1
        if ignoredNewWhale:
            shownId.discard('new_whale')
            
        if len(shownId) >= 1:
            print("Duplicate images: {}, set of Ids: {}".format(duplicatedInfo.Image.tolist(), shownId))
    if not ignoredNewWhale:
        print("Number of 'new whale' in duplicate images: {}".format(newWhaleCount))
    print("Number of duplicates: {}".format(numOfDuplicate))
    print("Number of _duplicates: {}".format(_numOfDuplicate))
            
def inconsistentDataIndex(dataWithHash, newWhaleOnly = True):
    """
    Fuction to show inconsistent data in the dataset (duplicate images with different labels)
    
    Args:
        dataWithHash: data obtained from get_train_input (included hash value)
        ignoredNewWhale: the folder of the images

    Returns:
        pandas.DataFrame: data of the CSV with index (autoincrement); Image (filename); Id (label); Hash
    """
    hashWithDuplicate = getHashWithDuplicate(dataWithHash)
    
    targetList = []
    for hashValue in hashWithDuplicate.index:
        duplicatedInfo = train_input[(train_input.Hash==hashValue) & ((train_input.Id == 'new_whale') | (not newWhaleOnly))]
        
        if len(duplicatedInfo.index.values)>0:
            targetList += duplicatedInfo.index.values.tolist()
    return targetList

def removeInconsistentData(dataWithHash, targetCSVFile, targetImageFolder, newWhaleOnly = True):
    rowIndices = inconsistentDataIndex(dataWithHash, newWhaleOnly)
    
    # remove images
    for imagePath in dataWithHash.loc[dataWithHash.index.isin(toRemove)].Image.tolist():
        os.remove(os.path.join(targetImageFolder, imagePath))
    
    # drop rows in CSV
    dataWithHash.drop(rowIndices, inplace=True)
    dataWithHash.to_csv(targetCSVFile, index=False, columns=['Image', 'Id'])

def removeDuplicateData(dataWithHash, targetCSVFile, targetImageFolder):
    # remove images
    for imagePath in dataWithHash[dataWithHash.duplicated(subset=['Id','Hash'], keep='first')].Image.tolist():
        os.remove(os.path.join(targetImageFolder, imagePath))
    
    # drop rows in CSV
    dataWithHash.drop_duplicates(subset=['Id','Hash'], keep='first', inplace=True)
    dataWithHash.to_csv(targetCSVFile, index=False, columns=['Image', 'Id'])

In [7]:
train_input = get_train_input(TARGET_CSV_FILE, TARGET_IMG_PATH)

In [8]:
showDuplicateData(train_input, False)

Duplicate images: ['59becb6c.jpg', 'cc68d9f2.jpg', 'eb026a29.jpg'], set of Ids: {'w_cae7677', 'new_whale'}
Duplicate images: ['7b738ee1.jpg', 'c0785cd9.jpg'], set of Ids: {'w_5c23454'}
Duplicate images: ['f1b7c15c.jpg', 'ff2d0d82.jpg'], set of Ids: {'w_17ee910', 'new_whale'}
Duplicate images: ['2985c2ae.jpg', '36c04f89.jpg'], set of Ids: {'w_49bbc79'}
Duplicate images: ['4123189d.jpg', '637b28a7.jpg'], set of Ids: {'w_efd3f81', 'new_whale'}
Duplicate images: ['b15a5565.jpg', 'f78d5e9e.jpg'], set of Ids: {'w_db7e2c5'}
Duplicate images: ['54d52fe1.jpg', 'db0c4225.jpg'], set of Ids: {'w_5e25f9f'}
Duplicate images: ['3b1a143a.jpg', 'd7f63ee6.jpg'], set of Ids: {'w_dc81791'}
Duplicate images: ['e2e68da4.jpg', 'fa29b1d9.jpg'], set of Ids: {'w_06a6351'}
Duplicate images: ['9d4ec8e6.jpg', 'a9827190.jpg'], set of Ids: {'w_a837660'}
Duplicate images: ['27627742.jpg', '36118167.jpg'], set of Ids: {'w_fd3ce71', 'new_whale'}
Duplicate images: ['490d05ea.jpg', 'f2d9b75f.jpg'], set of Ids: {'w_4a38a9

Duplicate images: ['2c3e707c.jpg', 'fe3329b4.jpg'], set of Ids: {'w_a837660', 'new_whale'}
Duplicate images: ['3ae20220.jpg', 'b8951041.jpg'], set of Ids: {'w_93e1b76', 'new_whale'}
Duplicate images: ['965f25eb.jpg', 'c5c8305d.jpg'], set of Ids: {'w_80b1c48'}
Duplicate images: ['0025e8c2.jpg', 'f4c81765.jpg'], set of Ids: {'w_8b1ca89'}
Duplicate images: ['72ef4e0f.jpg', '78c06fcd.jpg'], set of Ids: {'w_1a70685'}
Duplicate images: ['95d95348.jpg', 'dd25e679.jpg'], set of Ids: {'w_ca8bfb4'}
Duplicate images: ['06b62519.jpg', '883796e8.jpg'], set of Ids: {'w_5dc1c2d'}
Duplicate images: ['2f4347f4.jpg', '66cb1bde.jpg'], set of Ids: {'w_56bbc91'}
Duplicate images: ['41981a88.jpg', '74934a2c.jpg'], set of Ids: {'w_6361632', 'new_whale'}
Duplicate images: ['30ca0299.jpg', '75cd5008.jpg'], set of Ids: {'w_f5771d1', 'w_1000f90'}
Duplicate images: ['99caa8ec.jpg', 'ba95dda5.jpg'], set of Ids: {'w_2855124'}
Duplicate images: ['b1d3d395.jpg', 'cb7f7d57.jpg'], set of Ids: {'w_1e5a146', 'new_whale'}

Duplicate images: ['50ada087.jpg', '9dc8cf7f.jpg'], set of Ids: {'w_d9055d1'}
Duplicate images: ['4ca241b1.jpg', '5f8d9492.jpg'], set of Ids: {'w_ef89416', 'new_whale'}
Duplicate images: ['806cf583.jpg', 'c5da34e7.jpg'], set of Ids: {'w_0bc1db0'}
Duplicate images: ['3ce7d968.jpg', 'b4a17d44.jpg'], set of Ids: {'w_12cdfbd'}
Duplicate images: ['ba4a4729.jpg', 'f30cb844.jpg'], set of Ids: {'w_64f3545'}
Duplicate images: ['67758056.jpg', '9cd2fa20.jpg'], set of Ids: {'w_d8e752e', 'new_whale'}
Duplicate images: ['263246e0.jpg', '7db2f364.jpg'], set of Ids: {'w_f5b8faf'}
Duplicate images: ['4390f29c.jpg', '67f88e3e.jpg'], set of Ids: {'w_778ee6e'}
Duplicate images: ['bd91e244.jpg', 'c941efe9.jpg'], set of Ids: {'w_22bcbd6', 'new_whale'}
Duplicate images: ['7e9a738f.jpg', 'd6b5f3bc.jpg'], set of Ids: {'w_3349c9d'}
Duplicate images: ['776b0b63.jpg', '8d7606bf.jpg'], set of Ids: {'w_89e159a'}
Duplicate images: ['1efe5ef7.jpg', 'f5287173.jpg'], set of Ids: {'w_11f6df1', 'new_whale'}
Duplicate im

Duplicate images: ['33ec74f3.jpg', 'b45cb8ed.jpg'], set of Ids: {'w_f8e6546', 'new_whale'}
Duplicate images: ['527ff29a.jpg', 'fb7b0e02.jpg'], set of Ids: {'w_c74ab24', 'new_whale'}
Duplicate images: ['3fb0d64d.jpg', 'c7b6ded2.jpg'], set of Ids: {'w_c30959a'}
Duplicate images: ['667a6ea6.jpg', 'ef3578e6.jpg'], set of Ids: {'w_3b0894d'}
Duplicate images: ['e5dfd9ab.jpg', 'e9b3dfdf.jpg'], set of Ids: {'w_dcf2001'}
Duplicate images: ['220d0887.jpg', '54ea1e8d.jpg'], set of Ids: {'w_1854334'}
Duplicate images: ['56893b19.jpg', 'baf56258.jpg'], set of Ids: {'w_e7f8e67'}
Duplicate images: ['2b2eb6e1.jpg', '60b8927d.jpg'], set of Ids: {'w_6737f89', 'new_whale'}
Duplicate images: ['11ba757f.jpg', '5988332d.jpg'], set of Ids: {'w_37bd99a'}
Duplicate images: ['4bd5bedb.jpg', '5e467657.jpg'], set of Ids: {'w_2f6a962'}
Duplicate images: ['a76fa33b.jpg', 'a97f7dca.jpg'], set of Ids: {'w_9ff699b'}
Duplicate images: ['5e1e8352.jpg', 'a971c563.jpg'], set of Ids: {'w_aa3d7b8', 'new_whale'}
Duplicate im

Duplicate images: ['84bd38f3.jpg', 'c398d294.jpg'], set of Ids: {'w_7c18f3c', 'new_whale'}
Duplicate images: ['5d7102b4.jpg', '64b9e47d.jpg'], set of Ids: {'w_41ed8e8'}
Duplicate images: ['432f627b.jpg', 'c9213ad9.jpg'], set of Ids: {'w_83714b7'}
Duplicate images: ['19e2b8fd.jpg', '43fbabbb.jpg'], set of Ids: {'w_2071a4c'}
Duplicate images: ['37544081.jpg', '3a831f39.jpg'], set of Ids: {'w_d6df554'}
Duplicate images: ['b7f01086.jpg', 'd60ecdd5.jpg'], set of Ids: {'w_3039e7a', 'new_whale'}
Duplicate images: ['5ac4811d.jpg', '6415a7b6.jpg'], set of Ids: {'w_17d5eb9'}
Duplicate images: ['8d4b7dc2.jpg', '9f569a7d.jpg'], set of Ids: {'w_0c70bc3'}
Duplicate images: ['2862d1bf.jpg', '7df73caa.jpg'], set of Ids: {'w_d6815ff'}
Duplicate images: ['660e39f4.jpg', '8df52ebd.jpg'], set of Ids: {'w_92be3ca', 'new_whale'}
Duplicate images: ['5bba3adb.jpg', '636e2e2e.jpg'], set of Ids: {'w_a59905f'}
Duplicate images: ['2496a4a9.jpg', 'af9895ae.jpg'], set of Ids: {'w_a965f14'}
Duplicate images: ['76a22

Duplicate images: ['4e37d286.jpg', '610241d4.jpg'], set of Ids: {'w_8e451d9'}
Duplicate images: ['68776b72.jpg', '84e9b373.jpg'], set of Ids: {'w_9868b95'}
Duplicate images: ['5d2c69ae.jpg', '99e6f6a0.jpg'], set of Ids: {'w_86e5d8c'}
Duplicate images: ['8083a4af.jpg', 'a77f4d68.jpg'], set of Ids: {'w_fd1308e'}
Duplicate images: ['e7c1c10f.jpg', 'fac671ec.jpg'], set of Ids: {'w_def715a'}
Duplicate images: ['85140821.jpg', 'cd274fb6.jpg'], set of Ids: {'w_fdf60bb'}
Duplicate images: ['69d946a0.jpg', '77ccf90d.jpg'], set of Ids: {'w_e700deb'}
Duplicate images: ['3cb1fd98.jpg', 'bdcdff28.jpg'], set of Ids: {'w_9b401eb'}
Duplicate images: ['239c83c2.jpg', '73332aa2.jpg'], set of Ids: {'w_54c00ad', 'new_whale'}
Duplicate images: ['2e8b0594.jpg', '34b06b3b.jpg'], set of Ids: {'w_516bedb'}
Duplicate images: ['44522955.jpg', '880ba235.jpg'], set of Ids: {'w_cef690d'}
Duplicate images: ['040e5bd6.jpg', '3c2540c3.jpg'], set of Ids: {'w_86b3de4'}
Duplicate images: ['70a02704.jpg', '8eb5b714.jpg'],

In [9]:
removeInconsistentData(train_input, TARGET_CSV_FILE, TARGET_IMG_PATH)

NameError: name 'toRemove' is not defined

In [10]:
removeDuplicateData(train_input, TARGET_CSV_FILE, TARGET_IMG_PATH)

In [11]:
showDuplicateData(train_input, False)

Duplicate images: ['00863b8c.jpg', 'e9813721.jpg'], set of Ids: {'w_bc93297', 'new_whale'}
Duplicate images: ['0376d91d.jpg', '868061cd.jpg'], set of Ids: {'w_1274a11', 'new_whale'}
Duplicate images: ['27627742.jpg', '36118167.jpg'], set of Ids: {'w_fd3ce71', 'new_whale'}
Duplicate images: ['5b68e47c.jpg', 'c7cee76e.jpg'], set of Ids: {'w_6545984', 'new_whale'}
Duplicate images: ['0c88a7aa.jpg', '6699001e.jpg'], set of Ids: {'w_d36f58c', 'new_whale'}
Duplicate images: ['7310c41a.jpg', 'af079b53.jpg'], set of Ids: {'w_5436d75', 'new_whale'}
Duplicate images: ['02c8235c.jpg', 'dcc5b795.jpg'], set of Ids: {'w_e59a1f0', 'new_whale'}
Duplicate images: ['024144e5.jpg', 'e17384fd.jpg'], set of Ids: {'w_0f16be3', 'new_whale'}
Duplicate images: ['451c8b8f.jpg', 'e940bee9.jpg'], set of Ids: {'w_02bb4cf', 'new_whale'}
Duplicate images: ['9cbc9f7f.jpg', 'bc803c71.jpg'], set of Ids: {'w_964c1b3', 'new_whale'}
Duplicate images: ['6ad9c4b8.jpg', '8a26e9f8.jpg'], set of Ids: {'w_44cccf6', 'new_whale'}

Duplicate images: ['1b5feb2a.jpg', '8e3dd01a.jpg'], set of Ids: {'w_8c9ed42', 'new_whale'}
Duplicate images: ['2f778f6c.jpg', 'bb2a1714.jpg'], set of Ids: {'w_3de676c', 'new_whale'}
Duplicate images: ['5a3f0084.jpg', '87ec06b2.jpg'], set of Ids: {'w_ad3dd0b', 'new_whale'}
Duplicate images: ['9529dfe3.jpg', 'e0030e15.jpg'], set of Ids: {'w_f75302f', 'new_whale'}
Duplicate images: ['1efe5ef7.jpg', 'f5287173.jpg'], set of Ids: {'w_11f6df1', 'new_whale'}
Duplicate images: ['4398e933.jpg', 'fa2b1d93.jpg'], set of Ids: {'w_c666071', 'new_whale'}
Duplicate images: ['0e4e6449.jpg', 'c80d3657.jpg'], set of Ids: {'w_63d10a1', 'new_whale'}
Duplicate images: ['5844f3f2.jpg', '73bbaf2d.jpg'], set of Ids: {'w_94ba67d', 'new_whale'}
Duplicate images: ['391ae1f9.jpg', '42e24810.jpg'], set of Ids: {'w_06972d2', 'new_whale'}
Duplicate images: ['c31459be.jpg', 'ec506248.jpg'], set of Ids: {'w_bb2d34d', 'new_whale'}
Duplicate images: ['626ff3eb.jpg', '802dee44.jpg'], set of Ids: {'w_f115d53', 'new_whale'}