# Removing Duplicate Poster Images for Each Movie

In [1]:
!pip install imagehash

Collecting imagehash
  Downloading ImageHash-4.2.1.tar.gz (812 kB)
[?25l[K     |▍                               | 10 kB 20.8 MB/s eta 0:00:01[K     |▉                               | 20 kB 8.6 MB/s eta 0:00:01[K     |█▏                              | 30 kB 7.3 MB/s eta 0:00:01[K     |█▋                              | 40 kB 6.8 MB/s eta 0:00:01[K     |██                              | 51 kB 4.3 MB/s eta 0:00:01[K     |██▍                             | 61 kB 5.0 MB/s eta 0:00:01[K     |██▉                             | 71 kB 5.2 MB/s eta 0:00:01[K     |███▎                            | 81 kB 5.1 MB/s eta 0:00:01[K     |███▋                            | 92 kB 5.7 MB/s eta 0:00:01[K     |████                            | 102 kB 5.3 MB/s eta 0:00:01[K     |████▍                           | 112 kB 5.3 MB/s eta 0:00:01[K     |████▉                           | 122 kB 5.3 MB/s eta 0:00:01[K     |█████▎                          | 133 kB 5.3 MB/s eta 0:00:01[K     |██

In [2]:
from PIL import Image
from tqdm.notebook import tqdm
from pathlib import Path
import imagehash
import pickle
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd

# Pickle file holding the poster table; it is read with pd.read_pickle and is
# expected to provide the "movie", "tconst" and "file_path" columns used below.
movies_path = Path("movie_posters.pkl")


# Two posters of the same movie count as duplicates when their dHash bit arrays
# differ in fewer than THRESHOLD positions (strict "<" comparison).
THRESHOLD = 20  # chosen by inspecting a random sample of duplicate posters

In [None]:
# Read the pickled poster table and order its rows by the "movie" column.
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle you trust.
posters = pd.read_pickle(movies_path).sort_values("movie")

In [None]:
!unzip posters.zip # the sample's posters images 

In [None]:
# calculate each poster image's hash value

def get_img_dhash(row):
    """Attach the difference hash (dHash) of a poster image to its row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "tconst" and "file_path"; the image is read from
        ``posters/<tconst>/<file_path>``.

    Returns
    -------
    The same ``row`` object, with the value returned by ``imagehash.dhash``
    stored under the key ``"dhash"``.

    Raises
    ------
    Whatever ``PIL.Image.open`` raises when the file is missing or is not a
    readable image.
    """
    poster_path = f'posters/{row["tconst"]}/{row["file_path"]}'
    # Open the image in a context manager so its file handle is released as
    # soon as the hash is computed, instead of staying open for every poster.
    with Image.open(poster_path) as img:
        row['dhash'] = imagehash.dhash(img)

    return row

# Hash every poster row by row (axis=1); each returned row carries a new "dhash" entry.
posters = posters.apply(get_img_dhash, axis=1)

In [None]:
def get_duplicates(s1, s2):
    """Greedily pick which items of similar pairs should be discarded.

    ``s1`` and ``s2`` are parallel sequences: each zipped pair ``(a, b)``
    names two items that are considered similar. Pairs are processed in
    order, tracking two sets: items to keep and items to discard.

    * If neither item has been discarded yet:
      - if ``a`` is already kept, ``b`` is discarded;
      - otherwise, if ``b`` is already kept, ``a`` is discarded;
      - otherwise ``a`` becomes a kept item and ``b`` is discarded.
    * If at least one of them is already discarded, an item is discarded
      only when its partner is a kept item; otherwise nothing changes.

    Returns a list of the discarded items (order follows set iteration and
    should not be relied upon).
    """
    discarded = set()
    kept = set()

    for left, right in zip(s1, s2):
        if left in discarded or right in discarded:
            # One side was already rejected: only a match against a kept
            # item can reject the other side.
            if left in kept:
                discarded.add(right)
            if right in kept:
                discarded.add(left)
            continue

        if left in kept:
            discarded.add(right)
        elif right in kept:
            discarded.add(left)
        else:
            # First time either item is seen in a "clean" pair: keep the
            # left one as the representative, reject the right one.
            discarded.add(right)
            kept.add(left)

    return list(discarded)


In [None]:
# Flag duplicate posters, one movie (tconst) at a time.
#
# The flags are collected in a positional boolean mask and written to the
# frame once at the end. The loop must not reuse the name `posters` for the
# per-movie group: doing so would replace the full table with a single group,
# and flags set on a group are never propagated back to the full table.
dup_mask = np.zeros(len(posters), dtype=bool)

# `.indices` maps each tconst to the integer row positions of its posters, so
# the write-back below does not depend on the frame's index labels.
for tconst, positions in tqdm(posters.groupby("tconst").indices.items()):
    hash_bits = [h.hash.flatten() for h in posters["dhash"].iloc[positions]]
    # Pairwise distance = number of differing hash bits between two posters.
    dist = cdist(hash_bits, hash_bits, lambda u, v: np.count_nonzero(v != u))
    rows, cols = np.where(dist < THRESHOLD)
    off_diagonal = rows != cols  # ignore each poster's trivial match with itself

    # `dups` holds positions *within this movie's group*; translate them to
    # positions in the full table before setting the flag.
    dups = get_duplicates(rows[off_diagonal], cols[off_diagonal])
    dup_mask[positions[np.asarray(dups, dtype=np.intp)]] = True

posters["dups"] = dup_mask

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Renumber the rows 0..n-1; drop=True discards the previous index labels
# instead of keeping them as a column.
posters = posters.reset_index(drop=True)

In [None]:
# Persist `posters` (including its "dups" column) as a pickle in the working directory.
posters.to_pickle("posters_with_dup.pkl")