# Image Metadata Clustering

In [1]:
from itertools import combinations
import ujson as json
import numpy as np
import os
import pandas as pd
import pickle

In [2]:
def image_metadata_cleaner(img):
    """
    clean up image JSON from tika. 
    """
    to_del = ['meta']
    for key in img:
        if isinstance(img[key], list):
            if len(img[key]) == 1:
                img[key] = img[key][0]
                # Kill empty lists and empty strings
                # We are _assuming_ that list entries are strings
            if len(img[key]) == 0:
                to_del.append(key)
            
    for key in to_del:
        del img[key]
    
    if 'content' in img:
        img['content'] = ' '.join(img['content'].strip().split())
    else:
        img['content'] = ''
    
    if 'f_number' in img:
        if isinstance(img['f_number'], list):
            img['f_number'] = [x for x in img['f_number'] if x[0].lower() == 'f'][0]
        if isinstance(img['f_number'], str):
            try:
                img['f_number'] = float(img['f_number'][1:].strip())
            except:
                print(img['f_number'])
    
    if 'id' in img:
        img['id'] = img['id'].replace('/data/roxyimages/', '').replace('.jpg', '')
    
    # meta-feature for describing all listed features
    img['features'] = frozenset([key for key in img.keys()])
    
    if 'camera_serial_number' in img:
        img['camera_serial_number'] = ''.join(x for x in img['camera_serial_number'] if x.isdigit())
    if 'x_parsed_by' in img:
        img['x_parsed_by'] = str(set(img['x_parsed_by']))[1:-1]
    if 'artist' not in img:
        img['artist'] = ''
    if isinstance(img['color_space'], list):
        img['color_space'] = str(set(img['color_space']))[1:-1]
        
    return img

In [3]:
jsn = json.load(open('../serial_numbers/serials.json'))
imgs = [image_metadata_cleaner(x) for x in jsn['response']['docs']]

no_serial = [img for img in imgs if 'camera_serial_number' not in img]
imgs = [img for img in imgs if 'camera_serial_number' in img]
    
serials = {img['camera_serial_number'] for img in imgs}
models = {img['model'] for img in imgs}
print('There are {} images attributed to {} camera serial numbers,'.format(len(imgs), len(serials)))
print('... and to {} camera models.'.format(len(models)))

There are 974 images attributed to 349 camera serial numbers,
... and to 49 camera models.


In [4]:
# Aggressively prune the data frame by killing all unique fields and all fields with constant values
img_df = pd.DataFrame.from_records(imgs)
img_df.set_index('id', inplace=True)
print(img_df.shape)
img_df_2 = img_df.dropna(1)
print(img_df_2.shape)
img_df_2 = img_df_2.ix[:, img_df_2.apply(lambda x: x.value_counts().shape[0] != 1 , 0)]
print(img_df_2.shape)

(974, 591)
(974, 44)
(974, 40)


In [5]:
non_feature_cols = set(img_df_2.columns) - set('features')

In [6]:
def get_metadata_jaccard(ind_one, ind_two):
    row_one = img_df_2.ix[ind_one, :]
    row_two = img_df_2.ix[ind_two, :]
    set_match = 1. * len(row_one['features'] & row_two['features'])/len(row_one['features'] | row_two['features'])
    feature_match = 1.*sum(row_one.ix[non_feature_cols] == row_two.ix[non_feature_cols])/len(non_feature_cols)
    hmean = 2 * set_match * feature_match / (set_match + feature_match)
    return (set_match, feature_match, hmean)

In [7]:
metadata_index = list(combinations(list(img_df_2.index), 2))

In [8]:
metadata_jacs = [get_metadata_jaccard(x[0], x[1]) for x in metadata_index]

In [9]:
metadata_similarity_df = pd.DataFrame.from_records(metadata_jacs,
                                                   columns=['label', 'value', 'hmean'])

In [10]:
metadata_index = [frozenset([x[0], x[1]]) for x in metadata_index]
metadata_similarity_df.index = metadata_index
metadata_similarity_df.index.name = 'ad_img_pairs'

metadata_similarity_df.head()

Unnamed: 0_level_0,label,value,hmean
ad_img_pairs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(01cf0852ee19216caa38aa1eed86ba640f8e5bff, 08f3d487a285889bba7fc9e9317bf8490ccba0c5)",0.319066,0.1,0.152275
"(09e0a6f5f56c18ab31606fd1b9bdcdf9b7a3a90e, 01cf0852ee19216caa38aa1eed86ba640f8e5bff)",0.338843,0.1,0.154426
"(0ccd1144442d01933c868066e87d01b395ea029b, 01cf0852ee19216caa38aa1eed86ba640f8e5bff)",0.280443,0.1,0.14743
"(01cf0852ee19216caa38aa1eed86ba640f8e5bff, 0a59e7eaca0ddfe089cec00290a15290b1a63e82)",0.330677,0.075,0.122269
"(08f9756024e5570693b789b459bb14ed6f6ec895, 01cf0852ee19216caa38aa1eed86ba640f8e5bff)",0.333333,0.1,0.153846


In [11]:
metadata_similarity_df.to_pickle('metadata_jaccard_df.pkl')