# Extract Image Similarity Values

Note that Tao initially provides *distance* values. That is, 0.0 means that two images are perfectly identical and 1.0 means that they are perfectly different. I arithmetically invert this to get a similarity value.

Furthermore, Tao's image data is *not* just from the roxy data; it comes from a wide variety of sources. To deal with this, I disregard all images that aren't from the roxy data.

In [1]:
from itertools import combinations
import ujson as json
import numpy as np
import os
import pandas as pd

root_path = os.path.abspath('../tao_cams/')
fpaths = [os.path.join(root_path, x) for x in os.listdir(root_path) if x[-5:] == '.json']

In [2]:
all_pairs = combinations([x.split('/')[-1].split('.')[0] for x in fpaths], 2)
index_tuples = [frozenset(x) for x in all_pairs]
all_pairs_df = pd.DataFrame({'ad_img_pairs': index_tuples,
                             'ht_ads_id': np.nan,
                             'similarity': np.nan,
                             'ht_images_id': np.nan,
                             'image_urls': '?',
                             'sha1': '?',
                             'tld': '?'})
print(all_pairs_df.shape)
all_pairs_df.head()

(548628, 7)


Unnamed: 0,ad_img_pairs,ht_ads_id,ht_images_id,image_urls,sha1,similarity,tld
0,"(00b17bf5aa94029b3405a81e35e803c10471dddc, 006...",,,?,?,,?
1,"(006b0743ef18bcefa4ccdb5204690a8947bf25a0, 00e...",,,?,?,,?
2,"(00fa0707ef1b8e415f911512deafcd9cc4332be1, 006...",,,?,?,,?
3,"(006b0743ef18bcefa4ccdb5204690a8947bf25a0, 010...",,,?,?,,?
4,"(0116d8cfdd4520097e257fcb9bc8e580fde23ff4, 006...",,,?,?,,?


In [3]:
roxy_prefix = 'https://s3.amazonaws.com/roxyimages/'

df_list = []
for fpath in fpaths:
    jsn = json.load(open(fpath))['images'][0]['similar_images']
    id_one = fpath.split('/')[-1].rstrip('.json')
    
    del jsn['number']
    jsn['cache_id'] = [x.lstrip(roxy_prefix).rstrip('.jpg')
                                for x in jsn['cached_image_urls']]
    del jsn['cached_image_urls']
    jsn['ad_img_pairs'] = [frozenset([id_one, x]) for x in jsn['cache_id']]
    del jsn['cache_id']
    jsn['tld'] = ['?' if x is None else x.split('/')[2] for x in jsn['page_urls']]
    del jsn['page_urls']
    jsn['ht_images_id'] = [np.nan if x is None else x for x in jsn['ht_images_id']]
    jsn['ht_ads_id'] = [np.nan if x is None else x for x in jsn['ht_ads_id']]
    jsn['sha1'] = [x.lower() for x in jsn['sha1']]
    jsn['similarity'] = [max(1. - float(x), 0.) for x in jsn['distance']]
    del jsn['distance']
    del jsn['image_urls']
    
    df_list.append(pd.DataFrame(jsn))

    
df = pd.concat(df_list)
del df_list
print(df.shape)
df = df.ix[df['ad_img_pairs'].apply(lambda x: len(x) > 1), :]
print(df.shape)
df.head()

(60552, 6)
(59822, 6)


Unnamed: 0,ad_img_pairs,ht_ads_id,ht_images_id,sha1,similarity,tld
1,"(006b0743ef18bcefa4ccdb5204690a8947bf25a0, 178...",8956498,7762242,6866cb60c4aedae1f19e2426845c1fcc12917c8a,0.644679,www.myproviderguide.com
2,"(006b0743ef18bcefa4ccdb5204690a8947bf25a0, bc8...",10196594,28546354,0cf0b2d258e37e0e31b56db79bdb68dcb3b6c690,0.592387,727-495-9078.escortsincollege.com
3,"(b5102380ef9c4116c6df4afc207cb086a9979b, 006b0...",27295569,40681331,a4c248157dc60a0cebc2c6a57e9e826112a38c4c,0.568121,escortads.xxx
4,"(006b0743ef18bcefa4ccdb5204690a8947bf25a0, 263...",27265295,40596410,3101dcdf7546f01ea89113e4e3ce3b751eec6d4c,0.556701,escortsinthe.us
5,"(006b0743ef18bcefa4ccdb5204690a8947bf25a0, 1b4...",17178641,5328902,47aa2025166f658639f06258dcbe85a6197a7dfd,0.546814,pennsylvania.backpage.com


In [4]:
merged_df = pd.concat([all_pairs_df, df]).groupby('ad_img_pairs').agg(max)
print(merged_df.shape)
merged_df.head()

(608186, 6)


Unnamed: 0_level_0,ht_ads_id,ht_images_id,image_urls,sha1,similarity,tld
ad_img_pairs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(fffe21ee40490f083d0b6b3858fd0b9a782ae0ac, uImages/Connecticut_2013_10_25_1382744052000_4_0)",,4065489.0,,11a7ed6c4c4e2875ba12480769dc91a81da61f6e,0.603257,?
"(006b0743ef18bcefa4ccdb5204690a8947bf25a0, 00e02efc74cb4319ef93af6bbc1db1afb3a2712c)",,,?,?,,?
"(00fa0707ef1b8e415f911512deafcd9cc4332be1, 006b0743ef18bcefa4ccdb5204690a8947bf25a0)",,,?,?,,?
"(006b0743ef18bcefa4ccdb5204690a8947bf25a0, 010b77d4f14bee2f9cb40404d62d96329bc89c89)",,,?,?,,?
"(0116d8cfdd4520097e257fcb9bc8e580fde23ff4, 006b0743ef18bcefa4ccdb5204690a8947bf25a0)",,,?,?,,?


In [7]:
df.to_pickle('image_similarity_df.pkl')