# Advertisement Text Clustering

In [1]:
import html
import os
from html.parser import HTMLParser
from itertools import combinations

import pandas as pd
import ujson as json
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Module-level HTMLParser instance, kept available for later cells.
hp = HTMLParser(convert_charrefs=True)

# Collect the lookup result files. os.listdir() returns names in arbitrary
# order, so they are sorted to make ls[0] and the downstream row order
# reproducible from one run (and one machine) to the next.
root_path = os.path.abspath('../pedro_img_lookups/')
ls = [os.path.join(root_path, x) for x in sorted(os.listdir(root_path)) if x.endswith('.json')]

# Parse every lookup file, closing each handle as soon as it has been read
# instead of leaving one open file per lookup for the garbage collector.
jsns = []
for fpath in ls:
    with open(fpath, 'r') as fh:
        jsns.append(json.load(fh))

In [2]:
def parse_ad_text(body):
    """Return the visible text of an HTML ad body as one normalized line.

    The markup is stripped with BeautifulSoup, any character references still
    present in the extracted text are unescaped, and every run of whitespace
    is collapsed to a single space (leading/trailing whitespace is dropped).

    Parameters
    ----------
    body : str
        Raw HTML of the ad body.

    Returns
    -------
    str
        The cleaned, single-spaced text.
    """
    # Name the parser explicitly so the extracted text does not depend on
    # which optional parser libraries happen to be installed.
    raw_text = BeautifulSoup(body, 'html.parser').get_text()
    # html.unescape replaces HTMLParser.unescape, which was removed in Python 3.9.
    txt = html.unescape(raw_text)
    # str.split() with no argument already ignores leading/trailing whitespace.
    return ' '.join(txt.split())

# Flatten the lookup results into one record per (ad, image) pair.
# An ad whose `_id` was already seen in an earlier result is skipped.
seen_ids = set()
hits = []
for jsn in jsns:
    for res in jsn['hits']['hits']:
        ad_id = res['_id']
        if ad_id in seen_ids:
            continue
        seen_ids.add(ad_id)

        # A hit may lack some (or all) of the requested fields; treat a
        # missing key as "no body" / "no images" instead of raising KeyError,
        # matching how the body text was already handled.
        fields = res.get('fields', {})
        base_entry_dict = {'dig_id': ad_id, 'body': ''}
        if 'hasBodyPart.text' in fields:
            base_entry_dict['body'] = parse_ad_text(fields['hasBodyPart.text'][0])

        # One output row per image attached to the ad.
        for img_url in fields.get('hasImagePart.cacheUrl', []):
            hits.append(dict(base_entry_dict, img_url=img_url))

In [3]:
# Report the number of distinct ads kept and the number of (ad, image) rows.
print('There are {} ads, with {} images'.format(*map(len, (seen_ids, hits))))

There are 1683 ads, with 13240 images


In [4]:
# Build the working frame: one row per record in `hits`.
img_df = pd.DataFrame.from_records(data=hits)
print((len(img_df.index), len(img_df.columns)))

(13240, 3)


In [5]:
import hashlib

# Cluster key: SHA-1 hex digest of the UTF-8 encoded body, so rows with
# identical body text receive the same key.
img_df['cluster'] = [hashlib.sha1(body.encode('utf8')).hexdigest() for body in img_df['body']]

# Shorten the image URL to a bare id (bucket prefix and '.jpg' removed),
# then discard the original URL column.
img_df['id'] = img_df['img_url'].map(
    lambda url: url.replace('https://s3.amazonaws.com/roxyimages/', '').replace('.jpg', ''))
img_df.drop('img_url', axis=1, inplace=True)

# Shorten the ad id the same way by removing the page URL prefix.
img_df['dig_id'] = img_df['dig_id'].map(
    lambda page: page.replace('http://dig.isi.edu/ht/data/page/', ''))

print(img_df.shape)
print(img_df.columns)

(13240, 4)
Index(['body', 'dig_id', 'cluster', 'id'], dtype='object')


In [6]:
# Identifier probed in the next cells: it is checked against img_df['id'] and
# against the lookup file names, and its lookup JSON is then loaded.
# NOTE(review): presumably an image id that produced no results — confirm.
bad_val = 'df9bca22d5f1faa7ca715bf6dbbca2f6c5668a14'

In [8]:
# `x in series` tests the Series *index labels*, not its values, so the check
# must run against the underlying values to see whether any image id equals
# bad_val.
bad_val in img_df['id'].values

False

In [10]:
# Drop the extension with splitext: str.rstrip('.json') strips any trailing
# run of the characters '.', 'j', 's', 'o', 'n' rather than the literal
# suffix, which can eat into the name itself. basename() also avoids
# hard-coding '/' as the path separator.
bad_val in [os.path.splitext(os.path.basename(x))[0] for x in ls]

True

In [11]:
# Show the first lookup path to inspect the file-name format.
ls[0]

'/Users/pmlandwehr/git/giantoak/qpr_two/pedro_img_lookups/006b0743ef18bcefa4ccdb5204690a8947bf25a0.json'

In [13]:
# Load the lookup result stored for bad_val. The context manager closes the
# file once it is parsed, and root_path keeps the directory defined in one place.
with open(os.path.join(root_path, bad_val + '.json')) as fh:
    jsn = json.load(fh)

In [14]:
# Display the lookup result that was just loaded for bad_val.
jsn

{'_shards': {'failed': 0, 'successful': 40, 'total': 40},
 'hits': {'hits': [], 'max_score': None, 'total': 0},
 'timed_out': False,
 'took': 151}

In [6]:
def get_token_set(x):
    """Return the distinct word tokens of the text `x` as a frozenset.

    The text is first split into sentences with `sent_tokenize`; each
    sentence is then split into tokens with `word_tokenize`.
    """
    return frozenset(
        tok
        for sentence in sent_tokenize(x)
        for tok in word_tokenize(sentence)
    )

# Tokenize each distinct ad body exactly once.
text_dict = {ad_text: get_token_set(ad_text) for ad_text in img_df['body'].unique()}

# Key the same token sets by the SHA-1 hex digest of the body (the same
# expression used for img_df['cluster']), reusing the sets computed above
# instead of tokenizing every body a second time.
cluster_dict = {hashlib.sha1(ad_text.encode('utf8')).hexdigest(): token_set
                for ad_text, token_set in text_dict.items()}

In [7]:
# Pairwise Jaccard similarity (|A & B| / |A | B|) between the token sets of
# every unordered pair of clusters.
cluster_keys = list(cluster_dict.keys())
cluster_index = list(combinations(cluster_keys, 2))

jaccard_dict = {
    # Each pair is stored as a frozenset, so the order within a pair is irrelevant.
    'cluster_pairs': [frozenset(pair) for pair in cluster_index],
    'jaccard_body': [
        float(len(cluster_dict[left] & cluster_dict[right]))
        / len(cluster_dict[left] | cluster_dict[right])
        for left, right in cluster_index
    ],
}

# Index the similarities by their cluster pair.
cluster_df = pd.DataFrame(jaccard_dict).set_index('cluster_pairs')

In [8]:
# Preview the first rows of the pairwise Jaccard frame.
cluster_df.head()

Unnamed: 0_level_0,jaccard_body
cluster_pairs,Unnamed: 1_level_1
"(62bae4a7eaa59674c72245e13900e0b915a1b42c, 263070aaa5b1af850437d3e38e7609bb8aac8eb9)",0.027778
"(62bae4a7eaa59674c72245e13900e0b915a1b42c, 29a08c00c06f66024ee0a08c1bf2aa0de33910c6)",0.028571
"(b137b7b2f54a94050029b5115232dbc1331eb919, 62bae4a7eaa59674c72245e13900e0b915a1b42c)",0.031746
"(62bae4a7eaa59674c72245e13900e0b915a1b42c, 89694b2ade5608115b7898db1d0f4012e24ee234)",0.008547
"(62bae4a7eaa59674c72245e13900e0b915a1b42c, 3ab32d9aa647db0de0ee0d896a454e31a59540f8)",0.0


### Write some pickles

Specifically, we save the pairwise Jaccard similarities between the ad texts and the body-to-cluster-to-image-to-ad mapping.

In [9]:
import pickle

# Optional extra dumps, left disabled:
# pickle.dump(text_dict, open('text_to_frozenset_dict.pkl', 'wb'))
# pickle.dump(jaccard_dict, open('text_jacard_pairs.pkl', 'wb'))

# Persist the pairwise Jaccard frame and the per-image frame.
for frame, pkl_path in ((cluster_df, 'cluster_jaccard_df.pkl'),
                        (img_df, 'body_cluster_img_ad_df.pkl')):
    frame.to_pickle(pkl_path)