In [2]:
import json
import gzip
import numpy as np
from copy import deepcopy

In [3]:
def get_json_list_from_gz_file(file_name):
    json_list = []
    
    with gzip.open(file_name,'r') as fin:
        for line in fin:        
            data_json = json.loads(line.decode('utf-8'))
            json_list.append(data_json)
    
    return json_list

In [13]:
class CrawledData:
    
    def __init__(self, category, crawl_date, subcategory, title, mrp, urlh, http_status, pack_size, available_price):
        self.category = category
        self.crawl_date = crawl_date
        self.subcategory = subcategory
        self.title = title
        self.mrp = mrp
        self.urlh = urlh
        self.http_status = http_status
        self.pack_size = pack_size
        self.available_price = available_price
        
    @staticmethod
    def build_object(json_item):
        crawled_data = CrawledData(
            category=json_item['category'],
            crawl_date = json_item['crawl_date'],
            subcategory = json_item['subcategory'],
            title = json_item['title'],
            mrp = float(json_item['mrp'] or 0),
            urlh = json_item['urlh'],
            http_status = json_item['http_status'],
            pack_size = json_item['pack_size'],
            available_price = float(json_item['available_price'] or 0),
        )
        return crawled_data

class DataBuilder:
    
    @staticmethod
    def build_id_obj_dict(json_list):
        '''Uses a hashmap which maps ids to objects so as to fecilitate faster lookup based on ids and also to find overlapping ids.
        '''
        data_dict = {}
        
        for json_item in json_list:
            crawled_data = CrawledData.build_object(json_item)
            urlh = crawled_data.urlh

            if urlh in data_dict:
                if data_dict.get(urlh).http_status != '200' and crawled_data.http_status == '200':
                    data_dict[crawled_data.urlh] = crawled_data

            else:
                data_dict[crawled_data.urlh] = crawled_data
        
        return data_dict
    
    @staticmethod
    def build_cat_subcat_dict(id_obj_dict):
        '''Uses a hashmap of the form category -> subcategory -> list<objects> to fecilitate faster and easier taxonomy stats generation.
        '''
        cat_subcat_dict = {}
        for id, obj in id_obj_dict.items():
            if cat_subcat_dict.get(obj.category) is None:
                cat_subcat_dict[obj.category] = {}
                
            category_dict = cat_subcat_dict[obj.category]
            
            if category_dict.get(obj.subcategory) is None:
                category_dict[obj.subcategory] = list()
            
            subcat_list = category_dict.get(obj.subcategory)
            subcat_list.append(obj)
            
        return cat_subcat_dict


In [16]:
today_json_list = get_json_list_from_gz_file('t.json.gz')
yesterday_json_list = get_json_list_from_gz_file('y.json.gz')


In [17]:
todays_crawled_data_dict = DataBuilder.build_id_obj_dict(today_json_list)
yesterdays_crawled_data_dict = DataBuilder.build_id_obj_dict(yesterday_json_list)

In [18]:
# Q1. No. of overlapping URLHs.
''' Overlapping URLHs, occurring in both today and yesterday's catalog is found using set intersection.
    Set 1: URLHs of today's items.
    Set 2: URLHs of yesterday's items.
    The intersection yields the urlh ids of items common in both.
    
'''
overlapping_urlhs = set(todays_crawled_data_dict.keys()).intersection(set(yesterdays_crawled_data_dict.keys()))
print("Number of overlapping items is ", len(overlapping_urlhs))

Number of overlapping items is  129673


In [20]:
# Q2. Price difference between overlapping URLHS.
'''Using the list of overlapping urlhs from previous calculation, difference in price can be calculated.
'''
price_diff_dict = {}
for urlh in overlapping_urlhs:
    price_diff = float(todays_crawled_data_dict.get(urlh).available_price) - float(yesterdays_crawled_data_dict.get(urlh).available_price)
    todays_crawled_data_dict.get(urlh).available_price
    if price_diff > 0:
        price_diff_dict[urlh] = price_diff

print("Number of items with difference in price: ", len(price_diff_dict))
price_diff_dict

Number of items with difference in price:  1061


{'f3a703cc462c9fce96b6957efb17c26ef2f62237': 2.0,
 '7b897fa8be9b5ac67785767333a1989b1e8650d5': 3.5,
 'd83470736c77b7ebafede8306598e77f8a474854': 2.1999999999999993,
 'b09a109e32ba407f971f3a4b9b2c7168e7fcd06d': 1.5,
 '6d21863223ba3f4ee9b05a4f01f4870c6b9ca4bd': 4.0,
 '1e943551e721ca34465362635135129c95712f91': 1.4999999999999982,
 '27044b7df22f7623ab6a83b5851236f3183fbbf8': 1.0,
 '982119895d3ffec980c9ce81c37b5b58b4b9f262': 6.0,
 'e861eb84b781bebc05ffa043a85d3b481398df8d': 4.0,
 '8afe057c6b88f89d766f1b4abd83c37066546444': 3.0,
 '243b75d91fe62d32a5d0fd8704b41906f4563b47': 0.5200000000000005,
 '2ac5dfc415073cbe197b8111bc57e26cb25ad491': 1.2000000000000002,
 '3e69120bd521c8287febbb92f041050332ecdcfe': 3.5,
 '011cd2d9cf0ae3ba0a5716e7420f35fc2a130c56': 1.2000000000000002,
 '1dd0f5ee0a4cc9529583951f0dda742a50087779': 2.5,
 '8fde54b3634553eae4aeaaac88307eb05df2d6eb': 1.5,
 '2a3cd9e96317af149f217d3c62c10453e5d0f68c': 1.5,
 'e854d9eb15a81086b8b67fc2e827ff5f4c17c6c2': 10.0,
 'a5de90f7929ac57fe3f8fc

In [21]:
todays_cat_subcat_dict = DataBuilder.build_cat_subcat_dict(todays_crawled_data_dict)
yesterdays_cat_subcat_dict = DataBuilder.build_cat_subcat_dict(yesterdays_crawled_data_dict)

In [27]:
# Q3 Number of unique categories in both files
'''Since, the keys of dictionaries are unique, the number of keys in the category-subcategory dict gives the number of unique categories.
'''
no_of_uniques_categories_in_todays_dict = len(todays_cat_subcat_dict.keys())
no_of_uniques_categories_in_yesterdays_dict = len(yesterdays_cat_subcat_dict.keys())
print("Unique categories in today's file: ", no_of_uniques_categories_in_todays_dict)
print("Unique categories in yesterdays's file: ", no_of_uniques_categories_in_yesterdays_dict)


Unique categories in today's file:  50
Unique categories in yesterdays's file:  50


In [None]:
# Q4 Set of categories which don't have any overlapping items.
'''Compute the global categories set and remove the categories of overlapping items from it.
'''
categories = set(todays_cat_subcat_dict.keys()).union(set(yesterdays_cat_subcat_dict.keys()))
categories_of_overlapping_items = {todays_crawled_data_dict[urlh].category for urlh in overlapping_urlhs}

non_overlapping_categories =  categories - categories_of_overlapping_items
print("Non overlapping categories: ", non_overlapping_categories)

In [28]:
# Q5 Taxonomy generation
'''Taxonomy stats can be generated iterating through the category-subcategory dict.
'''
def generate_taxonomy_details(cat_subcat_dict, query_func):
    '''A function which generates taxonomy stats as required by the query func.
    Args: 
        cat_subcat_dict: The category-subcategory dict.
        query_func: The function which represents the query. In this example, len is passed to get the number of items in the list
        corresponding to that subcategory. Other functions corresponding to average, median, etc of any other metric of the list
        can also be passed.
    
    Returns:
         taxonomy_stats: Dict representing the queried stats.
    '''
    taxonomy_stats = {}
    for category in cat_subcat_dict:
        subcat_dict = cat_subcat_dict[category]
        for subcat in subcat_dict:
            item_list = subcat_dict[subcat]
            print(category, " --> ", subcat, " --> ", query_func(item_list))
            taxonomy_stats[(category, subcat)] = query_func(item_list)
    
    return taxonomy_stats

todays_taxonomy_stats = generate_taxonomy_details(todays_cat_subcat_dict, len)
yesterdays_taxonomy_stats = generate_taxonomy_details(yesterdays_cat_subcat_dict, len)

Household  -->  Trash Bags & Liners  -->  175
Household  -->  Paper Goods  -->  584
Household  -->  Laundry  -->  1051
Household  -->  Kitchen Supplies  -->  1155
Household  -->  Plates, Bowls, Cups & Flatware  -->  486
Household  -->  Food Storage  -->  615
Household  -->  Dish Detergents  -->  388
Household  -->  Cleaning Products  -->  1505
Household  -->  Air Fresheners & Candles  -->  807
Household  -->  More Household  -->  3900
Household  -->  Garden  -->  56
Household  -->  Books & Magazines  -->  1
Pets  -->  Dog Food & Care  -->  1493
Pets  -->  Cat Food & Care  -->  1233
Pets  -->  Small Animal Care  -->  153
Babies  -->  Baby Accessories  -->  236
Babies  -->  Baby First Aid & Vitamins  -->  197
Babies  -->  Diapers & Wipes  -->  542
Babies  -->  Baby Food & Formula  -->  1259
Babies  -->  Baby Bath & Body Care  -->  264
Personal Care  -->  Adult Care  -->  66
Personal Care  -->  Foot Care  -->  138
Personal Care  -->  Family Planning  -->  241
Personal Care  -->  Aromather

In [30]:
!pip install plotly
import plotly.plotly as py
import plotly.graph_objs as go

py.sign_in('PythonAPI', 'ubpiol2cve')

Collecting plotly
[?25l  Downloading https://files.pythonhosted.org/packages/fd/db/003b5cfbc710f4d4982440451185b952269e4080a57ae7e760a2ceb8ce0c/plotly-3.6.1-py2.py3-none-any.whl (38.6MB)
[K    100% |████████████████████████████████| 38.6MB 746kB/s 
Collecting retrying>=1.3.3 (from plotly)
  Downloading https://files.pythonhosted.org/packages/44/ef/beae4b4ef80902f22e3af073397f079c96969c69b2c7d52a57ea9ae61c9d/retrying-1.3.3.tar.gz
Building wheels for collected packages: retrying
  Running setup.py bdist_wheel for retrying ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/d7/a9/33/acc7b709e2a35caa7d4cae442f6fe6fbf2c43f80823d46460c
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-3.6.1 retrying-1.3.3
[33mYou are using pip version 18.0, however version 19.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [32]:
'''Plots the taxonomy stats in plotly for a given category'''
import plotly.plotly as py
import plotly.graph_objs as go

category_to_plot = 'Household'

today = go.Bar(
    x=[cat_subcat[1] for cat_subcat in todays_taxonomy_stats if cat_subcat[0] == category_to_plot],
    y= [value for key, value in todays_taxonomy_stats.items() if key[0] == category_to_plot],
    name='''Today's taxonomy stats''',
    marker=dict(
        color='rgb(55, 83, 109)'
    )
)
yesterday = go.Bar(
    x=[cat_subcat[1] for cat_subcat in todays_taxonomy_stats if cat_subcat[0] == category_to_plot],
    y= [value for key, value in yesterdays_taxonomy_stats.items() if key[0] == category_to_plot],
    name='''Yesterdays's taxonomy stats''',
    marker=dict(
        color='rgb(26, 118, 255)'
    )
)
data = [today, yesterday]
layout = go.Layout(
    title= category_to_plot + ' ' 'taxonomy stats',
    xaxis=dict(
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Number of Items',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='style-bar')

In [29]:
# Demonstrating the use of a different query function, in this case, to compute the subcategory-wise mean of available price.

available_price_query_func = lambda item_list: np.mean([item.available_price for item in item_list])
todays_taxonomy_available_price_stats = generate_taxonomy_details(todays_cat_subcat_dict, available_price_query_func)
yesterdays_taxonomy_available_price_stats = generate_taxonomy_details(todays_cat_subcat_dict, available_price_query_func)

Household  -->  Trash Bags & Liners  -->  7.652514285714286
Household  -->  Paper Goods  -->  6.878270547945205
Household  -->  Laundry  -->  9.207887725975262
Household  -->  Kitchen Supplies  -->  10.972632034632035
Household  -->  Plates, Bowls, Cups & Flatware  -->  8.033950617283951
Household  -->  Food Storage  -->  9.583430894308941
Household  -->  Dish Detergents  -->  5.967371134020619
Household  -->  Cleaning Products  -->  6.404451827242525
Household  -->  Air Fresheners & Candles  -->  6.948289962825279
Household  -->  More Household  -->  11.530474358974361
Household  -->  Garden  -->  11.169285714285715
Household  -->  Books & Magazines  -->  6.99
Pets  -->  Dog Food & Care  -->  9.656121902210314
Pets  -->  Cat Food & Care  -->  7.190186536901865
Pets  -->  Small Animal Care  -->  8.705032679738562
Babies  -->  Baby Accessories  -->  8.429533898305083
Babies  -->  Baby First Aid & Vitamins  -->  8.633553299492386
Babies  -->  Diapers & Wipes  -->  12.29249077490775
Babie

In [None]:
'''Plots the taxonomy stats in plotly for a given category'''
import plotly.plotly as py
import plotly.graph_objs as go

category_to_plot = 'Household'

today = go.Bar(
    x=[cat_subcat[1] for cat_subcat in todays_taxonomy_available_price_stats if cat_subcat[0] == category_to_plot],
    y= [value for key, value in todays_taxonomy_available_price_stats.items() if key[0] == category_to_plot],
    name='''Today's taxonomy stats for Average Price''',
    marker=dict(
        color='rgb(55, 83, 109)'
    )
)
yesterday = go.Bar(
    x=[cat_subcat[1] for cat_subcat in todays_taxonomy_available_price_stats if cat_subcat[0] == category_to_plot],
    y= [value for key, value in todays_taxonomy_available_price_stats.items() if key[0] == category_to_plot],
    name='''Yesterdays's  taxonomy stats for Average Price''',
    marker=dict(
        color='rgb(26, 118, 255)'
    )
)
data = [today, yesterday]
layout = go.Layout(
    title= category_to_plot + ' ' 'taxonomy stats: Average Price',
    xaxis=dict(
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Price',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='style-bar')

In [None]:
# Q6 Normalized MRP
'''MRP can be normalized such that the mean in 0 and std dev is 1.
'''

mrp = np.array([obj.mrp for obj in todays_crawled_data_dict.values() if obj.mrp != 0])
mrp_mean = np.mean(mrp)
mrp_std = np.std(mrp)
mrp_normalized_item_list = []

for item in todays_crawled_data_dict.values():
    item_copy = deepcopy(item)
    
    if item_copy.mrp == 0:
        item_copy.mrp = 'NA'
    
    else:
        item_copy.mrp = (item_copy.mrp - mrp_mean) / mrp_std
    
    mrp_normalized_item_list.append(item_copy.__dict__)
    
json_str = json.dumps(mrp_normalized_item_list) + "\n"               
json_bytes = json_str.encode('utf-8')            

with gzip.GzipFile('t_mrp_normalized.json.gz', 'w') as fout: # Write to a output file
    fout.write(json_bytes)             