In [8]:
import json

# Read JSON file
with open('sample-dataset-3.json', 'r') as file:
    transactions = json.load(file)

In [9]:
transactions

[{'transaction_id': 'T123456',
  'timestamp': '2024-01-15T14:30:00',
  'customer': {'id': 'CUS123',
   'region': 'North',
   'segment': 'Premium',
   'join_date': '2023-01-01'},
  'items': [{'product_id': 'P789',
    'category': 'Electronics',
    'subcategory': 'Smartphones',
    'price': 499.99,
    'quantity': 1,
    'discount': 0.0},
   {'product_id': 'P456',
    'category': 'Accessories',
    'subcategory': 'Phone Cases',
    'price': 29.99,
    'quantity': 2,
    'discount': 5.0}],
  'payment_method': 'credit_card',
  'status': 'completed',
  'shipping': {'method': 'express',
   'cost': 15.99,
   'address': {'city': 'Boston', 'state': 'MA', 'country': 'USA'}}},
 {'transaction_id': 'T123457',
  'timestamp': '2024-01-15T14:45:00',
  'customer': {'id': 'CUS456',
   'region': 'South',
   'segment': 'Standard',
   'join_date': '2023-06-15'},
  'items': [{'product_id': 'P234',
    'category': 'Books',
    'subcategory': 'Fiction',
    'price': 24.99,
    'quantity': 3,
    'discount': 

In [3]:
pip install flatten_json

Note: you may need to restart the kernel to use updated packages.


In [4]:
from flatten_json import flatten

Flattening json file

In [10]:

# Function for flattening
# json


def flatten_json(y):
	out = {}

	def flatten(x, name=''):

		# If the Nested key-value
		# pair is of dict type
		if type(x) is dict:

			for a in x:
				flatten(x[a], name + a + '_')

		# If the Nested key-value
		# pair is of list type
		elif type(x) is list:

			i = 0

			for a in x:
				flatten(a, name + str(i) + '_')
				i += 1
		else:
			out[name[:-1]] = x

	flatten(y)
	return out

# Apply flatten function to each transaction 
flattened_transactions = [flatten_json(transaction) for transaction in transactions] 
print(flattened_transactions)



[{'transaction_id': 'T123456', 'timestamp': '2024-01-15T14:30:00', 'customer_id': 'CUS123', 'customer_region': 'North', 'customer_segment': 'Premium', 'customer_join_date': '2023-01-01', 'items_0_product_id': 'P789', 'items_0_category': 'Electronics', 'items_0_subcategory': 'Smartphones', 'items_0_price': 499.99, 'items_0_quantity': 1, 'items_0_discount': 0.0, 'items_1_product_id': 'P456', 'items_1_category': 'Accessories', 'items_1_subcategory': 'Phone Cases', 'items_1_price': 29.99, 'items_1_quantity': 2, 'items_1_discount': 5.0, 'payment_method': 'credit_card', 'status': 'completed', 'shipping_method': 'express', 'shipping_cost': 15.99, 'shipping_address_city': 'Boston', 'shipping_address_state': 'MA', 'shipping_address_country': 'USA'}, {'transaction_id': 'T123457', 'timestamp': '2024-01-15T14:45:00', 'customer_id': 'CUS456', 'customer_region': 'South', 'customer_segment': 'Standard', 'customer_join_date': '2023-06-15', 'items_0_product_id': 'P234', 'items_0_category': 'Books', 'it

Extract unique categories

In [17]:
def extract_unique_categories(transactions):
    categories = set()
    for transaction in transactions:
        for key in transaction:
            if 'category' in key:
                categories.add(transaction[key])
    return categories

unique_categories = extract_unique_categories(flattened_transactions)
print("Unique Categories:", unique_categories)


Unique Categories: {'Accessories', 'Fiction', 'Home', 'Shirts', 'Electronics', 'Clothing', 'Tablets', 'Headphones', 'Books', 'Kitchen', 'Pants', 'Smartphones', 'Phone Cases'}


In [11]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd

### 2. Create analysis functions

Find top-selling products using sorted() with custom key

In [21]:
def top_selling_products(transactions):
    product_sales = {}
    for transaction in transactions:
        for i in range(len([key for key in transaction.keys() if key.startswith('items_') and key.endswith('_product_id')])):
            product_id = transaction[f'items_{i}_product_id']
            quantity = transaction[f'items_{i}_quantity']
            if product_id not in product_sales:
                product_sales[product_id] = 0
            product_sales[product_id] += quantity
    sorted_products = sorted(product_sales.items(), key=lambda x: x[1], reverse=True)
    return sorted_products

top_products = top_selling_products(flattened_transactions)
print("Top-Selling Products:", top_products)


Top-Selling Products: [('P234', 3), ('P789', 2), ('P456', 2), ('P777', 2), ('P333', 2), ('P555', 1), ('P444', 1), ('P888', 1)]


### 3. Create a report generation function

Filters completed transactions using filter()


In [13]:
def completed_transactions(transactions):
     return list(filter(lambda x: x['status'] == 'Completed', transactions))

Sorts data by multiple criteria using lambda

In [15]:
def sort_by_criteria(transactions):
     return sorted(transactions, key=lambda x: (x['region'], x['total_value']))

Generates summary statistics for different time period

In [16]:
def summary_statistics(transactions): 
    # For simplicity, only showing an example for event count by region 
    event_count_by_region = {} 
    for transaction in transactions:
         region = transaction['region'] 
         if region not in event_count_by_region:
             event_count_by_region[region] = 0 
             event_count_by_region[region] += 1 
             return event_count_by_region 
         def generate_report(transactions): 
            completed = completed_transactions(transactions)
            sorted_transactions = sort_by_criteria(completed)
            summary_stats = summary_statistics(sorted_transactions)
            return { 'sorted_transactions': sorted_transactions, 'summary_statistics': summary_stats }
            report = generate_report(flattened_transactions) 
            print("Report:", report)