<a href="https://colab.research.google.com/github/elinabuniatyan/test-assignment/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-specific helper module (presumably only importable inside a Colab runtime).
from google.colab import files

# Prompt for a file upload and keep whatever the call returns in `uploaded`.
# NOTE(review): the recorded cell output shows the upload being stored as
# 'customer_orders (1).pkl', while CustomerDataExtractor defaults to
# 'customer_orders.pkl' — confirm the intended copy is the one that gets loaded.
uploaded = files.upload()


Saving customer_orders.pkl to customer_orders (1).pkl


In [None]:
import pandas as pd
import pickle
from datetime import datetime
from pathlib import Path

class CustomerDataExtractor:
    """Flatten nested customer -> order -> item records into one row per item.

    Typical use::

        extractor = CustomerDataExtractor()
        extractor.load_data()
        df = extractor.extract_flat_data()

    Attributes:
        data_path: Path of the pickle file holding the customer records.
        vip_path: Path of the text file listing VIP customer ids, one per line.
        customers: Iterable of customer dicts; empty until ``load_data`` runs
            (or until assigned directly).
        vip_ids: Set of integer customer ids flagged as VIP.
    """

    CATEGORY_MAP = {
        1: 'Electronics',
        2: 'Apparel',
        3: 'Books',
        4: 'Home Goods'
    }

    # Output schema: column order and the dtype enforced on each column.
    # Building the frame from these names keeps an empty result well-formed.
    _COLUMN_DTYPES = {
        'customer_id': 'int64',
        'customer_name': 'string',
        'registration_date': 'datetime64[ns]',
        'is_vip': 'bool',
        'order_id': 'int64',
        'order_date': 'datetime64[ns]',
        'product_id': 'int64',
        'product_name': 'string',
        'category': 'string',
        'unit_price': 'float64',
        'item_quantity': 'int64',
        'total_item_price': 'float64',
        'total_order_value_percentage': 'float64'
    }

    # Errors that mark one item as malformed: a failed int()/float()
    # conversion, an unhashable category, an item that is not dict-like,
    # or a non-finite quantity.
    _BAD_ITEM_ERRORS = (TypeError, ValueError, AttributeError, OverflowError)

    def __init__(self, data_path='customer_orders.pkl', vip_path='vip_customers.txt'):
        """Remember the input locations; no file is opened here.

        Args:
            data_path: Location of the pickled customer records.
            vip_path: Location of the VIP id list (one id per line).
        """
        self.data_path = Path(data_path)
        self.vip_path = Path(vip_path)
        # Start empty so extract_flat_data() is safe to call before load_data().
        self.customers = []
        self.vip_ids = set()

    def load_data(self):
        """Read the customer records and the VIP id list from disk.

        Populates ``self.customers`` and ``self.vip_ids``. Lines of the VIP
        file that are not plain decimal integers after stripping whitespace
        are ignored.

        Raises:
            OSError: If either file cannot be opened.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load pickles that come from a trusted source.
        with open(self.data_path, 'rb') as f:
            self.customers = pickle.load(f)
        with open(self.vip_path, 'r', encoding='utf-8') as f:
            candidates = (line.strip() for line in f)
            # isdecimal() (unlike isdigit()) accepts only characters int() can
            # parse, so entries such as '²' are skipped instead of raising.
            self.vip_ids = {int(text) for text in candidates if text.isdecimal()}

    @staticmethod
    def _or_empty(value):
        """Return ``value``, or an empty list when it is ``None``."""
        return [] if value is None else value

    @classmethod
    def _order_total(cls, items):
        """Sum ``price * quantity`` over every item where both values parse.

        Items whose price or quantity cannot be converted are left out of the
        total; they never abort the computation.
        """
        total = 0.0
        for item in items:
            try:
                price = float(item.get('price', 0))
                quantity = int(item.get('quantity', 0))
            except cls._BAD_ITEM_ERRORS:
                continue
            total += price * quantity
        return total

    def extract_flat_data(self):
        """Build the flat per-item DataFrame from ``self.customers``.

        Each well-formed item becomes one row carrying its customer and order
        fields, the mapped category name ('Misc' for unknown codes), the line
        total, and that line's share of the order total in percent (0 when
        the order total is not positive). Malformed items are skipped. The
        order total counts every item with a parsable price and quantity,
        even one that is later skipped for another reason (e.g. a bad
        ``item_id``).

        Returns:
            pandas.DataFrame with the columns of ``_COLUMN_DTYPES`` in that
            order, sorted by customer_id, order_id, product_id, with a fresh
            0..n-1 index. If nothing qualifies, an empty frame with the same
            columns and dtypes is returned.

        Raises:
            TypeError, ValueError: If a customer ``id`` or an order
                ``order_id`` cannot be converted to ``int``.
            AttributeError: If a customer or order record has no ``get``.
        """
        rows = []

        for customer in self.customers:
            customer_id = int(customer.get('id'))
            customer_name = customer.get('name')
            registration_date = pd.to_datetime(customer.get('registration_date'), errors='coerce')
            is_vip = customer_id in self.vip_ids

            # A key that is present but None is treated like a missing key.
            for order in self._or_empty(customer.get('orders')):
                order_id = int(order.get('order_id'))
                order_date = pd.to_datetime(order.get('order_date'), errors='coerce')

                items = self._or_empty(order.get('items'))
                total_order_value = self._order_total(items)

                for item in items:
                    try:
                        product_id = int(item.get('item_id'))
                        product_name = item.get('product_name')
                        category = self.CATEGORY_MAP.get(item.get('category'), 'Misc')
                        unit_price = float(item.get('price', 0))
                        item_quantity = int(item.get('quantity', 0))
                    except self._BAD_ITEM_ERRORS:
                        # Best effort: drop the malformed item, keep the rest.
                        continue

                    total_item_price = unit_price * item_quantity
                    total_order_value_percentage = (
                        (total_item_price / total_order_value) * 100 if total_order_value > 0 else 0
                    )

                    rows.append({
                        'customer_id': customer_id,
                        'customer_name': customer_name,
                        'registration_date': registration_date,
                        'is_vip': is_vip,
                        'order_id': order_id,
                        'order_date': order_date,
                        'product_id': product_id,
                        'product_name': product_name,
                        'category': category,
                        'unit_price': unit_price,
                        'item_quantity': item_quantity,
                        'total_item_price': total_item_price,
                        'total_order_value_percentage': total_order_value_percentage
                    })

        # Naming the columns up front means astype/sort_values below also work
        # when no rows were collected.
        df = pd.DataFrame(rows, columns=list(self._COLUMN_DTYPES))

        # Enforce data types
        df = df.astype(self._COLUMN_DTYPES)

        # Sort as required
        df.sort_values(by=['customer_id', 'order_id', 'product_id'], inplace=True)
        df.reset_index(drop=True, inplace=True)

        return df
