# **ServiceTitan Internship Task Report**

In [1]:
import pandas as pd
import pickle
from pprint import pprint
from collections import Counter
from CustomerDataExt import CustomerDataExtractor

## **1. Exploratory Data Analysis (EDA)**

In [2]:
# Load the raw customer records from disk.
# NOTE: unpickling can execute arbitrary code, so only load this file when it
# comes from a trusted source.
with open('data/customer_orders.pkl', 'rb') as raw_file:
    customers = pickle.load(raw_file)

print(f"Total customers: {len(customers)}")

# Use the first record as a representative sample of the nested structure.
first_record = customers[0]
print("Sample customer record keys:", list(first_record.keys()))
print("\nExample customer record:")
pprint(first_record)

Total customers: 50
Sample customer record keys: ['id', 'name', 'registration_date', 'orders']

Example customer record:
{'id': 1,
 'name': 'Customer 1',
 'orders': [{'items': [],
             'order_date': '2024-02-27 21:24:16',
             'order_id': 84,
             'order_total_value': 0,
             'shipping_address': '840 Main St, City 8'},
            {'items': [{'category': 4,
                        'item_id': 1,
                        'price': 52.7,
                        'product_name': 'Item 1 for Order 2',
                        'quantity': 2},
                       {'category': 5,
                        'item_id': 2,
                        'price': 498.45,
                        'product_name': 'Item 2 for Order 2',
                        'quantity': 5},
                       {'category': 5,
                        'item_id': 3,
                        'price': 168.15,
                        'product_name': 'Item 3 for Order 2',
                        'quan

In [3]:
# Top-level missing values: for each customer field, count the records where
# the key is absent or holds one of the "empty" markers below.
top_fields = ['id', 'name', 'registration_date', 'orders']
empty_markers = (None, '', [])

missing_top = {
    field: len([
        record for record in customers
        if not (field in record and record[field] not in empty_markers)
    ])
    for field in top_fields
}
print("\nMissing top‐level fields:", missing_top)


Missing top‐level fields: {'id': 0, 'name': 0, 'registration_date': 1, 'orders': 9}


In [4]:
# Orders per customer
# dict.get('orders', []) only falls back to [] when the key is absent; a record
# that stores an explicit None would make len() raise TypeError. `or []`
# treats both "absent" and "None" as zero orders.
orders_per_customer = [len(c.get('orders') or []) for c in customers]
print("\nOrders per customer summary:")
print(pd.Series(orders_per_customer).describe())


Orders per customer summary:
count    50.000000
mean      1.980000
std       1.285556
min       0.000000
25%       1.000000
50%       2.000000
75%       3.000000
max       4.000000
dtype: float64


In [5]:
# Items per order
# `or []` guards both levels: a customer whose 'orders' is None is skipped
# instead of failing on iteration, and an order whose 'items' is None counts
# as having zero items instead of failing in len().
items_per_order = [
    len(o.get('items') or [])
    for c in customers
    for o in (c.get('orders') or [])
]
print("\nItems per order summary:")
print(pd.Series(items_per_order).describe())


Items per order summary:
count    99.000000
mean      2.000000
std       1.789995
min       0.000000
25%       0.000000
50%       2.000000
75%       4.000000
max       5.000000
dtype: float64


In [6]:
# Order-level missing values: for each order field, count the orders where the
# key is absent or the value is None, an empty string, or an empty list.
order_fields = ['order_id', 'order_date', 'shipping_address', 'items']
missing_orders = {
    field: sum(
        1
        for c in customers
        # `or []` so a customer whose 'orders' is None is skipped rather than
        # raising TypeError on iteration (.get's default only covers a missing key).
        for o in (c.get('orders') or [])
        if field not in o or o[field] in (None, '', [])
    )
    for field in order_fields
}
print("\nMissing order‐level fields:", missing_orders)


Missing order‐level fields: {'order_id': 5, 'order_date': 2, 'shipping_address': 0, 'items': 30}


In [7]:
# Item-level missing values: for each item field, count the items where the key
# is absent or the value is None (empty strings are not treated as missing here).
item_fields = ['item_id', 'product_name', 'category', 'price', 'quantity']
missing_items = {
    field: sum(
        1
        for c in customers
        # `or []` at both levels: .get's default only applies to a missing key,
        # so an explicit None for 'orders' or 'items' would otherwise raise
        # TypeError when iterated.
        for o in (c.get('orders') or [])
        for item in (o.get('items') or [])
        if field not in item or item[field] is None
    )
    for field in item_fields
}
print("\nMissing item‐level fields:", missing_items)


Missing item‐level fields: {'item_id': 15, 'product_name': 0, 'category': 0, 'price': 2, 'quantity': 0}


In [8]:
# Distribution of product categories
# Values are collected exactly as stored (no type or whitespace normalisation),
# so differently typed or spelled labels are counted as separate categories.
all_categories = [
    item['category']
    for c in customers
    # `or []` at both levels so an explicit None for 'orders' or 'items' is
    # treated as empty instead of raising TypeError on iteration.
    for o in (c.get('orders') or [])
    for item in (o.get('items') or [])
    if 'category' in item
]
print("\nTop 10 categories:")
print(Counter(all_categories).most_common(10))


Top 10 categories:
[(1, 36), (4, 28), (5, 27), (0, 27), (2, 26), (3, 20), ('home goods', 11), (99, 9), (' Electronics ', 4), ('Misc Item', 3)]


## **2. Dealing with Missing Values**

In [9]:
# Normalise the nested records in place so every level exposes the same keys:
# absent scalar fields are added with the value None, and an absent or None
# 'orders' / 'items' collection becomes an empty list. Existing non-None values
# are left untouched.
customer_scalars = ('id', 'name', 'registration_date')
order_scalars = ('order_id', 'order_date', 'shipping_address')
item_scalars = ('item_id', 'product_name', 'category', 'price', 'quantity')

for customer in customers:
    for key in customer_scalars:
        customer.setdefault(key, None)
    if customer.get('orders') is None:
        customer['orders'] = []

    for purchase in customer['orders']:
        for key in order_scalars:
            purchase.setdefault(key, None)
        if purchase.get('items') is None:
            purchase['items'] = []

        for line_item in purchase['items']:
            for key in item_scalars:
                line_item.setdefault(key, None)

# Persist the normalised records; the extractor cell reads this file.
with open('data/customer_orders_cleaned.pkl', 'wb') as out_file:
    pickle.dump(customers, out_file)

print("✅ Clean complete — saved to customer_orders_cleaned.pkl")

✅ Clean complete — saved to customer_orders_cleaned.pkl


## **3. Customer Data Extractor** 

In [10]:
# Feed the cleaned pickle and the VIP list to the project's extractor, then
# inspect the resulting frame: column dtypes first, followed by the first rows.
cleaned_path, vip_path = "data/customer_orders_cleaned.pkl", "data/vip_customers.txt"
extractor = CustomerDataExtractor(cleaned_path, vip_path)
df = extractor.extract()
print(df.dtypes)
df.head()

customer_id                              Int64
customer_name                   string[python]
registration_date               datetime64[ns]
is_vip                                 boolean
order_id                                 Int64
order_date                      datetime64[ns]
product_id                               Int64
product_name                    string[python]
unit_price                             float64
item_quantity                            Int64
total_item_price                       float64
category                        string[python]
total_order_value_percentage           float64
dtype: object


Unnamed: 0,customer_id,customer_name,registration_date,is_vip,order_id,order_date,product_id,product_name,unit_price,item_quantity,total_item_price,category,total_order_value_percentage
0,1,Customer 1,2022-12-31 04:19:19,True,2,2025-05-03 08:09:20,1,Item 1 for Order 2,52.7,2,105.4,Home Goods,3.810832
1,1,Customer 1,2022-12-31 04:19:19,True,2,2025-05-03 08:09:20,2,Item 2 for Order 2,498.45,5,2492.25,Misc,90.109552
2,1,Customer 1,2022-12-31 04:19:19,True,2,2025-05-03 08:09:20,3,Item 3 for Order 2,168.15,1,168.15,Misc,6.079615
3,1,Customer 1,2022-12-31 04:19:19,True,3,2023-09-06 00:42:50,1,Item 1 for Order 3,377.96,1,377.96,Electronics,100.0
4,2,Customer 2,2022-05-27 00:23:28,False,5,2024-08-28 19:37:56,1,Item 1 for Order 5,342.68,2,685.36,Misc,30.523255


## **END**