In [2]:
import pandas as pd
import sqlite3
import os

In [3]:
files_tables = {
    "orders.csv": "orders",
    "products.csv": "products",
    "aisles.csv": "aisles",
    "departments.csv": "departments",
    "order_products__prior.csv": "order_products_prior",
    "order_products__train.csv": "order_products_train"
}

# 2. Connect to database
db_name = "instacart.db"
conn = sqlite3.connect(db_name)

In [4]:
print("--- Starting Import ---")
for file, table in files_tables.items():
    if os.path.exists(file):
        print(f"Reading {file}...")
        chunksize = 100000 
        for i, chunk in enumerate(pd.read_csv(file, chunksize=chunksize)):
            chunk.to_sql(table, conn, if_exists='append', index=False)
            if i == 0: print(f"  - Started writing to '{table}'...")
        print(f"Finished loading {table}")
    else:
        print(f"File not found: {file}")

print("\n--- Database Ready ---")

def run_query(q):
    return pd.read_sql_query(q, conn)

--- Starting Import ---
Reading orders.csv...
  - Started writing to 'orders'...
Finished loading orders
Reading products.csv...
  - Started writing to 'products'...
Finished loading products
Reading aisles.csv...
  - Started writing to 'aisles'...
Finished loading aisles
Reading departments.csv...
  - Started writing to 'departments'...
Finished loading departments
Reading order_products__prior.csv...
  - Started writing to 'order_products_prior'...
Finished loading order_products_prior
Reading order_products__train.csv...
  - Started writing to 'order_products_train'...
Finished loading order_products_train

--- Database Ready ---


# üçå Analysis: Most Popular Produce Item

**Objective:** Identify the single most purchased item specifically from the **"produce"** department.

### üìä SQL Query
To find this, performed:
1. **Join** `order_products_prior` (sales) with `products` (names) and `departments` (categories).
2. **Filter** specifically for the `'produce'` department.
3. **Count** the frequency of each product and **Sort** descending.

In [5]:
query = """
SELECT 
    p.product_name, 
    COUNT(*) as purchase_count
FROM order_products_prior op
JOIN products p ON op.product_id = p.product_id
JOIN departments d ON p.department_id = d.department_id
WHERE d.department = 'produce'
GROUP BY p.product_name
ORDER BY purchase_count DESC
LIMIT 1;
"""

run_query(query)

Unnamed: 0,product_name,purchase_count
0,Banana,3780520


# üçå Market Basket Analysis: What is bought with Bananas?

**Objective:** Identify the top 10 products most frequently purchased in the same transaction as "Bananas" (Product ID: `24852`). This is a classic "Association Rule" mining task used for recommendation engines.

### üìä SQL Query
We use a **subquery** to first find all orders containing Bananas, then count the *other* items in those specific orders.

In [None]:
query = """
SELECT 
    p.product_name, 
    COUNT(*) as frequency
FROM order_products_prior op
JOIN products p ON op.product_id = p.product_id
WHERE op.order_id IN (
    SELECT order_id 
    FROM order_products_prior 
    WHERE product_id = 24852
)
AND op.product_id != 24852 
GROUP BY p.product_name
ORDER BY frequency DESC
LIMIT 10;
"""

run_query(query)

Unnamed: 0,product_name,frequency
0,Organic Strawberries,224624
1,Organic Avocado,213580
2,Organic Baby Spinach,205580
3,Strawberries,164928
4,Large Lemon,163520
5,Organic Fuji Apple,135772
6,Cucumber Kirby,128388
7,Limes,127088
8,Organic Whole Milk,126564
9,Organic Hass Avocado,124888
