In [101]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [None]:
# Read the raw sales export into the working DataFrame.
df = pd.read_csv("sales_data.csv")
# Show the first five rows as a quick check that the header parsed.
df.head()

In [None]:
#Question 1
# Keep only the first entry of the per-product row counts (value_counts sorts
# highest first); products tied for first place are not reported by this cell.
most_prevalent_pd = df['Product Name'].value_counts().iloc[:1]

# Unpack the single remaining (product, count) pair.
product_name, product_count = most_prevalent_pd.index[0], most_prevalent_pd.values[0]

print(f"Most prevalent product: {product_name} with {product_count} sales.")

In [None]:
#Question 1
# Tie-aware variant: report every product whose row count equals the maximum.
product_counts = df['Product Name'].value_counts()

top_count = product_counts.max()
most_prevalent_products = product_counts.loc[product_counts.eq(top_count)]

print("Most Prevalent Product")
for product, count in most_prevalent_products.items():
    print(f"{product}: {count} sales")

In [None]:
#Question 2
# Number of distinct products in each (customer, order) pair. After reset_index
# the count is stored in the column still named 'Product Name'.
order_product_count = df.groupby(['CustomerID', 'OrderID'])['Product Name'].nunique().reset_index()

# 10 or more distinct products is considered a "large" basket
# (the same cutoff Question 3 applies).
is_large_basket = order_product_count['Product Name'] >= 10
large_basket_orders = order_product_count[is_large_basket]

# One row per order here, so the row count is the number of large baskets.
total_large_purchases = len(large_basket_orders)

print("\nFrequency of Large Baskets:", total_large_purchases, "occurrences")

In [None]:
#Question 3
# Number of distinct products in each (customer, order) pair. After reset_index
# the count is stored in the column still named 'Product Name'.
order_product_count = df.groupby(['CustomerID', 'OrderID'])['Product Name'].nunique().reset_index()

# 10 or more distinct products are considered large baskets.
large_basket_orders = order_product_count[order_product_count['Product Name'] >= 10]

# An order with 10+ distinct products necessarily spans 10+ rows of df, so its
# (StoreID, OrderID) pair is repeated once per row. De-duplicate the lookup
# before merging; merging against the raw rows would copy each large order once
# per item row and inflate every per-store count below.
order_store_lookup = df[['StoreID', 'OrderID']].drop_duplicates()
large_basket_orders = large_basket_orders.merge(order_store_lookup, on='OrderID')

# Each row is now one large order at one store, so value_counts yields the
# number of large baskets per store, highest first.
store_large_basket_counts = large_basket_orders['StoreID'].value_counts()

total_stores_with_large_baskets = len(store_large_basket_counts)

print(f"Stores containing at least one large basket: {total_stores_with_large_baskets}")

print("\n5 stores with the most filled-up baskets:")
for store_id, count in store_large_basket_counts.head(5).items():
    print(f"StoreID {store_id} had {count} large purchases")
     

In [None]:
#Question 4
# Coerce prices to numbers; values that cannot be parsed become NaN.
df['Product Price'] = pd.to_numeric(df['Product Price'], errors='coerce')

# Filter unpriced rows into a separate frame instead of dropping them from df
# in place. Deleting rows from the shared frame would shrink the baskets seen by
# the later product-count cells (Questions 5 and 6), making their notion of a
# "large basket" differ from the one used in Questions 2 and 3.
priced_rows = df.dropna(subset=['Product Price'])

# Basket size here is the total spend of one order at one store.
basket_sizes = priced_rows.groupby(['StoreID', 'OrderID'])['Product Price'].sum().reset_index()

large_basket_threshold = basket_sizes['Product Price'].quantile(0.95)  # Top 5% of basket sizes

# Strictly above the 95th percentile counts as a large-basket transaction.
large_basket_data = basket_sizes[basket_sizes['Product Price'] > large_basket_threshold]

# Number of large-basket transactions per store, keeping the 25 busiest stores.
top_stores = large_basket_data['StoreID'].value_counts().head(25)

plt.figure(figsize=(8, 4))
# Cast store ids to str so the bars are spaced evenly as categories rather than
# positioned by numeric value.
plt.bar(top_stores.index.astype(str), top_stores.values, color='skyblue')
plt.title('Top 25 Stores by Large-Basket Transaction Frequency', fontsize=14)
plt.xlabel('StoreID', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xticks(fontsize=7)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()

In [None]:
#Question 5
# Distinct products per order, broadcast onto every row of that order. The
# ProductCount column is stored on df and read again by the Question 6 cell.
df['ProductCount'] = df.groupby('OrderID')['Product Name'].transform('nunique')

# A large basket is 10 or more distinct products, the same cutoff Questions 2,
# 3 and 6 apply; using a different value here would rank products against a
# different population of orders than the rest of the analysis.
large_basket_orders = df[df['ProductCount'] >= 10]

# Row counts per product within large-basket orders, highest first.
product_counts_in_large_baskets = large_basket_orders['Product Name'].value_counts()
top_n_products_large_basket = product_counts_in_large_baskets.head(5)

print("Top 5 products linked to large basket shoppers:")

for rank, (product, count) in enumerate(top_n_products_large_basket.items(), start=1):
    print(f"{rank}. {product}, Sold {count} times")

In [None]:
#Question 6
# Hand-maintained lookup from product name to category.
category_map = {
    "Driscolls Blueberries": "Food",
    "Alisan Kitchen Mats": "Home",
    "Organic 2% Milk": "Food",
    "Goodfellow Grey T-shirt": "Apparel",
    "Apple AirPods Pro": "Electronics",
    "Tropicana Orange Juice": "Beverages",
    "Toll House Cookie Dough": "Food",
    "Daves Killer Bread": "Food",
    "Afflux Type-C": "Electronics",
    "Mobil 1 5W30 Oil": "Automotive",
}

# Tag every row with its category; product names missing from the lookup map
# to NaN and are therefore left out of the counts below.
df['Category'] = df['Product Name'].map(category_map)

# Rows belonging to orders with 10 or more distinct products. ProductCount must
# already exist as a column on df (it is not created in this cell).
is_large_basket = df['ProductCount'] >= 10
large_basket_orders = df[is_large_basket]

# Row counts per category, highest first, trimmed to the leading five.
category_counts_in_large_baskets = large_basket_orders['Category'].value_counts()
top_5_categories_large_basket = category_counts_in_large_baskets.iloc[:5]

print("Top 5 categories typical to large-basket customers:")
for i, (category, count) in enumerate(top_5_categories_large_basket.items(), start=1):
    print(f"{i}: {category}, Sold {count} times")

In [None]:
#Question 7
# Bar chart of the top_5_categories_large_basket Series, which must already be
# defined before this cell runs.
fig, ax = plt.subplots(figsize=(10, 6))
top_5_categories_large_basket.plot(kind='bar', color='coral', ax=ax)

ax.set_title("Top 5 Categories Typical to Large-Basket Customers", fontsize=14)
ax.set_xlabel("Category", fontsize=12)
ax.set_ylabel("Number of Products Sold", fontsize=12)

# Tilt the category labels so they stay readable.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()
