In [2]:
import pandas as pd

# Load the product price history
price_history_df = pd.read_excel('/content/price_history.xlsx')

price_history_df.head()

Unnamed: 0,Product Name,Price,Effective Date
0,Barbie with Robbie Chocolate Cookie,3.75,2022-07-26
1,Barbie with Robbie Chocolate Cookie,4.1,2022-10-24
2,Barbie with Robbie Chocolate Cookie,4.37,2022-12-23
3,Barbie with Robbie Chocolate Cookie,4.18,2023-04-22
4,Barbie with Robbie Croissant,2.04,2022-07-26


In [4]:
!pip3 install faker
from faker import Faker
import random
import numpy as np
from datetime import timedelta, date
from collections import defaultdict

# Set seed for random
random.seed(1234)

# Set seed for numpy
np.random.seed(1234)

# Set seed for faker
fake = Faker()
fake.seed_instance(1234)
# Generate 10000 sales records for 750 unique customers

# Generate a list of unique customer identifiers
customer_ids = list(range(750))

# Generate unique first names, last names, phone numbers, emails, and addresses
first_names = [f"{fake.first_name()}_{id}" for id in customer_ids]
last_names = [fake.last_name() for _ in range(750)]
customer_phones = customer_phones = [f"({str(fake.random_number(digits=10, fix_len=True))[:3]}) {str(fake.random_number(digits=10, fix_len=True))[3:6]}-{str(fake.random_number(digits=10, fix_len=True))[6:]}" for _ in range(750)]
customer_emails = [fake.unique.email() for _ in range(750)]
customer_addresses = [fake.address().replace('\n', ', ') for _ in range(750)]

# Store customer information in a dictionary for easy lookup
customer_info = dict(zip(zip(first_names, last_names), zip(customer_phones, customer_emails, customer_addresses)))

# Create a list of all dates in the past year
start_date = date.today() - timedelta(days=365)
end_date = date.today()
date_list = [start_date + timedelta(days=x) for x in range((end_date-start_date).days + 1)]

# Create a list of all unique products
products = price_history_df['Product Name'].unique().tolist()

# Define a function to get the price of a product on a given date
def get_product_price(product, date):
    applicable_prices = price_history_df[(price_history_df['Product Name'] == product) &
                                         (price_history_df['Effective Date'] <= date)]
    if not applicable_prices.empty:
        return applicable_prices.iloc[-1]['Price']
    else:
        return None

# Generate sales records
sales_records = []
sale_id = 0

# Assign purchases to customers in round-robin fashion to ensure each customer makes at least one purchase
for i in range(10000):
    first_name, last_name = list(customer_info.keys())[i % 750]
    sale_id += 1
    store_id = sale_id % 5 + 1  # Distribute sales across 5 stores
    total_amount = 0
    product_quantity_str = ""
    unit_price_str = ""

    # Each purchase can contain 1-5 unique products
    num_products = np.random.choice(range(1, 6), 1)[0]
    purchased_products = random.sample(products, num_products)

    purchase_date = random.choice(date_list)
    for product in purchased_products:
        quantity = np.random.choice(range(1, 8), 1)[0]  # Quantity ranges from 1 to 7
        price = get_product_price(product, pd.to_datetime(purchase_date))  # Convert to datetime64 before comparison

        if price is not None:
            total_amount += price * quantity
            product_quantity_str += f"{product} * {quantity}, "
            unit_price_str += f"{product} : {price}, "

    phone, email, address = customer_info[(first_name, last_name)]
    sales_records.append([f"{store_id}-{sale_id}", first_name, last_name, product_quantity_str[:-2],
                          unit_price_str[:-2], total_amount, purchase_date, phone, email, address])

# Convert the sales records to a DataFrame
sales_df = pd.DataFrame(sales_records, columns=['Sale ID', 'Customer First Name', 'Customer Last Name', 'Product*Quantity',
                                                'Unit Price for each product', 'Total Amount', 'Sales Date',
                                                'Customer Phone', 'Customer Email', 'Customer Address'])

sales_df['Customer First Name'] = sales_df['Customer First Name'].apply(lambda x: x.split('_')[0])

sales_df.head()


Collecting faker
  Downloading Faker-19.2.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-19.2.0


Unnamed: 0,Sale ID,Customer First Name,Customer Last Name,Product*Quantity,Unit Price for each product,Total Amount,Sales Date,Customer Phone,Customer Email,Customer Address
0,2-1,Tammy,Raymond,"Jenny's Avocado * 7, Boneless Chicken Thigh fr...","Jenny's Avocado : 3.19, Boneless Chicken Thigh...",194.84,2023-05-25,(163) 785-2038,zcollins@example.org,"897 Chris Port Suite 331, Chenstad, VA 29924"
1,3-2,Scott,Johnston,Barbie with Robbie Croissant * 2,Barbie with Robbie Croissant : 2.09,4.18,2023-07-09,(197) 806-9763,charleswilliams@example.com,"131 Johnson Prairie, Hannaborough, AS 94820"
2,4-3,Adrian,Cannon,"Beef Ribeye from Riverside Farm * 2, Boneless ...","Beef Ribeye from Riverside Farm : 18.07, Bonel...",69.2,2023-01-28,(537) 996-9822,kjohnson@example.org,"620 Robert Oval Apt. 056, Collinsburgh, PA 76308"
3,5-4,Rhonda,Moyer,"David's Farm Egg * 7, Barbie with Robbie Choco...","David's Farm Egg : 3.86, Barbie with Robbie Ch...",140.37,2023-01-24,(396) 861-2547,zacharypeters@example.net,"270 Torres Shores Suite 694, South Kimshire, A..."
4,1-5,Sheila,Davis,"Upper West Green Kale * 1, Mckinnon's Organic ...","Upper West Green Kale : 3.69, Mckinnon's Organ...",23.61,2022-10-15,(728) 998-8651,deannawheeler@example.com,"475 Fuller Grove Suite 470, New Lisaport, GU 9..."


In [None]:
sales_df.to_excel("sales_v3.xlsx", index=False)

**customer**

In [5]:
# Create the Dataframe
data_customer = {
    "customer_id": customer_ids,
    "first_name" : first_names,
    "last_name" : last_names,
    "customer_phone" : customer_phones,
    "customer_email" : customer_emails,
    "customer_address" : customer_addresses
}
df_customer = pd.DataFrame(data_customer)


# Printing the Dataframe
print(df_customer)

     customer_id    first_name last_name  customer_phone  \
0              0       Tammy_0   Raymond  (163) 785-2038   
1              1       Scott_1  Johnston  (197) 806-9763   
2              2      Adrian_2    Cannon  (537) 996-9822   
3              3      Rhonda_3     Moyer  (396) 861-2547   
4              4      Sheila_4     Davis  (728) 998-8651   
..           ...           ...       ...             ...   
745          745    Nathan_745  Williams  (363) 398-0608   
746          746     Kelly_746     Chang  (221) 995-9110   
747          747   Bethany_747    Watson  (927) 556-7093   
748          748      Ryan_748     Clark  (275) 328-6703   
749          749  Terrence_749     Ayers  (882) 932-4976   

                   customer_email  \
0            zcollins@example.org   
1     charleswilliams@example.com   
2            kjohnson@example.org   
3       zacharypeters@example.net   
4       deannawheeler@example.com   
..                            ...   
745          brandy2

In [8]:
df_customer.to_excel("customer.xlsx", index=False)

**loyalty**

In [6]:
import numpy as np
import pandas as pd

# Determine the number of customers to be assigned loyalty cards (50%)
num_cards = int(len(df_customer) * 0.5)

# Sample a subset of the customers
sampled_customers = df_customer.sample(num_cards)

# Create a separate counter for card_ids
card_ids_counter = range(1, num_cards + 1)

# Create loyalty card ids
card_ids = ["C" + str(id).zfill(5) for id in card_ids_counter]

# Generate random points between 50 and 1000 for each customer
points = np.random.randint(50, 1001, size=num_cards)

# Create the loyalty Dataframe
data_loyalty = {
    "card_id": card_ids,
    "customer_id" : sampled_customers['customer_id'].values,
    "points" : points
}
df_loyalty = pd.DataFrame(data_loyalty)

# Printing the Dataframe
print(df_loyalty)

    card_id  customer_id  points
0    C00001          536     317
1    C00002          101     610
2    C00003          616     289
3    C00004          699     236
4    C00005          347     115
..      ...          ...     ...
370  C00371          473     749
371  C00372          541     491
372  C00373          551     902
373  C00374          720     472
374  C00375          379     138

[375 rows x 3 columns]


In [7]:
df_loyalty.to_excel("loyalty.xlsx", index=False)