In [None]:
pip install faker

In [39]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime
from google.colab import files

In [40]:
# Initializing faker
fake = Faker()

#Setting the maximum amount of records and customers
num_records = 10000
num_customers = 1000

# Generate unique order_ids
order_ids = [fake.uuid4() for _ in range(num_records)]
store = [random.choice(['Genval', 'Brussels']) for _ in range(num_records)]


#Generating fake dates from today all the way back to Jan, 1st last year
start_date = datetime(datetime.now().year - 1, 1, 1)
end_date = datetime(datetime.now().year, datetime.now().month, datetime.now().day)
dates = [fake.date_between(start_date, end_date) for _ in range(num_records)]

# Generate random customer_ids
customer_ids_max = [fake.uuid4() for _ in range(num_customers)]
customer_ids = [random.choice(customer_ids_max) for _ in range(num_records)]

# Create a DataFrame
orders_df = pd.DataFrame({
    'order_id': order_ids,
    'store': store,
    'order_date': dates,
    'customer_id': customer_ids
})

print(orders_df.shape)
orders_df.head()

(10000, 4)


Unnamed: 0,order_id,store,order_date,customer_id
0,24019896-1ee5-421d-8633-7510e8ae3815,Genval,2023-03-24,7bfe26b1-6cf7-4802-a41d-4f2cbf9c9327
1,8fe2a767-245d-46a6-81be-6217db7d6981,Genval,2023-05-21,052ba7e1-c4de-4d5a-ab00-6e7a14713ae5
2,fd804222-c49b-4b61-8331-731cd2a71a00,Brussels,2023-12-29,efa5d3bb-1840-4877-aab5-2f4f8d138b33
3,173d3ef4-be18-419d-be66-90f864a8d00d,Genval,2023-06-23,482eb64b-0947-4f58-8ee1-90f9bda13fa4
4,b6258bd8-1d0c-48ec-812f-80b84a88221a,Genval,2023-04-25,d48e9290-14b6-464d-b540-3c8a7b8a31dd


In [41]:
# Getting a csv with product information
products_df = pd.read_csv('/content/retail products.csv')

order_lines = []

# Some lists to scatter around the rest of the columns
suffixes = ['Flared', 'Linen', 'Cotton', 'Denim', 'Textured', 'Stretch', 'Comfort','Casual','Leather', 'Basic', 'Plastic', 'Rubber']
condition = ['New', 'New Open Box', 'Refurbished', 'Like New', 'Very Good', 'Good', 'Acceptable','As is']
color_names = ['Red', 'Green', 'Blue', 'Yellow', 'Purple', 'Orange', 'Pink', 'Brown', 'Black', 'White', 'Gray', 'Cyan', 'Magenta', 'Teal', 'Lime', 'Indigo', 'Violet', 'Gold', 'Silver']

# Assign random products to each order
for order_id in orders_df['order_id']:
    num_products = random.randint(1, 5)  # Each order has between 1 to 5 products
    sampled_products = products_df.sample(num_products)

    for _, product in sampled_products.iterrows():
        # Add a random suffix to the product name
        product_name_with_suffix = f"{random.choice(suffixes)} {product['Product']}"

        # Append the order line information
        order_lines.append({
            'order_id': order_id,
            'product_name': product_name_with_suffix,
            'type': product['Product'],
            'price': round(random.uniform(0.0, 1000.0), 2),
            'condition': random.choice(condition),
            'color': random.choice(color_names),
            'brand': product['Brand'],
            'category': product['Category'],
            'sub_category': product['Subcategory']
        })

# Create a DataFrame from the order lines
order_line_df = pd.DataFrame(order_lines)
# Display the first few rows of the 'order_line' DataFrame
print(order_line_df.shape)
order_line_df.head()

(29920, 9)


Unnamed: 0,order_id,product_name,type,price,condition,color,brand,category,sub_category
0,24019896-1ee5-421d-8633-7510e8ae3815,Basic Shoes,Shoes,307.84,Good,Silver,New Balance,Footwear,Athletic
1,8fe2a767-245d-46a6-81be-6217db7d6981,Linen Belt,Belt,418.86,Like New,Indigo,Tommy Hilfiger,Accessories,Belts
2,8fe2a767-245d-46a6-81be-6217db7d6981,Leather Shoes,Shoes,919.55,Refurbished,Violet,Puma,Footwear,Athletic
3,fd804222-c49b-4b61-8331-731cd2a71a00,Linen Handbag,Handbag,311.33,Like New,Lime,Michael Kors,Accessories,Bags
4,fd804222-c49b-4b61-8331-731cd2a71a00,Rubber Necklace,Necklace,715.4,As is,Blue,Fossil,Accessories,Jewelry


In [42]:
# Generate a unique customer dataset based on the fake customer_id created for the orders dataset
num_customers = len(set(customer_ids))

# Generate random customer names
customer_names = [fake.name() for _ in range(num_customers)]

# Generate random genders
genders = [random.choice(['Male', 'Female']) for _ in range(num_customers)]
language = [random.choice(['English', 'French', 'Dutch']) for _ in range(num_customers)]
member = [random.choice([True, False]) for _ in range(num_customers)]
age = [random.randint(18, 50) for _ in range(num_customers)]

# Getting Belgian cities, to plot customers on a map later on, for now only cities, but in the future, maybe will be more specific
cities_data = pd.read_csv('/content/cities.csv')[['name', 'zipCode', 'province']]
cities_data = cities_data.drop_duplicates('zipCode',keep='first')
cities_data = cities_data.rename(columns={'name': 'city'})
cities_data['country'] = 'Belgium'
postcodes = cities_data['zipCode'].tolist()

postcodes_list = [random.choice(postcodes) for _ in range(num_customers)]

# Create a DataFrame
customers_dataset = pd.DataFrame({
    'customer_id': list(set(customer_ids)),
    'customer_name': customer_names,
    'language': language,
    'gender': genders,
    'age': age,
    'is_member': member,
    'zipCode': postcodes_list
})

customers_dataset = pd.merge(customers_dataset, cities_data, on='zipCode', how='left')

print(customers_dataset.shape)
customers_dataset.head()

(1000, 10)


Unnamed: 0,customer_id,customer_name,language,gender,age,is_member,zipCode,city,province,country
0,778a2dcc-b8ad-4a4e-a2f2-5068923b23af,Michael Lyons,Dutch,Female,40,True,7533,Thimougies,Henegouwen,Belgium
1,069298cd-e857-4f6b-ba63-7835bbe46bbf,Lisa Park,Dutch,Male,24,True,3910,Neerpelt,Limburg,Belgium
2,b2ecb6f1-61c0-405b-b44f-04eb504cd2c6,Matthew Coleman,French,Male,46,False,1755,Gooik,Vlaams-Brabant,Belgium
3,7953b5cb-75e7-4136-aeec-098edbe1a5e0,Michelle Wagner,Dutch,Female,40,False,8904,Boezinge,West-Vlaanderen,Belgium
4,eb816a61-50f2-4510-9355-34b586141850,Christina Johnson,Dutch,Male,48,True,2220,Hallaar,Antwerpen,Belgium


In [44]:
orders_and_clients = pd.merge(orders_df, customers_dataset, on='customer_id', how='left')
complete_order_details = pd.merge(orders_and_clients, order_line_df, on='order_id', how='inner')

filename = 'retail_order_details.csv'
complete_order_details.to_csv(filename, index=False)
files.download(filename)

complete_order_details.head()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,order_id,store,order_date,customer_id,customer_name,language,gender,age,is_member,zipCode,...,province,country,product_name,type,price,condition,color,brand,category,sub_category
0,24019896-1ee5-421d-8633-7510e8ae3815,Genval,2023-03-24,7bfe26b1-6cf7-4802-a41d-4f2cbf9c9327,Kyle Lopez,English,Male,19,True,4750,...,Luik,Belgium,Basic Shoes,Shoes,307.84,Good,Silver,New Balance,Footwear,Athletic
1,8fe2a767-245d-46a6-81be-6217db7d6981,Genval,2023-05-21,052ba7e1-c4de-4d5a-ab00-6e7a14713ae5,Heidi Rodriguez,French,Male,50,False,5352,...,Namen,Belgium,Linen Belt,Belt,418.86,Like New,Indigo,Tommy Hilfiger,Accessories,Belts
2,8fe2a767-245d-46a6-81be-6217db7d6981,Genval,2023-05-21,052ba7e1-c4de-4d5a-ab00-6e7a14713ae5,Heidi Rodriguez,French,Male,50,False,5352,...,Namen,Belgium,Leather Shoes,Shoes,919.55,Refurbished,Violet,Puma,Footwear,Athletic
3,fd804222-c49b-4b61-8331-731cd2a71a00,Brussels,2023-12-29,efa5d3bb-1840-4877-aab5-2f4f8d138b33,Natasha Anthony,Dutch,Female,50,False,3803,...,Limburg,Belgium,Linen Handbag,Handbag,311.33,Like New,Lime,Michael Kors,Accessories,Bags
4,fd804222-c49b-4b61-8331-731cd2a71a00,Brussels,2023-12-29,efa5d3bb-1840-4877-aab5-2f4f8d138b33,Natasha Anthony,Dutch,Female,50,False,3803,...,Limburg,Belgium,Rubber Necklace,Necklace,715.4,As is,Blue,Fossil,Accessories,Jewelry
