Data Generation 🛠️🪄🏭

In [22]:
from faker import Faker
import pandas as pd
import numpy as np
import uuid
from datetime import datetime
import sqlite3

fake = Faker('pt_BR')

conn = sqlite3.connect('C:/Users/santo/Documents/GitHubPublished/DataScienceProject/database/ecommerceProject.db')
cursor = conn.cursor()

numYears = 5
customerBase = 1500
customerGrowthRate = 0.2
salesCustomerYear = 3
salesGrowthRate = 0.25

categoriesDict = {
    "Eletronicos": ["Smartphone X", "Tablet Y", "Fone Bluetooth", "TV 4K", "Smartwatch", "Carregador Turbo", "Caixa de Som Bluetooth"],
    "Computadores": ["Notebook Ultra", "PC Gamer", "Monitor Curvo", "Teclado Mecânico", "Mouse RGB", "Webcam Full HD", "SSD 1TB"],
    "Roupas": ["Camiseta Dry Fit", "Jaqueta Jeans", "Tênis Running", "Vestido Casual", "Boné Esportivo", "Mochila Casual", "Óculos de Sol"],
    "Livros": ["Python para Iniciantes", "Data Science Avançado", "O Poder do Hábito", "1984", "Mindset", "Clean Code", "A Arte da Guerra"],
    "Beleza": ["Perfume Elegance", "Kit Skincare", "Batom Matte", "Shampoo Orgânico", "Máscara Facial", "Base Líquida", "Protetor Solar"],
    "Automotivo": ["Pneu Aro 17", "Óleo Sintético", "Câmera de Ré", "Suporte Celular", "Capa para Banco", "Kit Ferramentas", "Lâmpada LED Automotiva"],
    "Brinquedos": ["Carrinho Controle Remoto", "Boneca Fashion", "Lego Criativo", "Quebra-Cabeça 1000pçs", "Jogo Educativo", "Playset Cozinha", "Bola de Vinil"],
    "Esportes": ["Bola de Futebol", "Raquete de Tênis", "Corda de Pular", "Mochila Esportiva", "Kit de Halteres", "Bicicleta Speed", "Luvas de Boxe"],
    "Moveis": ["Sofá Retrátil", "Mesa de Jantar", "Cadeira Gamer", "Guarda-Roupa 6 Portas", "Cama Box Queen", "Estante de Livros", "Mesa de Escritório"],
    "Eletrodomesticos": ["Geladeira Frost Free", "Micro-ondas Inox", "Máquina de Lavar", "Aspirador de Pó", "Fogão 5 Bocas", "Cafeteira Elétrica", "Liquidificador"],
    "Ferramentas": ["Furadeira Elétrica", "Chave de Fenda", "Serra Circular", "Martelo Reforçado", "Trena Digital", "Alicate Universal", "Kit Brocas"],
    "Petshop": ["Ração Premium", "Coleira Ajustável", "Brinquedo Interativo", "Cama para Cachorro", "Areia para Gato", "Shampoo para Pets", "Arranhador para Gatos"],
    "Perfumaria": ["Desodorante Roll-on", "Hidratante Corporal", "Shampoo Anticaspa", "Sabonete Líquido", "Óleo Capilar", "Condicionador Nutritivo", "Creme para Mãos"],
    "Papelaria": ["Caderno Universitário", "Caneta Esferográfica", "Marcador Permanente", "Papel Sulfite A4", "Planner Diário", "Grampeador", "Estojo Organizador"],
    "Games": ["Console NextGen", "Controle Sem Fio", "Teclado Gamer RGB", "Headset Surround", "Cadeira Gamer Pro", "Mousepad XL", "Cartão Presente PSN"]
}

Clientes

In [23]:
def generateCustomer(customerBase, customerGrowthRate, numYears):
    customerYear = [int(customerBase * (1 + customerGrowthRate) ** years) for years in range(numYears)]
    allCustomers = []
    customerInfo = {'customer_id': {}, 'address': {}, 'email': {}, 'birth_date': {}, 'state': {}}

    for year, numCustomer in enumerate(customerYear):
        namesYear = [fake.name() for _ in range(numCustomer)]
        customersIdsYear = [str(uuid.uuid4()) for _ in range(numCustomer)]
        addressYear = [fake.address() for _ in range(numCustomer)]
        emailsYear = [fake.email() for _ in range(numCustomer)]
        birthDateYear = [fake.date_of_birth(minimum_age=18, maximum_age=80) for _ in range(numCustomer)]
        stateYear = [fake.state() for _ in range(numCustomer)]

        for name, id, address, email, birthDate, state in zip(namesYear, customersIdsYear, addressYear, emailsYear, birthDateYear, stateYear):
            allCustomers.append(name)
            customerInfo['customer_id'][name] = id
            customerInfo['address'][name] = address
            customerInfo['email'][name] = email
            customerInfo['birth_date'][name] = birthDate
            customerInfo['state'][name] = state

    return allCustomers, customerInfo

In [25]:
def generateSales(numYears, salesCustomerYear, salesGrowthRate, categoriesDict, customerInfo):
    ordersData = []
    ordersItemsData = []
    currentYear = datetime.now().year
    productsIdsWeek = {}

    for years in range(numYears):
        salesYear = int(salesCustomerYear * (1 + salesGrowthRate) ** years * len(customerInfo['customer_id']))
        dateYear = pd.to_datetime([datetime(currentYear - years, 1, 1) + pd.to_timedelta(np.random.randint(0, 365), unit='D') for _ in range(salesYear)])
        custumerYear = np.random.choice(list(customerInfo['customer_id'].keys()), salesYear)

        for date, customer in zip(dateYear, custumerYear):
            salesUUID = str(uuid.uuid4())
            customerId = customerInfo['customer_id'][customer]
            address = customerInfo['address'][customer].split(',')[-1].strip()
            state = customerInfo['state'][customer]
            email = customerInfo['email'][customer]
            birthDate = customerInfo['birth_date'][customer]
            numProducts = np.random.randint(1, 6)
            totalSalesValue = 0
            items = []

            weekNumber = date.isocalendar()[1]
            yearsNumber = date.isocalendar()[0]

            for _ in range(numProducts):
                category = np.random.choice(list(categoriesDict.keys()))
                product = np.random.choice(categoriesDict[category])
                weekKey = (yearsNumber, weekNumber, product)
                
                if weekKey in productsIdsWeek:
                    productUUID = productsIdsWeek[weekKey]
                else:
                    productUUID = str(uuid.uuid4())[:8]
                    productsIdsWeek[weekKey] = productUUID

                categoryUUID = str(uuid.uuid4())
                unitPrice = np.round(np.random.uniform(25, 600), 2)
                quantity = np.random.randint(1, 14)
                salesValue = quantity * unitPrice
                totalSalesValue += salesValue
                stockQuantity = np.random.randint(0, 300)
                itemsUUID = str(uuid.uuid4())

                itemData = {
                    'items_id': itemsUUID,
                    'order_id': salesUUID,
                    'category_name': category,
                    'category_id': categoryUUID,
                    'product_name': product,
                    'product_id': productUUID,
                    'order_quantity': quantity,
                    'unit_price': unitPrice,
                    'order_value': salesValue,
                    'stock_quantity': stockQuantity,
                }
                ordersItemsData.append(itemData)
                
            orderData = {
                'order_date': date,
                'customer_name': customer,
                'customer_id': customerId,
                'address': address,
                'state': state,
                'email': email,
                'customer_birth_date': birthDate,
                'order_id': salesUUID,
                'order_value': totalSalesValue,  
            }
            ordersData.append(orderData)
            
    return pd.DataFrame(ordersData), pd.DataFrame(ordersItemsData)

def generateCategories(categoriesDict):
    categoriesData = []
    for category_name in categoriesDict.keys():
        category_id = str(uuid.uuid4())
        categoriesData.append({'category_name': category_name, 'category_id': category_id})

    return pd.DataFrame(categoriesData)
            

allCustomers, customerInfo = generateCustomer(customerBase, customerGrowthRate, numYears)
df_orders, df_order_items = generateSales(numYears, salesCustomerYear, salesGrowthRate, categoriesDict, customerInfo)

print('DataFrame criado com sucesso\n')

DataFrame criado com sucesso



In [26]:
df_customers = df_orders[['customer_id', 'customer_name', 'address', 'state', 'email', 'customer_birth_date']].drop_duplicates(subset=['customer_id'])
df_products = df_order_items[['product_id', 'product_name', 'category_id', 'unit_price', 'stock_quantity']].drop_duplicates(subset=['product_id'])
df_categories = generateCategories(categoriesDict)

TEXT_PRIMARY_KEY = 'TEXT PRIMARY KEY'

dtypeCustomer = {
    'customer_id': TEXT_PRIMARY_KEY,
    'customer_name': 'TEXT',
    'address': 'TEXT',
    'state': 'TEXT',
    'email': 'TEXT',
    'customer_birth_date': 'date',
}

dtypeOrder = {
    'order_id': TEXT_PRIMARY_KEY,
    'order_value': 'REAL',
    'FOREIGN KEY (customer_id)': 'REFERENCES customer(customer_id)',
    'order_date': 'date',
}

dtypeProducts = {
    'product_id': TEXT_PRIMARY_KEY,
    'product_name': 'TEXT',
    'FOREIGN KEY (category_id)': 'REFERENCES categories(category_id)',
    'unit_price': 'REAL',
    'stock_quantity': 'INTEGER',
}

dtypeCategories = {
    'category_name': 'TEXT',
    'category_id': TEXT_PRIMARY_KEY,
}

dtypeOrderItems = {
    'items_id': TEXT_PRIMARY_KEY,
    'FOREIGN KEY (product_id)': 'REFERENCES products(product_id)',
    'FOREIGN KEY (order_id)': 'REFERENCES orders(order_id)',
    'unit_price': 'REAL',
    'order_quantity': 'INTEGER',
}

In [None]:
print(f"Número de linhas em df_orders antes de remover duplicatas: {len(df_orders)}")
df_orders = df_orders.drop_duplicates(subset=['order_id'])
print(f"Número de linhas em df_orders depois de remover duplicatas: {len(df_orders)}")
print(f"Número de order_id únicos em df_orders: {df_orders['order_id'].nunique()}")

print(f"Número de linhas em df_order_items antes de remover duplicatas: {len(df_order_items)}")
df_order_items = df_order_items.drop_duplicates(subset=['items_id'])
print(f"Número de linhas em df_order_items depois de remover duplicatas: {len(df_order_items)}")
print(f"Número de items_id únicos em df_order_items: {df_order_items['items_id'].nunique()}")

df_customers.to_sql('customers', conn, if_exists='replace', index=False, dtype=dtypeCustomer)
df_orders.to_sql('orders', conn, if_exists='replace', index=False, dtype=dtypeOrder)
df_products.to_sql('products', conn, if_exists='replace', index=False, dtype=dtypeProducts)
df_categories.to_sql('categories', conn, if_exists='replace', index=False, dtype=dtypeCategories)
df_order_items.to_sql('order_items', conn, if_exists='replace', index=False, dtype=dtypeOrderItems)

dictQuery = {
    "queryCustomers":'SELECT * FROM customers LIMIT 5',
    "queryOrders":'SELECT * FROM orders LIMIT 5',
    "queryProducts":'SELECT * FROM products LIMIT 5',
    "queryCategories":'SELECT * FROM categories LIMIT 5',
    "queryOrderItens":'SELECT * FROM order_items LIMIT 5'
}

def runQuery(queryKey, params=()):
    query = dictQuery[queryKey]
    return pd.read_sql(query, conn, params=params)

tCustomers = runQuery("queryCustomers")
tOrders = runQuery("queryOrders")
tProducts = runQuery("queryProducts")
tCategories = runQuery("queryCategories")
tOrderItens = runQuery("queryOrderItens")

print('\nBanco de dados atualizado com sucesso')

conn.close()

Número de linhas em df_orders antes de remover duplicatas: 240103
Número de linhas em df_orders depois de remover duplicatas: 240103
Número de order_id únicos em df_orders: 240103
Número de linhas em df_order_items antes de remover duplicatas: 720702
Número de linhas em df_order_items depois de remover duplicatas: 720702
Número de items_id únicos em df_order_items: 720702

Banco de dados atualizado com sucesso
