In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, time

In [3]:
# Seed NumPy's global RNG so the synthetic fashion orders come out the same on every run
np.random.seed(42)

# Sample 200 order dates (with replacement) from the 365 consecutive days starting 2023-01-01
start_date = datetime(2023, 1, 1)
dates = np.random.choice(
    [start_date + timedelta(days=offset) for offset in range(365)],
    size=200,
)

# Pool of 150 customer identifiers (CUST_0001 .. CUST_0150)
customer_ids = [f'CUST_{i:04d}' for i in range(1, 151)]

# Each category maps to the subcategories that may be paired with it;
# the category list is simply the mapping's keys in declaration order
subcategories = {
    'Vestidos': ['Casual', 'Festa', 'Praia'],
    'Calças': ['Jeans', 'Social', 'Moletom'],
    'Blusas': ['T-shirt', 'Social', 'Moletom'],
    'Saias': ['Mini', 'Midi', 'Longa'],
    'Acessórios': ['Bolsas', 'Cintos', 'Bijuterias']
}
categories = list(subcategories)

# Build the columns one at a time. The order of the random draws matters:
# it determines which values each column receives for the fixed seed.
data = {'order_id': [f'ORDER_{i:04d}' for i in range(1, 201)]}
data['date'] = sorted(dates)
data['customer_id'] = np.random.choice(customer_ids, size=200)
data['category'] = np.random.choice(categories, size=200)
data['price'] = np.random.uniform(50, 500, size=200).round(2)
data['size'] = np.random.choice(['P', 'M', 'G', 'GG'], size=200)
data['color'] = np.random.choice(['Preto', 'Branco', 'Azul', 'Vermelho', 'Rosa'], size=200)
# Ratings are skewed towards 4 and 5 stars
data['rating'] = np.random.choice([1, 2, 3, 4, 5], size=200, p=[0.05, 0.1, 0.15, 0.3, 0.4])
# 15% of orders are flagged as returned
data['returned'] = np.random.choice([True, False], size=200, p=[0.15, 0.85])
data['payment_method'] = np.random.choice(['Credit Card', 'Debit Card', 'PIX', 'Boleto'], size=200)

# Pick, row by row, one subcategory that belongs to the row's category
data['subcategory'] = list(
    map(lambda cat: np.random.choice(subcategories[cat]), data['category'])
)

# A reason is drawn for every row but kept only where the order was returned;
# all other rows get None
return_reasons = ['Wrong Size', 'Different from Picture', 'Quality Issues', 'Changed Mind', 'Damaged']
df_fashion = pd.DataFrame(data).assign(
    return_reason=lambda frame: np.where(
        frame['returned'],
        np.random.choice(return_reasons, size=200),
        None,
    )
)

# Preview the first five orders, then report the frame's dimensions and column names
display(df_fashion.head(5))
print("\nDataset Shape:", df_fashion.shape)
print("\nColumns:", list(df_fashion.columns))

Unnamed: 0,order_id,date,customer_id,category,price,size,color,rating,returned,payment_method,subcategory,return_reason
0,ORDER_0001,2023-01-02,CUST_0130,Calças,307.16,GG,Azul,5,False,PIX,Jeans,
1,ORDER_0002,2023-01-05,CUST_0131,Acessórios,383.44,M,Vermelho,4,False,Boleto,Bolsas,
2,ORDER_0003,2023-01-09,CUST_0113,Calças,395.24,GG,Rosa,5,False,Credit Card,Jeans,
3,ORDER_0004,2023-01-13,CUST_0101,Acessórios,420.26,M,Preto,5,False,Credit Card,Bijuterias,
4,ORDER_0005,2023-01-14,CUST_0113,Vestidos,384.9,G,Preto,4,False,Credit Card,Festa,



Dataset Shape: (200, 12)

Columns: ['order_id', 'date', 'customer_id', 'category', 'price', 'size', 'color', 'rating', 'returned', 'payment_method', 'subcategory', 'return_reason']


In [4]:
# Write the fashion orders to CSV without the index column.
# to_csv does not create missing folders, so make sure ./data exists first.
import os

os.makedirs('./data', exist_ok=True)
df_fashion.to_csv('./data/fashion_sales.csv', index=False)

In [8]:
# Seed NumPy's global RNG so the synthetic delivery orders are reproducible
np.random.seed(42)

# Sample 200 order dates (with replacement) from the 365 consecutive days starting 2023-01-01
start_date = datetime(2023, 1, 1)
dates = [start_date + timedelta(days=x) for x in range(365)]
dates = np.random.choice(dates, size=200)

# Candidate order times on a 15-minute grid: lunch 11:00-14:45 and dinner 18:00-22:45
lunch_times = [time(hour=h, minute=m) for h in range(11,15) for m in range(0,60,15)]
dinner_times = [time(hour=h, minute=m) for h in range(18,23) for m in range(0,60,15)]
all_times = lunch_times + dinner_times

# Restaurants that can receive an order
restaurants = [
    'Sabor Caseiro', 'Pizza Express', 'Burger King', 'Sushi House',
    'Veggie Life', 'Taco Feliz', 'Pasta Bella', 'Food Box'
]

# Restaurant name -> cuisine category
categories = {
    'Sabor Caseiro': 'Brasileira',
    'Pizza Express': 'Pizza',
    'Burger King': 'Hambúrguer',
    'Sushi House': 'Japonesa',
    'Veggie Life': 'Vegetariana',
    'Taco Feliz': 'Mexicana',
    'Pasta Bella': 'Italiana',
    'Food Box': 'Variada'
}

# Floors for the normally distributed columns. A normal draw is unbounded, so
# without a floor an order could end up with a zero/negative delivery time or value.
MIN_DELIVERY_MINUTES = 1
MIN_ORDER_VALUE = 1.0

# Generate the order columns; every random column draws 200 values
data = {
    'order_id': [f'ORDER_{i:04d}' for i in range(1, 201)],
    'date': sorted(dates),
    'time': np.random.choice(all_times, size=200),
    'restaurant': np.random.choice(restaurants, size=200),
    # Minutes, centred on 35 with a standard deviation of 10, floored at MIN_DELIVERY_MINUTES
    'delivery_time': np.random.normal(35, 10, size=200).round().astype(int).clip(min=MIN_DELIVERY_MINUTES),
    # Centred on 80 with a standard deviation of 30, floored at MIN_ORDER_VALUE
    'order_value': np.random.normal(80, 30, size=200).clip(min=MIN_ORDER_VALUE).round(2),
    'items_quantity': np.random.randint(1, 6, size=200),
    # Ratings are skewed towards 4 and 5 stars
    'delivery_rating': np.random.choice([1, 2, 3, 4, 5], size=200, p=[0.05, 0.1, 0.15, 0.3, 0.4]),
    'restaurant_rating': np.random.choice([1, 2, 3, 4, 5], size=200, p=[0.05, 0.1, 0.15, 0.3, 0.4]),
    'region': np.random.choice(['Norte', 'Sul', 'Leste', 'Oeste', 'Centro'], size=200),
    'payment_method': np.random.choice(['Credit Card', 'Debit Card', 'PIX', 'Cash'], size=200)
}

# Create DataFrame
df_delivery = pd.DataFrame(data)

# Look up each restaurant's cuisine category
df_delivery['restaurant_category'] = df_delivery['restaurant'].map(categories)

# Deliveries that took at most 45 minutes count as on time
df_delivery['delivery_status'] = np.where(
    df_delivery['delivery_time'] <= 45,
    'On Time',
    'Delayed'
)

# Additional order details
# NOTE(review): order_items is drawn independently of items_quantity although both
# are item counts in 1-5, so the two columns can disagree per row — confirm both are wanted.
df_delivery['order_items'] = np.random.randint(1, 6, size=200)  # Number of items per order
df_delivery['has_drinks'] = np.random.choice([True, False], size=200)  # Whether the order includes drinks
df_delivery['is_scheduled'] = np.random.choice([True, False], size=200, p=[0.2, 0.8])  # Whether the order was scheduled

# Preview the first five delivery orders, then report the frame's dimensions and column names
display(df_delivery.head(5))
print("\nFormato do DataFrame:", df_delivery.shape)
print("\nColunas:", list(df_delivery.columns))



Unnamed: 0,order_id,date,time,restaurant,delivery_time,order_value,items_quantity,delivery_rating,restaurant_rating,region,payment_method,restaurant_category,delivery_status,order_items,has_drinks,is_scheduled
0,ORDER_0001,2023-01-02,11:15:00,Sushi House,41,45.83,4,4,5,Centro,Debit Card,Japonesa,On Time,5,True,False
1,ORDER_0002,2023-01-05,11:30:00,Pizza Express,49,88.03,3,2,4,Oeste,Debit Card,Pizza,Delayed,1,True,False
2,ORDER_0003,2023-01-09,18:00:00,Food Box,31,163.51,2,5,5,Norte,Debit Card,Variada,On Time,1,False,False
3,ORDER_0004,2023-01-13,11:15:00,Burger King,37,31.67,5,3,2,Oeste,PIX,Hambúrguer,On Time,1,True,False
4,ORDER_0005,2023-01-14,11:15:00,Veggie Life,41,75.54,5,2,2,Leste,Cash,Vegetariana,On Time,1,False,False



Formato do DataFrame: (200, 16)

Colunas: ['order_id', 'date', 'time', 'restaurant', 'delivery_time', 'order_value', 'items_quantity', 'delivery_rating', 'restaurant_rating', 'region', 'payment_method', 'restaurant_category', 'delivery_status', 'order_items', 'has_drinks', 'is_scheduled']


In [19]:
# Save to CSV without the index column.
# to_csv does not create missing folders, so make sure ./data exists first.
import os

os.makedirs('./data', exist_ok=True)
df_delivery.to_csv('./data/food_delivery.csv', index=False)

In [10]:
# Seed NumPy's global RNG so the generated check-ins are reproducible
np.random.seed(42)

# Sample 200 check-in dates (with replacement) from the 365 consecutive days starting 2023-01-01
start_date = datetime(2023, 1, 1)
dates = np.random.choice(
    [start_date + timedelta(days=offset) for offset in range(365)],
    size=200,
)

# Pool of 100 member identifiers (MEMBER_0001 .. MEMBER_0100)
member_ids = [f'MEMBER_{i:04d}' for i in range(1, 101)]

# Activity name -> activity category, listed as (activity, category) pairs
activities = dict([
    ('Musculação', 'Força'),
    ('Yoga', 'Bem-estar'),
    ('Spinning', 'Cardio'),
    ('Zumba', 'Dança'),
    ('CrossFit', 'Funcional'),
    ('Pilates', 'Bem-estar'),
    ('Boxing', 'Luta'),
    ('Jump', 'Cardio'),
])

# Generate check-in times with peak hours
def generate_time():
    """Return one random check-in time that falls inside a peak window.

    A window is chosen first (morning 30%, lunch 20%, evening 50%); then an
    hour inside that window and a minute in 0-59 are drawn from NumPy's global
    RNG. The second number of each window is exclusive because
    ``np.random.randint`` excludes its upper bound, so the possible hours are
    6-7, 12-13 and 17-20.
    """
    morning, lunch, evening = (6, 8), (12, 14), (17, 21)
    windows = [morning, lunch, evening]

    # Evening is the most likely window, lunch the least likely
    first_hour, end_hour = windows[np.random.choice([0, 1, 2], p=[0.3, 0.2, 0.5])]

    hour = np.random.randint(first_hour, end_hour)
    minute = np.random.randint(0, 60)
    return time(hour, minute)

# Generate the check-in columns; every random column draws 200 values
data = {
    'check_in_id': [f'CHECK_{i:04d}' for i in range(1, 201)],
    'date': sorted(dates),
    'time': [generate_time() for _ in range(200)],
    'member_id': np.random.choice(member_ids, size=200),
    'activity': np.random.choice(list(activities.keys()), size=200),
    # Normal draws are unbounded, so floor them at 1 to rule out zero or
    # negative session lengths and calorie counts.
    'duration_minutes': np.random.normal(60, 15, size=200).round().astype(int).clip(min=1),
    'calories_burned': np.random.normal(300, 100, size=200).round().astype(int).clip(min=1),
    # NOTE(review): age, gender and membership_type are drawn per check-in row, not
    # per member_id, so one member can appear with different values — confirm intended.
    'age': np.random.randint(18, 65, size=200),  # 18..64 (upper bound exclusive)
    'gender': np.random.choice(['M', 'F'], size=200),
    'membership_type': np.random.choice(['Basic', 'Premium', 'VIP'], size=200),
    'attendance_streak': np.random.randint(1, 30, size=200)  # 1..29 (upper bound exclusive)
}

# Create DataFrame
df_fitness = pd.DataFrame(data)

# Look up each activity's category
df_fitness['activity_category'] = df_fitness['activity'].map(activities)

# Satisfaction scores are skewed towards 4 and 5
df_fitness['satisfaction_score'] = np.random.choice([1, 2, 3, 4, 5], size=200, p=[0.05, 0.1, 0.15, 0.3, 0.4])

# 10% of rows are flagged as cancelled. A reason is drawn for every row but kept
# only where cancelled is True; all other rows get None.
df_fitness['cancelled'] = np.random.choice([True, False], size=200, p=[0.1, 0.9])
cancellation_reasons = ['Mudança de endereço', 'Custo alto', 'Falta de tempo', 'Insatisfação', 'Outros']
df_fitness['cancellation_reason'] = np.where(
    df_fitness['cancelled'],
    np.random.choice(cancellation_reasons, size=200),
    None
)

# Preview the first five check-ins, then report the frame's dimensions and column names
display(df_fitness.head(5))
print("\nDataset Shape:", df_fitness.shape)
print("\nColumns:", list(df_fitness.columns))

Unnamed: 0,check_in_id,date,time,member_id,activity,duration_minutes,calories_burned,age,gender,membership_type,attendance_streak,activity_category,satisfaction_score,cancelled,cancellation_reason
0,CHECK_0001,2023-01-02,12:36:00,MEMBER_0050,Pilates,66,257,29,M,Premium,18,Bem-estar,5,False,
1,CHECK_0002,2023-01-05,06:58:00,MEMBER_0012,Pilates,88,269,25,M,Basic,14,Bem-estar,1,False,
2,CHECK_0003,2023-01-09,18:27:00,MEMBER_0065,Musculação,48,322,25,M,Basic,20,Força,2,False,
3,CHECK_0004,2023-01-13,06:31:00,MEMBER_0054,Pilates,41,252,45,F,Premium,18,Bem-estar,5,False,
4,CHECK_0005,2023-01-14,19:01:00,MEMBER_0005,Jump,33,426,37,M,VIP,10,Cardio,2,True,Outros



Dataset Shape: (200, 15)

Columns: ['check_in_id', 'date', 'time', 'member_id', 'activity', 'duration_minutes', 'calories_burned', 'age', 'gender', 'membership_type', 'attendance_streak', 'activity_category', 'satisfaction_score', 'cancelled', 'cancellation_reason']


In [12]:
# Write the gym check-ins to CSV without the index column.
# to_csv does not create missing folders, so make sure ./data exists first.
import os

os.makedirs('./data', exist_ok=True)
df_fitness.to_csv('./data/fitness_gym.csv', index=False)

In [16]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, time

# Seed NumPy's global RNG so the generated plays are reproducible
np.random.seed(42)

# Sample 200 play dates (with replacement) from the 365 consecutive days starting 2023-01-01
start_date = datetime(2023, 1, 1)
dates = np.random.choice(
    [start_date + timedelta(days=offset) for offset in range(365)],
    size=200,
)

# Simplified time generation
def generate_times(size=200):
    """Draw random play times, with peak hours over-represented.

    Candidate times are built on a 15-minute grid for the morning (06:00-09:45)
    and evening (17:00-22:45) peaks and on a 30-minute grid for the rest of the
    day. The peak lists are included twice in the candidate pool, so each peak
    slot is twice as likely to be drawn as an off-peak slot.

    Args:
        size: Number of times to draw. Defaults to 200, the value that used to
            be hard-coded, so existing ``generate_times()`` calls are unchanged.

    Returns:
        The result of ``np.random.choice`` over the candidate pool: ``size``
        ``datetime.time`` values drawn with replacement from NumPy's global RNG.
    """
    morning_times = [time(hour=h, minute=m) for h in range(6,10) for m in range(0,60,15)]  # Morning peak
    day_times = [time(hour=h, minute=m) for h in range(10,17) for m in range(0,60,30)]     # Daytime
    night_times = [time(hour=h, minute=m) for h in range(17,23) for m in range(0,60,15)]   # Evening peak
    late_times = [time(hour=h, minute=m) for h in range(23,24) for m in range(0,60,30)]    # Late night
    dawn_times = [time(hour=h, minute=m) for h in range(0,6) for m in range(0,60,30)]      # Small hours

    # Peak lists appear twice to double their sampling weight
    all_times = morning_times*2 + day_times + night_times*2 + late_times + dawn_times
    return np.random.choice(all_times, size=size)

# Music genre -> subgenres that may be paired with it
genres = {
    'Rock': ['Classic Rock', 'Alternative', 'Metal', 'Indie'],
    'Pop': ['Pop Internacional', 'Pop Brasil', 'K-pop'],
    'Electronic': ['House', 'Techno', 'Trance'],
    'Hip Hop': ['Rap', 'Trap', 'R&B'],
    'MPB': ['Samba', 'Bossa Nova', 'Forró'],
    'Jazz': ['Bebop', 'Swing', 'Fusion']
}

# Pool of 100 user identifiers (USER_0001 .. USER_0100)
user_ids = [f'USER_{i:04d}' for i in range(1, 101)]

# Generate the play columns; every random column draws 200 values
data = {
    'play_id': [f'PLAY_{i:04d}' for i in range(1, 201)],
    'date': sorted(dates),
    'time': generate_times(),
    'user_id': np.random.choice(user_ids, size=200),
    'genre': np.random.choice(list(genres.keys()), size=200),
    # Normal draws are unbounded, so floor at 1 second to rule out zero or
    # negative play durations.
    'duration_seconds': np.random.normal(210, 60, size=200).round().astype(int).clip(min=1),
    'platform': np.random.choice(['Mobile', 'Desktop', 'Web', 'Smart TV'], size=200),
    'subscription_type': np.random.choice(['Free', 'Premium', 'Family'], size=200),
    'stream_quality': np.random.choice(['Low', 'Medium', 'High', 'Ultra'], size=200),
    'offline_mode': np.random.choice([True, False], size=200, p=[0.3, 0.7])
}

# Create DataFrame
df_streaming = pd.DataFrame(data)

# Pick, row by row, one subgenre that belongs to the row's genre
df_streaming['subgenre'] = [np.random.choice(genres[genre]) for genre in df_streaming['genre']]

# 20% of plays are flagged as skipped. A reason is drawn for every row but kept
# only where skipped is True; all other rows get None.
df_streaming['skipped'] = np.random.choice([True, False], size=200, p=[0.2, 0.8])
skip_reasons = ['Não gostou', 'Música repetida', 'Mudança de humor', 'Interrupção', 'Outro']
df_streaming['skip_reason'] = np.where(
    df_streaming['skipped'],
    np.random.choice(skip_reasons, size=200),
    None
)

# Add user interaction metrics
df_streaming['liked'] = np.random.choice([True, False], size=200, p=[0.3, 0.7])
df_streaming['added_to_playlist'] = np.random.choice([True, False], size=200, p=[0.15, 0.85])
df_streaming['share_count'] = np.random.randint(0, 5, size=200)  # 0..4 (upper bound exclusive)

# Add user demographics and preferences
# NOTE(review): user_age is drawn per play row, not per user_id, so one user can
# appear with different ages — confirm intended.
df_streaming['user_age'] = np.random.randint(16, 65, size=200)  # 16..64 (upper bound exclusive)
df_streaming['listening_device'] = np.random.choice(['Smartphone', 'Computer', 'Tablet', 'Smart Speaker'], size=200)
df_streaming['audio_quality_setting'] = np.random.choice(['Auto', 'High', 'Normal', 'Low'], size=200)


# Preview the first five plays, then report the frame's dimensions and column names
display(df_streaming.head(5))
print("\nFormato do DataFrame:", df_streaming.shape)
print("\nColunas:", list(df_streaming.columns))



Unnamed: 0,play_id,date,time,user_id,genre,duration_seconds,platform,subscription_type,stream_quality,offline_mode,subgenre,skipped,skip_reason,liked,added_to_playlist,share_count,user_age,listening_device,audio_quality_setting
0,PLAY_0001,2023-01-02,06:15:00,USER_0009,Rock,63,Web,Free,High,True,Metal,False,,True,False,4,55,Smartphone,Auto
1,PLAY_0002,2023-01-05,06:30:00,USER_0043,Electronic,156,Desktop,Free,Low,True,House,False,,False,False,2,35,Computer,Normal
2,PLAY_0003,2023-01-09,02:00:00,USER_0048,Rock,203,Mobile,Family,Low,False,Indie,True,Não gostou,False,False,0,64,Tablet,Auto
3,PLAY_0004,2023-01-13,19:15:00,USER_0039,Jazz,207,Desktop,Family,Low,False,Fusion,False,,False,False,4,48,Computer,Normal
4,PLAY_0005,2023-01-14,19:30:00,USER_0093,Rock,226,Desktop,Family,Medium,True,Classic Rock,False,,False,True,4,27,Smartphone,Auto



Formato do DataFrame: (200, 19)

Colunas: ['play_id', 'date', 'time', 'user_id', 'genre', 'duration_seconds', 'platform', 'subscription_type', 'stream_quality', 'offline_mode', 'subgenre', 'skipped', 'skip_reason', 'liked', 'added_to_playlist', 'share_count', 'user_age', 'listening_device', 'audio_quality_setting']


In [18]:
# Save to CSV without the index column.
# to_csv does not create missing folders, so make sure ./data exists first.
import os

os.makedirs('./data', exist_ok=True)
df_streaming.to_csv('./data/music_streaming.csv', index=False)