In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Helper for generating random dates
def random_date(start, end):
    """Return ``start`` shifted forward by a random whole number of days.

    The shift is drawn with ``random.randint`` from 0 up to and including
    ``(end - start).days``, so both ``start`` and ``end`` themselves can be
    returned when they are whole days apart.
    """
    span_days = (end - start).days
    offset_days = random.randint(0, span_days)
    return start + timedelta(days=offset_days)

# Helper for generating retention probabilities
def generate_retention_probabilities(n_days):
    """Build a step-shaped array of probabilities of length ``n_days``.

    Index ``d`` of the result holds the probability for day ``d``:

    * days 0-6   -> 0.5  (first week)
    * days 7-20  -> 0.3  (weeks 2-3)
    * days 21-41 -> 0.1  (weeks 4-6)
    * days 42+   -> 0.05 (after six weeks)

    Windows that fall beyond ``n_days`` are simply cut off by slicing.
    """
    retention_prob = np.zeros(n_days)
    windows = (
        (slice(None, 7), 0.5),
        (slice(7, 21), 0.3),
        (slice(21, 42), 0.1),
        (slice(42, None), 0.05),
    )
    for day_window, probability in windows:
        retention_prob[day_window] = probability
    return retention_prob

# Generate data for the customers table
def generate_customers(n_days, avg_customers_per_day):
    """Create the customers table with a fixed number of signups per day.

    Produces ``n_days * avg_customers_per_day`` customers. Ids run from 1
    upwards; signup dates start at 2020-01-01 and advance by one day after
    every ``avg_customers_per_day`` customers; the region is drawn at random
    from four cities with ``np.random.choice``.

    Returns a DataFrame with columns ``customer_id``, ``region`` and
    ``signup_date``.
    """
    total_customers = n_days * avg_customers_per_day
    first_signup_day = datetime(2020, 1, 1)

    # Customer number `index` (0-based) signs up on day `index // avg_customers_per_day`.
    signup_dates = np.array([
        first_signup_day + timedelta(days=index // avg_customers_per_day)
        for index in range(total_customers)
    ])
    regions = np.random.choice(
        ['Москва', 'Тюмень', 'Новосибирск', 'Санкт-Петербург'],
        total_customers,
    )

    table = {
        'customer_id': np.arange(1, total_customers + 1),
        'region': regions,
        'signup_date': signup_dates,
    }
    return pd.DataFrame(table)

# Generate data for the products table
def generate_products(n):
    """Create a product catalogue with ``n`` rows.

    Product ids run from 1 to ``n``. Prices are drawn with
    ``np.random.uniform(50, 2000)`` and rounded to two decimals; the category
    of each product is picked at random from four fixed categories.

    Returns a DataFrame with columns ``product_id``, ``category`` and
    ``price``.
    """
    categories = ['телефоны', 'наушники', 'аксессуары', 'аккумуляторы']

    # Prices are drawn before categories, so the random stream is consumed
    # in the same order on every call.
    raw_prices = np.random.uniform(50, 2000, n)
    product_categories = np.random.choice(categories, n)

    catalogue = {
        'product_id': range(1, n + 1),
        'category': product_categories,
        'price': raw_prices.round(2),
    }
    return pd.DataFrame(catalogue)

# Generate data for the orders table
def generate_orders(customers, products, n_days):
    """Simulate orders day by day for ``n_days`` days starting 2020-01-01.

    For every simulated day:

    1. Customers whose ``signup_date`` is on or before that day are eligible.
    2. Each eligible customer orders that day with the probability returned by
       ``generate_retention_probabilities`` for their days since signup.
    3. Each ordering customer places 1-5 orders (weights 0.5/0.3/0.1/0.07/0.03).
    4. Each order gets a uniform amount in [50, 5000) rounded to two decimals,
       a random platform, and a category drawn from the rows of
       ``products['category']`` (so categories follow the catalogue's mix).

    Args:
        customers: DataFrame with ``customer_id`` and a datetime-typed
            ``signup_date`` column (the ``.dt`` accessor is used on it).
        products: DataFrame with a ``category`` column.
        n_days: Number of days to simulate.

    Returns:
        DataFrame with columns ``order_id``, ``customer_id``, ``order_date``
        (a ``datetime.date``), ``order_amount``, ``platform`` and ``category``.
        The columns are present even when no orders were generated.
    """
    columns = ['order_id', 'customer_id', 'order_date', 'order_amount', 'platform', 'category']
    retention_prob = generate_retention_probabilities(n_days)
    platforms = ['mobile', 'desktop']
    # Loop-invariant: look the category pool up once, not once per order.
    category_pool = products['category'].values
    first_day = datetime(2020, 1, 1)
    orders = []

    for day in range(n_days):
        order_date = first_day + timedelta(days=day)
        cohort_customers = customers[customers['signup_date'] <= order_date]
        days_since_signup = (order_date - cohort_customers['signup_date']).dt.days
        # Clip so the lookup stays inside the probability table.
        order_probability = retention_prob[np.clip(days_since_signup.values, 0, n_days - 1)]

        # One uniform draw per eligible customer decides who orders today.
        rand_vals = np.random.rand(len(cohort_customers))
        buyer_ids = cohort_customers['customer_id'].values[rand_vals < order_probability]

        # Only the id is needed, so iterate over the id array rather than
        # materialising a Series per row with iterrows().
        for customer_id in buyer_ids:
            num_orders = np.random.choice([1, 2, 3, 4, 5], p=[0.5, 0.3, 0.1, 0.07, 0.03])
            for _ in range(num_orders):
                orders.append({
                    'order_id': len(orders) + 1,
                    'customer_id': customer_id,
                    'order_date': order_date.date(),  # date only, no time part
                    'order_amount': round(np.random.uniform(50, 5000), 2),
                    'platform': random.choice(platforms),
                    'category': random.choice(category_pool)
                })

    # Passing `columns` keeps the schema stable when `orders` is empty.
    return pd.DataFrame(orders, columns=columns)

# Parameters
n_days = 365
avg_customers_per_day = 10
n_products = 500
SEED = 42  # fixed seed so a fresh "Restart & Run All" regenerates the same dataset

# The generators draw from both `random` and `np.random`, so seed both.
random.seed(SEED)
np.random.seed(SEED)

# Generate the data
customers = generate_customers(n_days, avg_customers_per_day)
products = generate_products(n_products)
orders = generate_orders(customers, products, n_days)

# Save to CSV (written to the current working directory)
customers.to_csv('customers.csv', index=False)
products.to_csv('products.csv', index=False)
orders.to_csv('orders.csv', index=False)

print("Данные успешно сгенерированы и сохранены в CSV.")

Данные успешно сгенерированы и сохранены в CSV.


In [6]:
# Inspect column dtypes; order_date is stored as Python date objects, hence `object`.
orders.dtypes

order_id          int64
customer_id       int64
order_date       object
order_amount    float64
platform         object
category         object
dtype: object

In [7]:
# Display the generated orders table.
orders

Unnamed: 0,order_id,customer_id,order_date,order_amount,platform,category
0,1,1,2020-01-01,816.37,mobile,аккумуляторы
1,2,1,2020-01-01,4323.67,mobile,телефоны
2,3,3,2020-01-01,503.03,mobile,телефоны
3,4,4,2020-01-01,2999.85,desktop,аккумуляторы
4,5,4,2020-01-01,3505.21,mobile,телефоны
...,...,...,...,...,...,...
110874,110875,3646,2020-12-30,714.81,desktop,наушники
110875,110876,3649,2020-12-30,1551.27,desktop,телефоны
110876,110877,3649,2020-12-30,1966.40,desktop,телефоны
110877,110878,3649,2020-12-30,565.23,mobile,аккумуляторы


In [9]:
# Spot-check: all orders placed by the customer with id 3.
orders[orders['customer_id'] == 3]

Unnamed: 0,order_id,customer_id,order_date,order_amount,platform,category
2,3,3,2020-01-01,503.03,mobile,телефоны
156,157,3,2020-01-06,4348.64,desktop,телефоны
157,158,3,2020-01-06,2164.81,desktop,аксессуары
354,355,3,2020-01-09,3168.26,desktop,аккумуляторы
355,356,3,2020-01-09,2294.24,mobile,аккумуляторы
787,788,3,2020-01-14,2207.48,mobile,аккумуляторы
788,789,3,2020-01-14,1942.55,desktop,наушники
1213,1214,3,2020-01-18,3240.34,desktop,телефоны
1214,1215,3,2020-01-18,1637.07,mobile,наушники
1215,1216,3,2020-01-18,1815.04,desktop,аккумуляторы


In [14]:
# Earliest order date for every customer that placed at least one order.
orders.groupby('customer_id', as_index=False).agg(order_date=('order_date', 'min'))

Unnamed: 0,customer_id,order_date
0,1,2020-01-01
1,2,2020-01-02
2,3,2020-01-01
3,4,2020-01-01
4,5,2020-01-01
...,...,...
3633,3640,2020-12-30
3634,3641,2020-12-30
3635,3643,2020-12-30
3636,3646,2020-12-30
