In [1]:
import numpy as np
import pandas as pd

In [2]:
np.random.seed(42)  # reproducibility

In [7]:
import pandas as pd

df = pd.read_csv("data/SuperStoreOrders.csv", parse_dates=["order_date","ship_date"])

# Ensure they are datetime (keep them in datetime format)
df['order_date'] = pd.to_datetime(df['order_date'], format='%m/%d/%Y', errors='coerce')
df['ship_date'] = pd.to_datetime(df['ship_date'], format='%m/%d/%Y', errors='coerce')

# Fill missing with a default datetime instead of string
df['order_date'] = df['order_date'].fillna(pd.Timestamp("1900-01-01"))
df['ship_date'] = df['ship_date'].fillna(pd.Timestamp("1900-01-01"))

# Derive configs dynamically
products = df["product_id"].unique()
regions = df["region"].unique()
states = df["state"].unique()
segments = df["segment"].unique()

# Convert to weekly periods for alignment
df["week"] = df["order_date"].dt.to_period('W').dt.start_time
weeks = df["week"].unique()


In [None]:
# -------------------------------
# 1. Inventory Table
# -------------------------------
inventory_data = []
for p in products:
    stock = np.random.randint(200,500)  # initial stock
    for w in weeks:
        sold = np.random.randint(10, 100)
        stock = max(0, stock - sold)
        reorder = np.random.choice([0,1], p=[0.8,0.2])  # 20% chance reorder
        if reorder:
            stock += np.random.randint(100, 400)
        inventory_data.append([
            p, w, stock, int(stock == 0), 100,
            np.random.choice([0,1], p=[0.9,0.1]) * np.random.randint(0, 7)
        ])

inventory_df = pd.DataFrame(inventory_data, 
    columns=["product_id","week","inventory_level","stockout_flag","reorder_point","supplier_delay_days"]
)

In [20]:
inventory_df.head()

Unnamed: 0,product_id,week,inventory_level,stockout_flag,reorder_point,supplier_delay_days
0,OFF-TEN-10000025,2010-12-27,447,0,100,0
1,OFF-TEN-10000025,2011-01-31,355,0,100,0
2,OFF-TEN-10000025,2011-02-28,322,0,100,0
3,OFF-TEN-10000025,2011-03-28,283,0,100,0
4,OFF-TEN-10000025,2011-04-25,241,0,100,0


In [14]:
# -------------------------------
# 2. Store Operations Table
# -------------------------------
store_ops = []
for s in states:
    for w in weeks:
        open_flag = np.random.choice([1,0], p=[0.9,0.1])
        reason = np.random.choice(["Holiday","Renovation","Strike","None"]) if open_flag == 0 else "None"
        store_ops.append([s, w, open_flag, reason, np.random.randint(6,12)])
        
store_ops_df = pd.DataFrame(store_ops, 
    columns=["state","week","store_open","closure_reason","working_hours"]
)

In [21]:
store_ops_df.head()

Unnamed: 0,state,week,store_open,closure_reason,working_hours
0,Constantine,2010-12-27,0,,9
1,Constantine,2011-01-31,1,,10
2,Constantine,2011-02-28,1,,10
3,Constantine,2011-03-28,1,,11
4,Constantine,2011-04-25,1,,11


In [22]:
# -------------------------------
# 3. Marketing & Promotions Table
# -------------------------------
marketing = []
for p in products:
    for w in weeks:
        campaign = np.random.choice([0,1], p=[0.7,0.3])
        discount = np.random.choice([0,1], p=[0.6,0.4])
        discount_pct = np.random.choice([0,10,20,30]) if discount else 0
        marketing.append([p, w, campaign, discount, discount_pct, np.random.randint(1000,5000), np.random.choice(["Online","Offline","Email"])])
        
marketing_df = pd.DataFrame(marketing, 
    columns=["product_id","week","marketing_campaign_active","discount_campaign","discount_percentage","ad_spend","channel"]
)

In [24]:
marketing_df.head() 

Unnamed: 0,product_id,week,marketing_campaign_active,discount_campaign,discount_percentage,ad_spend,channel
0,OFF-TEN-10000025,2010-12-27,0,0,0,2186,Email
1,OFF-TEN-10000025,2011-01-31,1,0,0,3315,Online
2,OFF-TEN-10000025,2011-02-28,0,1,30,4883,Offline
3,OFF-TEN-10000025,2011-03-28,1,0,0,2406,Offline
4,OFF-TEN-10000025,2011-04-25,0,0,0,4741,Offline


In [25]:
# -------------------------------
# 4. Competitor Activity Table
# -------------------------------
competitor = []
for r in regions:
    for w in weeks:
        comp_disc = np.random.choice([0,1], p=[0.7,0.3])
        competitor.append([r, w, comp_disc, np.random.choice([0,10,20,30]) if comp_disc else 0, np.random.choice([0,1], p=[0.9,0.1])])
        
competitor_df = pd.DataFrame(competitor, 
    columns=["region","week","competitor_discount_active","competitor_discount_percentage","competitor_new_product_launch"]
)


In [27]:
competitor_df.head()

Unnamed: 0,region,week,competitor_discount_active,competitor_discount_percentage,competitor_new_product_launch
0,Africa,2010-12-27,0,0,0
1,Africa,2011-01-31,1,0,0
2,Africa,2011-02-28,1,0,0
3,Africa,2011-03-28,1,20,0
4,Africa,2011-04-25,0,0,0


In [28]:
# -------------------------------
# 5. Logistics & Delivery Table
# -------------------------------
logistics = []
for r in regions:
    for w in weeks:
        issue = np.random.choice([0,1], p=[0.85,0.15])
        logistics.append([r, w, np.random.randint(0,5), issue, np.random.choice(["Strike","Weather","Traffic","None"]), np.random.uniform(0.8,1.0)])
        
logistics_df = pd.DataFrame(logistics, 
    columns=["region","week","avg_shipping_delay_days","logistics_issue_flag","issue_reason","delivery_success_rate"]
)

In [30]:
logistics_df.head()

Unnamed: 0,region,week,avg_shipping_delay_days,logistics_issue_flag,issue_reason,delivery_success_rate
0,Africa,2010-12-27,3,0,Weather,0.969359
1,Africa,2011-01-31,4,1,,0.860725
2,Africa,2011-02-28,3,1,Traffic,0.867432
3,Africa,2011-03-28,2,0,Weather,0.803955
4,Africa,2011-04-25,0,0,Weather,0.979906


In [31]:
# -------------------------------
# 6. Customer Behavior Table
# -------------------------------
customers = []
for seg in segments:
    for w in weeks:
        customers.append([seg, w, np.random.uniform(3.5,5.0), np.random.uniform(0.01,0.2), np.random.uniform(0.01,0.15), np.random.randint(0,20)])
        
customers_df = pd.DataFrame(customers, 
    columns=["customer_segment","week","avg_customer_rating","return_rate","churn_rate","complaints_count"]
)

In [33]:
customers_df.head()

Unnamed: 0,customer_segment,week,avg_customer_rating,return_rate,churn_rate,complaints_count
0,Consumer,2010-12-27,4.447447,0.171193,0.078095,4
1,Consumer,2011-01-31,3.919547,0.073779,0.133497,0
2,Consumer,2011-02-28,3.797989,0.011782,0.031556,7
3,Consumer,2011-03-28,4.145188,0.068613,0.029119,16
4,Consumer,2011-04-25,4.781516,0.109156,0.058021,11


In [34]:
# -------------------------------
# 7. External Factors Table
# -------------------------------
external = []
weather_conditions = ["Sunny","Rainy","Storm","Cold","Hot"]
for r in regions:
    for w in weeks:
        holiday = np.random.choice([0,1], p=[0.8,0.2])
        external.append([r, w, holiday, np.random.choice(["Christmas","Labor Day","Independence Day","None"]) if holiday else "None", np.random.choice(weather_conditions), np.random.randint(0,40), np.random.uniform(0.5,1.5)])
        
external_df = pd.DataFrame(external, 
    columns=["region","week","holiday_flag","holiday_name","weather_condition","temperature","economic_index"]
)

In [36]:
external_df.head()

Unnamed: 0,region,week,holiday_flag,holiday_name,weather_condition,temperature,economic_index
0,Africa,2010-12-27,0,,Storm,9,1.379674
1,Africa,2011-01-31,0,,Cold,5,0.79104
2,Africa,2011-02-28,0,,Sunny,15,1.49655
3,Africa,2011-03-28,0,,Cold,12,1.137006
4,Africa,2011-04-25,0,,Sunny,16,1.004915


In [37]:
# -------------------------------
# 8. Pricing & Product Table
# -------------------------------
pricing = []
for p in products:
    base_price = np.random.randint(50,500)
    for w in weeks:
        price_change = np.random.choice([0,1], p=[0.85,0.15])
        pct = np.random.choice([0,5,10,20]) if price_change else 0
        pricing.append([p, w, base_price, price_change, pct, np.random.choice([0,1], p=[0.95,0.05]), np.random.choice([0,1], p=[0.98,0.02])])
        
pricing_df = pd.DataFrame(pricing, 
    columns=["product_id","week","base_price","price_change_flag","price_change_percentage","new_version_launch","discontinued_flag"]
)


In [39]:
pricing_df.head()

Unnamed: 0,product_id,week,base_price,price_change_flag,price_change_percentage,new_version_launch,discontinued_flag
0,OFF-TEN-10000025,2010-12-27,284,0,0,0,0
1,OFF-TEN-10000025,2011-01-31,284,0,0,0,0
2,OFF-TEN-10000025,2011-02-28,284,0,0,0,0
3,OFF-TEN-10000025,2011-03-28,284,0,0,0,0
4,OFF-TEN-10000025,2011-04-25,284,0,0,0,0


In [41]:
import os

# Create folder if it doesn't exist
os.makedirs("synthetic_data", exist_ok=True)

tables = {
    "inventory.csv": inventory_df,
    "store_operations.csv": store_ops_df,
    "marketing.csv": marketing_df,
    "competitor.csv": competitor_df,
    "logistics.csv": logistics_df,
    "customer_behavior.csv": customers_df,
    "external_factors.csv": external_df,
    "pricing.csv": pricing_df
}

# Save each dataframe to CSV
for filename, df in tables.items():
    df.to_csv(f"synthetic_data/{filename}", index=False)

print("✅ All synthetic data saved to 'synthetic_data/' folder.")

✅ All synthetic data saved to 'synthetic_data/' folder.
