## Data Generation

Generates the simulated dataset, using summary statistics (means and covariances) taken from the actual data.

In [None]:
import pandas as pd
import numpy as np
from datetime import date

In [None]:
def generate_data(num_samples=200000, seed=None):
    """Generate a simulated sales dataset.

    Rows are drawn from a multivariate normal distribution over
    (sold, quantity, price, reduction), rounded to whole numbers and
    filtered to plausible values. Synthetic ``date``, ``reduction``,
    ``store`` and ``days`` columns are then attached.

    Parameters
    ----------
    num_samples : int, default 200000
        Number of raw rows to draw *before* filtering; the returned frame
        is smaller because implausible rows are dropped.
    seed : None, int or numpy.random.Generator, default None
        Passed to ``numpy.random.default_rng``. Every random draw in this
        function uses the resulting generator, so a fixed seed yields a
        reproducible dataset. ``None`` keeps the original non-deterministic
        behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns: ``sold``, ``quantity``, ``price``, ``reduction``, ``date``,
        ``store``, ``days``. The index keeps the labels of the surviving raw
        rows (it is not reset).
    """
    # A single generator drives every random step so that `seed` controls
    # the whole pipeline.
    rng = np.random.default_rng(seed)

    # The desired mean values of the sample.
    mu = np.array([4.689133, 6.220279, 74.026988, 44.740914])

    # The desired covariance matrix.
    # NOTE(review): this matrix is not positive semi-definite (the
    # sold/quantity 2x2 minor has a negative determinant), so NumPy emits a
    # RuntimeWarning and the drawn samples cannot reproduce it exactly.
    # Values are left as-is; confirm them against the real data.
    r = np.array(
        [
            #  sold    quantity   price  reduction
            [7.035221, 38.483902, -42.132574, 8.941785],
            [38.483902, 63.069710, -44.900212, 9.328755],
            [-42.132574, -44.900212, 5117.449392, -0.615325],
            [8.941785, 9.328755, -0.615325, 100.391853],
        ]
    )

    # Generate the random samples.
    y = rng.multivariate_normal(mu, r, size=num_samples)

    df = pd.DataFrame(y, columns=["sold", "quantity", "price", "reduction"])
    df = df.round(0)

    # Keep only plausible rows. `.copy()` makes the result an independent
    # frame so the column assignments below never act on a view.
    keep = (
        (df["sold"] >= 0)
        & (df["quantity"] > 0)
        & (df["quantity"] < 7)
        & (df["price"] > 10)
        & (df["quantity"] >= df["sold"])
    )
    df = df[keep].copy()

    num_samples = len(df)
    print(f"RECORDS: {num_samples}")

    def pick(count):
        """Return the index labels of `count` rows sampled without replacement."""
        return df.sample(count, random_state=rng).index

    # One uniformly random calendar day per row. Drawing directly avoids a
    # fixed-size pool of dates and is correct for any row count, including 0.
    calendar = pd.date_range(start="1/1/2021", end="4/1/2022").to_numpy()
    df["date"] = rng.choice(calendar, size=num_samples)

    # The sampled reduction is discarded: default to 50, then overwrite random
    # subsets with 30 and 40 (later assignments win where the subsets overlap).
    df["reduction"] = 50
    df.loc[pick(num_samples // 4), "reduction"] = 30
    df.loc[pick(num_samples // 10), "reduction"] = 40

    # Scale quantity and sold together on random subsets to create larger
    # lots; the subsets are drawn independently, so factors can compound.
    for count, factor in (
        (num_samples // 10, 2),
        (int(num_samples * 0.02), 5),
        (int(num_samples * 0.001), 10),
    ):
        rows = pick(count)
        df.loc[rows, ["quantity", "sold"]] *= factor

    # Store label MAG_1 .. MAG_4 (upper bound of `integers` is exclusive).
    df["store"] = rng.integers(1, 5, size=num_samples)
    df["store"] = "MAG_" + df["store"].astype(str)

    # Days start in 3..9 and are then overwritten on random subsets, in this
    # order, so later values win where the subsets overlap.
    df["days"] = rng.integers(3, 10, size=num_samples)
    for fraction, value in ((0.2, 2), (0.3, 0), (0.4, 1)):
        df.loc[pick(int(num_samples * fraction)), "days"] = value

    # Rows with more than two days are treated as fully sold.
    sold_out = df["days"] > 2
    df.loc[sold_out, "sold"] = df.loc[sold_out, "quantity"]

    return df

In [None]:
# Build the simulated dataset, show its (rows, columns) shape, and preview
# ten randomly chosen rows as the cell's output.
df = generate_data()
display(df.shape)  # presumably IPython's notebook `display` helper
df.sample(10)

In [None]:
# Write the generated data to a CSV file, omitting the DataFrame index.
url = "markdown.csv"  # a relative file path, despite the variable name
df.to_csv(url, index=False)