In [None]:
# !git clone https://github.com/dsa-playground/tvt2024.git
# %cd /content/tvt2024/
# !git pull
# !pip install -r requirements.txt -t "tvt2024"
# !pip install pyaml-env

In [None]:
# Imports
from faker import Faker
import pandas as pd
import numpy as np
import scipy.stats as stats
import plotly.graph_objects as go
import random
import queue
import plotly.graph_objects as go
from datetime import datetime
from sklearn.linear_model import LinearRegression
# import sys

# sys.path.append("../")

from scripts.preprocess.create_data import create_dataset, aggretate_data_for_workshop
from scripts.model.model import ts_train_test_split, AverageTrain, AveragePredict, MonthlyMovingAveringTrain, MonthlyMovingAveringPredict, LinearRegressionTrain, LinearRegressionPredict

In [None]:
# start_date='-5y'
# end_date=datetime(2026,1,1)
# fake = Faker('nl_NL')

# fake.date_between(start_date=start_date, end_date=end_date)

In [None]:
# ## Create dataset
# max_clients = 100
# df_data = create_dataset(n_clients=1000, max_clients=max_clients, start_date=datetime(2019,1,1), end_date=datetime(2026,1,1))

# df_agg = aggretate_data_for_workshop(df=df_data, max_clients=100)

# df_agg.to_csv('data/df_agg.csv', index=False, sep=";")

# display(df_agg.head())
# display(df_agg.tail())

In [None]:
# Load data
# Read the semicolon-separated ZZP dataset into `df`. The earlier read of
# 'data/df_agg.csv' into the same name was dropped: it was overwritten on the
# very next line, so it only cost an extra file read.
# NOTE(review): the plotting cell further down reads the columns 'zorgzwaarte',
# 'datum' and 'aantal_clienten' from `df` - confirm ZZP.csv provides them.
df = pd.read_csv('data/ZZP.csv', sep=";")

In [None]:
# Preview the first rows of `df` (DataFrame.head shows 5 rows by default).
df.head()

In [None]:
# Preview the last rows of `df` (DataFrame.tail shows 5 rows by default).
df.tail()

In [None]:
# Display key figures (kengetallen)

In [None]:
# Line chart with one trace per distinct 'zorgzwaarte' value, plotting
# 'aantal_clienten' against 'datum'. Traces are added in the order in which
# the values first appear in the frame.
fig = go.Figure()

for level in df['zorgzwaarte'].unique():
    # Rows belonging to this 'zorgzwaarte' value only
    level_rows = df.loc[df['zorgzwaarte'] == level]
    trace = go.Scatter(
        x=level_rows['datum'],
        y=level_rows['aantal_clienten'],
        mode='lines',
        name=str(level),
    )
    fig.add_trace(trace)

fig.show()

In [None]:
# Load data
# Read the aggregated dataset with explicit column types: 'zorgzwaarte' and
# 'aantal_clienten' as 32-bit integers, 'datum' parsed into datetimes.
# This replaces whatever `df` held before.
df = pd.read_csv(
    'data/df_agg.csv',
    sep=";",
    parse_dates=['datum'],
    dtype={'zorgzwaarte': 'int32', 'aantal_clienten': 'int32'},
)

In [None]:
# Keep only the rows with 'zorgzwaarte' equal to 8 and renumber them from 0.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an extra 'index' column in the result.
df_Xy = df.loc[df['zorgzwaarte'].eq(8)].reset_index().copy()

In [None]:
# Compute the train/test split in this cell before previewing it. On a fresh
# kernel (Restart & Run All) X_test does not exist yet at this point, because
# the modelling cell that defines it comes after this preview.
# NOTE(review): assumes ts_train_test_split does not modify df_Xy, so calling
# it again in the modelling cell gives the same result - confirm.
X_train, X_test, y_train, y_test = ts_train_test_split(df_Xy, cutoff_date='2023-05-14')
X_test.head()

In [None]:
# Boundaries of the periods shown in the figure. They are defined once so that
# the train/test split and the shaded bands can never drift apart.
CUTOFF_DATE = '2023-05-14'          # end of training period / start of evaluation period
EVALUATION_END_DATE = '2024-05-14'  # end of evaluation period / start of future period

X_train, X_test, y_train, y_test = ts_train_test_split(df_Xy, cutoff_date=CUTOFF_DATE)

## Train models
average = AverageTrain(X_train=X_train, y_train=y_train, window_size=None)
monthly_mean = MonthlyMovingAveringTrain(X_train=X_train, y_train=y_train)
LRmodel = LinearRegressionTrain(X_train=X_train, y_train=y_train)

## Results training period (predictions on the data the models were trained on)
y_preds_A_hist = AveragePredict(X_test=X_train, average=average)
y_preds_MA_hist = MonthlyMovingAveringPredict(X_test=X_train, monthly_mean=monthly_mean)
y_preds_LR_hist = LinearRegressionPredict(X_test=X_train, model=LRmodel)

## Predict future period (evaluation period & future period)
y_preds_A = AveragePredict(X_test=X_test, average=average)
y_preds_MA = MonthlyMovingAveringPredict(X_test=X_test, monthly_mean=monthly_mean)
y_preds_LR = LinearRegressionPredict(X_test=X_test, model=LRmodel)

## Make total DataFrame
# y_test runs up to and including '2024-05-14'
# (evaluation period: 2023-05-14 through 2024-05-14)

# Stack the training rows on top of the test rows so that actuals and every
# model's predictions share one date axis.
# NOTE(review): assumes X_train / X_test are pandas Series of dates whose
# combined length matches the concatenated arrays - confirm against
# ts_train_test_split.
df_end = pd.DataFrame({
    'datum': pd.concat([X_train, X_test]),
    'aantal_clienten': np.concatenate([y_train, y_test]),
    'Average': np.concatenate([y_preds_A_hist, y_preds_A]),
    'Moving Average': np.concatenate([y_preds_MA_hist, y_preds_MA]),
    'Linear Regression': np.concatenate([y_preds_LR_hist, y_preds_LR])
})

## Plot results
fig = go.Figure()

# One line per series: the actual client counts first, then each model.
# The column name doubles as the legend label.
for column in ['aantal_clienten', 'Average', 'Moving Average', 'Linear Regression']:
    fig.add_trace(go.Scatter(
        x=df_end['datum'],
        y=df_end[column],
        mode='lines',
        name=column))


def _period_band(x0, x1, fillcolor, opacity):
    """Return a layout shape that shades the date range x0..x1 over the full plot height.

    x0 and x1 are positions on the x axis (xref="x"); the rectangle spans the
    whole vertical extent of the plotting area (yref="paper", y from 0 to 1)
    and is drawn below the traces without an outline.
    """
    return dict(
        type="rect",
        xref="x",
        yref="paper",
        x0=x0,
        y0=0,
        x1=x1,
        y1=1,
        fillcolor=fillcolor,
        opacity=opacity,
        layer="below",
        line_width=0,
    )


# Highlight the three periods: training, evaluation and future.
fig.update_layout(
    shapes=[
        _period_band(df_end['datum'].min(), CUTOFF_DATE, "limegreen", 0.1),
        _period_band(CUTOFF_DATE, EVALUATION_END_DATE, "LightSalmon", 0.4),
        _period_band(EVALUATION_END_DATE, df_end['datum'].max(), "white", 0.8),
    ]
)

fig.show()

In [None]:
from random import randint

# Average length of a month in days; used to turn a fractional number of
# months into a whole number of days.
DAYS_PER_MONTH = 365.25 / 12
# Minimum number of days between start and end date.
MIN_DURATION_DAYS = 31

# Generate a random start date: 2000-01-01 plus 0 to 365*20 days
start_date = pd.to_datetime('2000-01-01') + pd.DateOffset(days=randint(0, 365*20))

# Generate a random number from a normal distribution with a mu of 9 (months) and a std of 4 (months)
random_number = np.random.normal(loc=9, scale=4)

# Add the random duration to the start date to get the end date.
# pd.DateOffset(months=...) rejects non-integer months, so the fractional
# number of months is converted to whole days first.
end_date = start_date + pd.Timedelta(days=int(round(random_number * DAYS_PER_MONTH)))

# Enforce the minimum duration (this also covers negative draws, which would
# otherwise put the end date before the start date)
if (end_date - start_date).days < MIN_DURATION_DAYS:
    end_date = start_date + pd.DateOffset(days=MIN_DURATION_DAYS)

# Create a weighted choice for the month: December, January and July are twice
# as likely as the other months. The raw weights add up to 1.5, so they are
# normalised because np.random.choice requires probabilities that sum to 1.
months = list(range(1, 13))
raw_weights = [0.2 if month in (12, 1, 7) else 0.1 for month in months]
total_weight = sum(raw_weights)
weights = [weight / total_weight for weight in raw_weights]
chosen_month = int(np.random.choice(months, p=weights))

# Set the month of the end date to the chosen month. The day is clamped to the
# last day of that month, otherwise e.g. day 31 moved into February raises.
last_day_of_month = pd.Timestamp(year=end_date.year, month=chosen_month, day=1).days_in_month
end_date = end_date.replace(month=chosen_month, day=min(end_date.day, last_day_of_month))

# Changing the month can move the end date back to before the minimum
# duration (or even before the start date). Shifting it one year forward keeps
# the chosen month and restores the minimum.
if (end_date - start_date).days < MIN_DURATION_DAYS:
    end_date = end_date + pd.DateOffset(years=1)

print(start_date, end_date)