In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [20]:
def get_duration(df, origin='2024-01-01'):
    """Return the seconds elapsed between ``origin`` and each row's ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column.
    origin : str or datetime-like, default '2024-01-01'
        Reference instant, parsed with ``pd.to_datetime``. The default
        keeps the reference point this function originally hard-coded.

    Returns
    -------
    pandas.Series
        Float seconds since ``origin``; negative for dates before it.
    """
    elapsed = df['date'] - pd.to_datetime(origin)
    return elapsed.dt.total_seconds()

In [21]:
# Seed the global NumPy generator so the random values (and every metric
# derived from them) are the same on each Restart & Run All.
np.random.seed(42)

# One timestamp per minute, both endpoints included.
dates = pd.date_range(start='2024-01-01', end='2024-01-02', freq='min')
# Integers drawn uniformly from [0, 100): no relationship to time at all.
values = np.random.randint(low=0, high=100, size=len(dates))

df = pd.DataFrame({'date': dates, 'value': values})
df['duration'] = get_duration(df)

df.head(3)

Unnamed: 0,date,value,duration
0,2024-01-01 00:00:00,77,0.0
1,2024-01-01 00:01:00,46,60.0
2,2024-01-01 00:02:00,67,120.0


In [17]:
def linear_model(df, test_size=0.2, random_state=42):
    """Fit ``value ~ duration`` with ordinary least squares and print the fit quality.

    The frame is split into train and test parts, a ``LinearRegression`` is
    fitted on the train part, and the mean squared error on the test part is
    printed next to the variance of the whole ``value`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``duration`` column (the single feature) and
        a numeric ``value`` column (the target).
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
        The result is reported on stdout only.
    """
    # scikit-learn expects a 2-D feature matrix, hence the (n, 1) reshape.
    X = df['duration'].values.reshape(-1, 1)
    y = df['value']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    # The target's variance is the yardstick: always predicting the mean
    # would score an MSE close to it, so a useful model must land well below.
    # Note this is the sample variance (pandas default ddof=1) of the full
    # column, not of the test split only.
    var = df['value'].var()

    print(f'MSE: {mse} vs Var: {var}')

In [18]:
# Baseline run: when `value` is random integers unrelated to `duration`,
# the printed MSE is expected to be about as large as the variance.
linear_model(df)

MSE: 837.2139846257983 vs Var: 819.0981561801218


In [22]:
def linear_function(x):
    """Evaluate the line with slope 2 and intercept 10 at ``x``.

    Works element-wise on NumPy arrays as well as on plain numbers.
    """
    slope, intercept = 2, 10
    return slope * x + intercept

# Noise-free target that is exactly linear in the sample index; the
# regression should recover it with an MSE of essentially zero.
values = linear_function(np.arange(len(dates)))

# `assign` calls get_duration on the freshly built frame, appending the
# `duration` column after `date` and `value`.
df = pd.DataFrame({'date': dates, 'value': values}).assign(duration=get_duration)

linear_model(df)

MSE: 1.3901246820109464e-26 vs Var: 692640.6666666666


In [25]:
def simulate_user_behavior(dates, rng=None):
    """Simulate a noisy activity series over ``dates``.

    The result is the sum of a constant offset (10), a linear trend
    (0.5 per sample), two sinusoidal components (amplitude 5 each) and
    Gaussian noise (standard deviation 3).

    Parameters
    ----------
    dates : pandas.DatetimeIndex
        Timestamps to simulate; ``.hour`` and ``.dayofyear`` are read from it.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of the noise. When omitted, the global ``np.random`` state is
        used, which is what this function did before ``rng`` existed.

    Returns
    -------
    array-like of float
        One simulated value per entry in ``dates``.
    """
    n_samples = len(dates)

    # Linear trend: grows by one per sample position, not per unit of time.
    trend = np.arange(n_samples)

    # Sine with a 24-hour period, driven by the hour of day only, so it is
    # constant within each hour. (Kept under its original name.)
    hourly_component = np.sin(2 * np.pi * dates.hour / 24)

    # Sine with a 365-day period, driven by the day of year. The divisor is
    # fixed at 365 regardless of leap years.
    daily_component = np.sin(2 * np.pi * dates.dayofyear / 365)

    # Standard-normal noise, from the injected generator when one is given.
    draw_normal = np.random.normal if rng is None else rng.normal
    noise = draw_normal(0, 1, n_samples)

    # Combine components
    user_behavior = 10 + 0.5 * trend + 5 * hourly_component + 5 * daily_component + 3 * noise

    return user_behavior

# Seed right before simulating so the noise, and therefore the printed MSE,
# is the same no matter which cells ran earlier or in what order.
np.random.seed(42)
values = simulate_user_behavior(dates)

df = pd.DataFrame({'date': dates, 'value': values})
df['duration'] = get_duration(df)

linear_model(df)

MSE: 12.552268314656468 vs Var: 42242.59794414516
