In [31]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [32]:
# Load the training set and the submission set from their parquet files.
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../Data/Data files/train_data.parquet',
        '../Data/Data files/submission_data.parquet',
    )
)

In [33]:
# Derive calendar features (day, month, year) from the datetime 'date' column.
# The columns are added to train_df itself, in this order.
date_accessor = train_df['date'].dt
for part in ('day', 'month', 'year'):
    train_df[part] = getattr(date_accessor, part)

# Feature matrix (X) and target vector (y)
feature_cols = ['country', 'brand', 'day', 'month', 'year']
X_df = train_df[feature_cols]
y_df = train_df['phase']

# Year-based split: rows from 2020 are used for training,
# rows from 2021 onward are held out for testing.
train_mask = X_df['year'] == 2020
test_mask = X_df['year'] >= 2021
X_train_df, y_train = X_df[train_mask], y_df[train_mask]
X_test_df, y_test = X_df[test_mask], y_df[test_mask]

In [34]:
# Columns that get one-hot encoded vs. numeric date parts that pass through unchanged.
categorical_cols = ['country', 'brand']
date_cols = ['day', 'month', 'year']

# handle_unknown='ignore' keeps transform() from raising on categories not seen during fit.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder on the training rows only and rebuild a labelled frame.
X_train_encoded = encoder.fit_transform(X_train_df[categorical_cols])
encoded_col_names = encoder.get_feature_names_out(categorical_cols)
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_col_names)
# The encoded frame has a fresh 0..n-1 index, so the date columns are re-indexed
# the same way before concatenating side by side.
X_train_combined = pd.concat(
    [X_train_encoded_df, X_train_df[date_cols].reset_index(drop=True)],
    axis=1,
)

# Apply the already-fitted encoder to the test rows (no refit).
X_test_encoded = encoder.transform(X_test_df[categorical_cols])
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_col_names)
X_test_combined = pd.concat(
    [X_test_encoded_df, X_test_df[date_cols].reset_index(drop=True)],
    axis=1,
)


In [35]:
# Random forest settings: fixed seed for reproducibility, progress logging on,
# trees fitted on 6 parallel workers.
rf_params = dict(n_estimators=50, min_samples_leaf=4, random_state=42, verbose=1, n_jobs=6)
model = RandomForestRegressor(**rf_params)
model.fit(X_train_combined, y_train)


[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.0min
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:  1.3min finished


In [37]:
# Score the held-out rows and report the plain (unweighted) mean squared error.
predictions = model.predict(X_test_combined)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error: ", mse)

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


Mean Squared Error:  0.004227883160163199


[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.3s
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:    0.5s finished


In [36]:
def metric(df: pd.DataFrame) -> float:
    """Compute the quarter- and volume-weighted RMSE of the phasing predictions.

    The squared error of every row is weighted by a quarter weight
    (Q1: 1, Q2: 0.75, Q3: 0.66, otherwise: 0.5) and by the row's 'monthly'
    value, averaged over all rows, square-rooted and rounded to 8 decimals.

    :param df: Dataframe with target ('phase'), 'prediction', 'monthly', 'date'
        (anything ``pd.to_datetime`` accepts) and the identifiers 'brand' and
        'country'. The input frame is not modified.
    :return: Performance metric
    :raises AssertionError: If 'monthly', 'phase' or 'prediction' is missing, or
        if the predictions of any year-month-brand-country group do not sum to 1.
    """
    df = df.copy()
    assert 'monthly' in df.columns, "Missing 'monthly' column, only available in the train set"
    assert 'phase' in df.columns, "Missing 'phase' column, only available in the train set"
    assert 'prediction' in df.columns, "Missing 'prediction' column with predictions"

    df["date"] = pd.to_datetime(df["date"])

    # create datetime columns
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['quarter'] = df['date'].dt.quarter

    # Sum of phasing country-brand-month = 1
    # The string alias 'sum' is used instead of the Python builtin: passing the
    # builtin to transform() is deprecated in recent pandas and emits a FutureWarning.
    df['sum_pred'] = df.groupby(['year', 'month', 'brand', 'country'])['prediction'].transform('sum')
    assert np.isclose(df['sum_pred'], 1.0, rtol=1e-04).all(), "Condition phasing year-month-brand-country must sum 1 is not fulfilled"

    # define quarter weights (anything that is not Q1-Q3 falls back to 0.5)
    quarter = df['quarter']
    df['quarter_w'] = np.select(
        [quarter == 1, quarter == 2, quarter == 3],
        [1, 0.75, 0.66],
        default=0.5,
    )

    # compute and return metric
    # Vectorised Series.sum() replaces the builtin sum(), which iterated row by row.
    weighted_sq_err = ((df['phase'] - df['prediction']) ** 2) * df['quarter_w'] * df['monthly']
    return round(np.sqrt((1 / len(df)) * weighted_sq_err.sum()), 8)

In [46]:
# Re-read the training data from disk.
train_df = pd.read_parquet('../Data/Data files/train_data.parquet')
# Select the hold-out rows with the same condition used to build y_test / X_test_df
# (year >= 2021). Filtering on year == 2021 only would leave 'monthly' and 'date'
# as NaN/NaT for any later-year rows once the columns are aligned on the index.
df_train = train_df[train_df['date'].dt.year >= 2021]

# `predictions` is a plain array in the row order of y_test; wrapping it in a Series
# keyed by y_test's index makes every column align on the same row labels.
df_val_predictions = pd.DataFrame({
    'phase': y_test,
    'prediction': pd.Series(predictions, index=y_test.index),
    'monthly': df_train['monthly'],
    'date': df_train['date'],
    'brand': X_test_df['brand'],
    'country': X_test_df['country'],
})

In [49]:
# Evaluate the validation predictions with the competition metric and report it.
performance_metric = metric(df_val_predictions)
print(f'Performance Metric: {performance_metric}')

Performance Metric: 0.01150094
