# Import Required Libraries
Import necessary libraries such as pandas, numpy, tensorflow, and keras.

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import gc
import matplotlib.pyplot as plt

# Load and Preprocess Data
Load the train_data.csv file and preprocess the data, including handling missing values and scaling features.

In [2]:
train_df = pd.read_csv('data/train_data.csv')
train_df.replace(-1.0, np.nan, inplace=True)
train_df.replace('-1', np.nan, inplace=True)

In [3]:
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
numeric_cols

Index(['che_pc_usd', 'che_perc_gdp', 'insurance_perc_che', 'population',
       'prev_perc', 'price_month', 'price_unit', 'public_perc_che', 'target'],
      dtype='object')

In [4]:
train_df.columns

Index(['brand', 'che_pc_usd', 'che_perc_gdp', 'cluster_nl', 'corporation',
       'country', 'launch_date', 'date', 'drug_id', 'ind_launch_date',
       'indication', 'insurance_perc_che', 'population', 'prev_perc',
       'price_month', 'price_unit', 'public_perc_che', 'therapeutic_area',
       'target'],
      dtype='object')

In [5]:
train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].median())

In [6]:
date_columns = ['launch_date', 'date', 'ind_launch_date']
for col in date_columns:
    train_df[col] = pd.to_datetime(train_df[col], errors='coerce')

In [7]:
from sklearn.preprocessing import LabelEncoder
label_enc_columns = ['brand', 'corporation', 'country', 'therapeutic_area', 'drug_id']
label_encoders = {}
for col in label_enc_columns:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    label_encoders[col] = le

## Rolling features

In [None]:
def create_aggregates(df, target='target'):
    """
    Creates aggregate statistics and adds them directly to the input dataframe.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe to be modified
    target : str, optional (default='target')
        Name of the target column to aggregate
    
    Returns:
    --------
    pandas.DataFrame
        Modified dataframe with added aggregate features
    """
    # Date features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month

    df['year_launch'] = df['launch_date'].dt.year
    df['month_launch'] = df['launch_date'].dt.month

    # # # Days since launch
    # df['days_since_launch'] = df['date'].dt.days - df['launch_date'].dt.days
    # # Daus since indication launch
    # df['days_since_ind_launch'] = df['date'].dt.days - df['ind_launch_date'].dt.days

    # Define the aggregation groups
    aggregation_groups = [
        ['country'],
        ['brand'], 
        ['drug_id'],
        ['country', 'brand'],
        ['brand', 'drug_id'],
        ['country', 'drug_id'],
        ['country', 'brand', 'drug_id']
    ]
    
    # Aggregation types
    agg_types = ['mean', 'median', 'std', 'min', 'max']
    
    # Iterate through different grouping combinations
    for group_columns in aggregation_groups:
        # Create a unique group name for column naming
        group_name = '_'.join(group_columns)
        
        # Compute aggregates
        if len(group_columns) == 1:
            # Simple groupby for single column
            grouped = df.groupby(group_columns)[target].agg(agg_types)
            grouped.columns = [f'{target}_{agg}_{group_name}' for agg in agg_types]
            
            # Map aggregates back to original dataframe
            for col in grouped.columns:
                df[col] = df[group_columns[0]].map(grouped[col])
        else:
            # Multi-column groupby
            grouped = df.groupby(group_columns)[target].agg(agg_types)
            grouped.columns = [f'{target}_{agg}_{group_name}' for agg in agg_types]
            
            # Map aggregates back to original dataframe
            for col in grouped.columns:
                df[col] = df[group_columns].apply(
                    lambda x: grouped.loc[tuple(x), col] if tuple(x) in grouped.index else np.nan, 
                    axis=1
                )
    
    return df


In [9]:
# train_df_rolling = create_features(train_df, label= 'target')
train_df_rolling = create_aggregates(train_df)

In [None]:
# save train_df
# train_df_rolling.to_csv('data/train_data_rolling.csv', index=False)

In [23]:
train_data = train_df_rolling[train_df_rolling['date'] < '2022-01-01']
test_data = train_df_rolling[train_df_rolling['date'] >= '2022-01-01']
# train_data = train_df[train_df['date'] < '2022-01-01']
# test_data = train_df[train_df['date'] >= '2022-01-01']

## Split Data

In [11]:
X_train = train_data.drop(['target', 'cluster_nl', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)
# X_train = train_data.drop(['target', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)
y_train = train_data['target']

X_test = test_data.drop(['target', 'cluster_nl', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)
# X_test = test_data.drop(['target', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)
y_test = test_data['target']

# Define Model Architecture
Define the architecture of the model using a suitable neural network for time series prediction.

# Train the Model
Train the model on the entire dataset without using cross-validation. Save the best model using ModelCheckpoint.

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [13]:
model = XGBRegressor(n_estimators=1000, n_jobs=-1, random_state=33)
model.fit(X_train, y_train, verbose=True) # Change verbose to True if you want to see it train

In [14]:
from catboost import CatBoostRegressor

model_cat = CatBoostRegressor(iterations=1000, 
                          learning_rate=0.03, 
                          depth=6, 
                          random_seed=33, 
                          verbose=100)  # Adjust verbosity as needed

model_cat.fit(X_train, y_train, verbose=True)  # Set `verbose` to control training logs


0:	learn: 1.0901024	total: 67ms	remaining: 1m 6s
1:	learn: 1.0646512	total: 84.4ms	remaining: 42.1s
2:	learn: 1.0405320	total: 99.9ms	remaining: 33.2s
3:	learn: 1.0164242	total: 121ms	remaining: 30.2s
4:	learn: 0.9936193	total: 137ms	remaining: 27.3s
5:	learn: 0.9711174	total: 151ms	remaining: 25.1s
6:	learn: 0.9492565	total: 163ms	remaining: 23.1s
7:	learn: 0.9282917	total: 174ms	remaining: 21.6s
8:	learn: 0.9081474	total: 183ms	remaining: 20.2s
9:	learn: 0.8886055	total: 193ms	remaining: 19.1s
10:	learn: 0.8701125	total: 203ms	remaining: 18.3s
11:	learn: 0.8521248	total: 213ms	remaining: 17.6s
12:	learn: 0.8340520	total: 226ms	remaining: 17.1s
13:	learn: 0.8166815	total: 237ms	remaining: 16.7s
14:	learn: 0.8000930	total: 247ms	remaining: 16.2s
15:	learn: 0.7836538	total: 257ms	remaining: 15.8s
16:	learn: 0.7673700	total: 270ms	remaining: 15.6s
17:	learn: 0.7519545	total: 285ms	remaining: 15.6s
18:	learn: 0.7374831	total: 296ms	remaining: 15.3s
19:	learn: 0.7231419	total: 306ms	remain

<catboost.core.CatBoostRegressor at 0x7f81301f7850>

In [15]:
from lightgbm import LGBMRegressor

model_lgbm = LGBMRegressor(n_estimators=1000, 
                      learning_rate=0.03, 
                      num_leaves=31, 
                      random_state=33)

model_lgbm.fit(X_train, y_train)  # Adjust verbosity with frequency

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9875
[LightGBM] [Info] Number of data points in the train set: 102587, number of used features: 52
[LightGBM] [Info] Start training from score 1.393553


In [16]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error: 0.2939
R² Score: 0.8738


In [17]:
y_pred = model_cat.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error: 0.3231
R² Score: 0.8612


In [18]:
y_pred = model_lgbm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error: 0.3238
R² Score: 0.8609


## Evaluate Model

In [19]:
from pathlib import Path
from typing import Tuple

def _CYME(df: pd.DataFrame) -> float:
    """ Compute the CYME metric, that is 1/2(median(yearly error) + median(monthly error))"""

    yearly_agg = df.groupby("cluster_nl")[["target", "prediction"]].sum().reset_index()
    yearly_error = abs((yearly_agg["target"] - yearly_agg["prediction"])/yearly_agg["target"]).median()

    monthly_error = abs((df["target"] - df["prediction"])/df["target"]).median()

    return 1/2*(yearly_error + monthly_error)


def _metric(df: pd.DataFrame) -> float:
    """Compute metric of submission.

    :param df: Dataframe with target and 'prediction', and identifiers.
    :return: Performance metric
    """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    # Split 0 actuals - rest
    zeros = df[df["zero_actuals"] == 1]
    recent = df[df["zero_actuals"] == 0]

    # weight for each group
    zeros_weight = len(zeros)/len(df)
    recent_weight = 1 - zeros_weight

    # Compute CYME for each group
    return round(recent_weight*_CYME(recent) + zeros_weight*min(1,_CYME(zeros)), 8)


def compute_metric(submission: pd.DataFrame) -> Tuple[float, float]:
    """Compute metric.

    :param submission: Prediction. Requires columns: ['cluster_nl', 'date', 'target', 'prediction']
    :return: Performance metric.
    """

    submission["date"] = pd.to_datetime(submission["date"])
    submission = submission[['cluster_nl', 'date', 'target', 'prediction', 'zero_actuals']]

    return _metric(submission)

In [20]:
validation = test_data.copy()

validation["prediction"] = model.predict(validation[X_train.columns])

# Assign column ["zero_actuals"] in the depending if in your
# split the cluster_nl has already had actuals on train or not
existing_clusters = train_data['cluster_nl'].unique()
validation['zero_actuals'] = (~validation['cluster_nl'].isin(existing_clusters)).astype(int)

print("Performance:", compute_metric(validation))

Performance: 0.02960951


In [21]:
validation = test_data.copy()

validation["prediction"] = model_cat.predict(validation[X_train.columns])

# Assign column ["zero_actuals"] in the depending if in your
# split the cluster_nl has already had actuals on train or not
existing_clusters = train_data['cluster_nl'].unique()
validation['zero_actuals'] = (~validation['cluster_nl'].isin(existing_clusters)).astype(int)

print("Performance:", compute_metric(validation))

Performance: 0.03224313


In [22]:
validation = test_data.copy()

validation["prediction"] = model_lgbm.predict(validation[X_train.columns])

# Assign column ["zero_actuals"] in the depending if in your
# split the cluster_nl has already had actuals on train or not
existing_clusters = train_data['cluster_nl'].unique()
validation['zero_actuals'] = (~validation['cluster_nl'].isin(existing_clusters)).astype(int)

print("Performance:", compute_metric(validation))

Performance: 0.02998016


# Save the Model
Evaluate the model's performance on a validation set or using other suitable metrics.

In [None]:
test_df = pd.read_csv('SUBMISSION/Data Files/submission_data.csv')
test_df.replace(-1.0, np.nan, inplace=True)
test_df.replace('-1', np.nan, inplace=True)

In [None]:
test_df[numeric_cols] = test_df[numeric_cols].fillna(test_df[numeric_cols].median())

In [None]:
date_columns = ['launch_date', 'date', 'ind_launch_date']
for col in date_columns:
    test_df[col] = pd.to_datetime(test_df[col], errors='coerce')

In [None]:
test_df['launch_year'] = test_df['launch_date'].dt.year
test_df['launch_month'] = test_df['launch_date'].dt.month
test_df['date_year'] = test_df['date'].dt.year
test_df['date_month'] = test_df['date'].dt.month

In [None]:
for col in label_enc_columns:
    le = label_encoders[col]
    test_df[col] = le.fit_transform(test_df[col].astype(str))
    label_encoders[col] = le

In [None]:
X_test = test_df.drop(['target', 'cluster_nl', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)

In [None]:
y_pred = model.predict(X_test)

In [None]:
test_df['date_str'] = test_df['date'].astype(str)

In [None]:
submission_data = pd.DataFrame({'date_str':test_df['date_str'], 'cluster_nl':test_df['cluster_nl'], 'prediction': y_pred})

In [None]:
submission_csv = pd.read_csv('SUBMISSION/submission_template.csv')

In [None]:
submission_csv.drop('prediction', axis=1, inplace=True)

In [None]:
submission_csv['date']

In [None]:
submission_csv = pd.merge(submission_csv, submission_data, left_on=['date', 'cluster_nl'], right_on=['date_str', 'cluster_nl'], how='left')