# Import Required Libraries
Import necessary libraries such as pandas, numpy, tensorflow, and keras.

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load and Preprocess Data
Load the train_data.csv file and preprocess the data, including handling missing values and scaling features.

In [2]:
train_df = pd.read_csv('data/train_data.csv')
train_df.replace(-1.0, np.nan, inplace=True)
train_df.replace('-1', np.nan, inplace=True)

In [3]:
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
numeric_cols

Index(['che_pc_usd', 'che_perc_gdp', 'insurance_perc_che', 'population',
       'prev_perc', 'price_month', 'price_unit', 'public_perc_che', 'target'],
      dtype='object')

In [4]:
train_df.columns

Index(['brand', 'che_pc_usd', 'che_perc_gdp', 'cluster_nl', 'corporation',
       'country', 'launch_date', 'date', 'drug_id', 'ind_launch_date',
       'indication', 'insurance_perc_che', 'population', 'prev_perc',
       'price_month', 'price_unit', 'public_perc_che', 'therapeutic_area',
       'target'],
      dtype='object')

In [5]:
train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].median())

In [6]:
date_columns = ['launch_date', 'date', 'ind_launch_date']
for col in date_columns:
    train_df[col] = pd.to_datetime(train_df[col], errors='coerce')

In [7]:
label_enc_columns = ['brand', 'corporation', 'country', 'therapeutic_area', 'drug_id']
label_encoders = {}
for col in label_enc_columns:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    label_encoders[col] = le

In [8]:
train_df.head()

Unnamed: 0,brand,che_pc_usd,che_perc_gdp,cluster_nl,corporation,country,launch_date,date,drug_id,ind_launch_date,indication,insurance_perc_che,population,prev_perc,price_month,price_unit,public_perc_che,therapeutic_area,target
0,113,1.209114,1.665879,BRAND_354E_COUNTRY_88A3,116,28,2014-06-01,2014-06-01,121,NaT,['IND_C3B6'],1.893333,1.008039,0.028367,1.006444,1.013784,1.835821,10,1.000784
1,223,1.472378,1.753338,BRAND_626D_COUNTRY_8B47,0,30,2014-06-01,2014-06-01,223,2014-09-01,"['IND_1590', 'IND_ECAC']",1.546667,1.023562,4.7e-05,1.121036,1.626677,1.835821,9,1.0
2,155,1.209114,1.665879,BRAND_45D9_COUNTRY_88A3,28,28,2014-06-01,2014-06-01,236,NaT,['IND_B2EF'],1.893333,1.008039,0.001502,1.121036,3.144874,1.835821,9,1.002258
3,489,1.85128,2.05177,BRAND_D724_COUNTRY_445D,55,13,2014-06-01,2014-06-01,25,NaT,['IND_BAFB'],1.0,1.253186,0.001304,1.121036,1.213446,1.80597,7,1.068761
4,161,1.791199,2.05913,BRAND_4887_COUNTRY_D8B0,34,43,2014-06-01,2014-06-01,149,NaT,['IND_3F31'],2.013333,1.639352,0.054467,1.018589,1.008708,1.880597,7,1.036312


## Rolling features

In [9]:
def create_features(df, window_size = 3, label=None):
    """
    Creates time series features from datetime index
    """
    # Extract features from 'launch_date'
    df['launch_year'] = df['launch_date'].dt.year
    df['launch_month'] = df['launch_date'].dt.month
    df['launch_day'] = df['launch_date'].dt.day
    df['launch_dayofweek'] = df['launch_date'].dt.dayofweek  # 0 = Monday
    df['launch_is_weekend'] = df['launch_dayofweek'] >= 5

    # Extract features from 'date'
    df['date_year'] = df['date'].dt.year
    df['date_month'] = df['date'].dt.month
    df['date_day'] = df['date'].dt.day
    df['date_dayofweek'] = df['date'].dt.dayofweek
    df['date_is_weekend'] = df['date_dayofweek'] >= 5

    # Remove column target frmo numeric_cols
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if label in numeric_cols:
        numeric_cols = numeric_cols.drop(label)

    # Create rolling features
    for col in numeric_cols:
        df[f'{col}_rolling_mean'] = df[col].rolling(window=window_size).mean()
        df[f'{col}_rolling_sum'] = df[col].rolling(window=window_size).sum()
        df[f'{col}_rolling_std'] = df[col].rolling(window=window_size).std()
        df[f'{col}_rolling_min'] = df[col].rolling(window=window_size).min()
        df[f'{col}_rolling_max'] = df[col].rolling(window=window_size).max()
        df[f'{col}_lag_1'] = df[col].shift(1)
        df[f'{col}_lag_2'] = df[col].shift(2)
        df[f'{col}_lag_3'] = df[col].shift(3)

    # Interaction features
    df['launch_month_times_date_month'] = df['launch_month'] * df['date_month']

    return df

In [10]:
train_df_rolling = create_features(train_df, label= 'target')

  df[f'{col}_lag_3'] = df[col].shift(3)
  df[f'{col}_rolling_mean'] = df[col].rolling(window=window_size).mean()
  df[f'{col}_rolling_sum'] = df[col].rolling(window=window_size).sum()
  df[f'{col}_rolling_std'] = df[col].rolling(window=window_size).std()
  df[f'{col}_rolling_min'] = df[col].rolling(window=window_size).min()
  df[f'{col}_rolling_max'] = df[col].rolling(window=window_size).max()
  df[f'{col}_lag_1'] = df[col].shift(1)
  df[f'{col}_lag_2'] = df[col].shift(2)
  df[f'{col}_lag_3'] = df[col].shift(3)
  df[f'{col}_rolling_mean'] = df[col].rolling(window=window_size).mean()
  df[f'{col}_rolling_sum'] = df[col].rolling(window=window_size).sum()
  df[f'{col}_rolling_std'] = df[col].rolling(window=window_size).std()
  df[f'{col}_rolling_min'] = df[col].rolling(window=window_size).min()
  df[f'{col}_rolling_max'] = df[col].rolling(window=window_size).max()
  df[f'{col}_lag_1'] = df[col].shift(1)
  df[f'{col}_lag_2'] = df[col].shift(2)
  df[f'{col}_lag_3'] = df[col].shift(3)
  df[f

In [12]:
def create_aggregates(df, group_columns, target='target'):
    """
    Creates aggregate statistics (mean, median, std, min, max) for launches by specified group_columns.
    """
    # Group by the specified columns and compute aggregate statistics for the target variable
    grouped = df.groupby(group_columns)[target].agg(['mean', 'median', 'std', 'min', 'max'])

    # Flatten the multi-level columns and create new column names
    grouped.columns = [f'{target}_{agg}' for agg in grouped.columns]

    # Return the aggregates (they will be joined to the original dataframe later)
    return grouped

# 1. Aggregate by 'country'
df_country = create_aggregates(train_df, group_columns=['country'], target='target')

# 2. Aggregate by 'Brand'
df_brand = create_aggregates(train_df, group_columns=['brand'], target='target')

# 3. Aggregate by 'Drug_id'
df_drug_id = create_aggregates(train_df, group_columns=['drug_id'], target='target')

# 4. Aggregate by 'Country + Brand'
df_country_brand = create_aggregates(train_df, group_columns=['country', 'brand'], target='target')

# 5. Aggregate by 'Brand + Drug_id'
df_brand_drug_id = create_aggregates(train_df, group_columns=['brand', 'drug_id'], target='target')

# 6. Aggregate by 'Country + Drug_id'
df_country_drug_id = create_aggregates(train_df, group_columns=['country', 'drug_id'], target='target')

# 7. Aggregate by 'Country + Brand + Drug_id'
df_country_brand_drug_id = create_aggregates(train_df, group_columns=['country', 'brand', 'drug_id'], target='target')

# Map the aggregates into the original dataframe

train_df['mean_country'] = train_df['country'].map(df_country['target_mean'])
train_df['median_country'] = train_df['country'].map(df_country['target_median'])
train_df['std_country'] = train_df['country'].map(df_country['target_std'])
train_df['min_country'] = train_df['country'].map(df_country['target_min'])
train_df['max_country'] = train_df['country'].map(df_country['target_max'])

train_df['mean_brand'] = train_df['brand'].map(df_brand['target_mean'])
train_df['median_brand'] = train_df['brand'].map(df_brand['target_median'])
train_df['std_brand'] = train_df['brand'].map(df_brand['target_std'])
train_df['min_brand'] = train_df['brand'].map(df_brand['target_min'])
train_df['max_brand'] = train_df['brand'].map(df_brand['target_max'])

train_df['mean_drug_id'] = train_df['drug_id'].map(df_drug_id['target_mean'])
train_df['median_drug_id'] = train_df['drug_id'].map(df_drug_id['target_median'])
train_df['std_drug_id'] = train_df['drug_id'].map(df_drug_id['target_std'])
train_df['min_drug_id'] = train_df['drug_id'].map(df_drug_id['target_min'])
train_df['max_drug_id'] = train_df['drug_id'].map(df_drug_id['target_max'])

train_df['mean_country_brand'] = train_df[['country', 'brand']].apply(lambda x: df_country_brand.loc[tuple(x), 'target_mean'] if tuple(x) in df_country_brand.index else np.nan, axis=1)
train_df['median_country_brand'] = train_df[['country', 'brand']].apply(lambda x: df_country_brand.loc[tuple(x), 'target_median'] if tuple(x) in df_country_brand.index else np.nan, axis=1)
train_df['std_country_brand'] = train_df[['country', 'brand']].apply(lambda x: df_country_brand.loc[tuple(x), 'target_std'] if tuple(x) in df_country_brand.index else np.nan, axis=1)
train_df['min_country_brand'] = train_df[['country', 'brand']].apply(lambda x: df_country_brand.loc[tuple(x), 'target_min'] if tuple(x) in df_country_brand.index else np.nan, axis=1)
train_df['max_country_brand'] = train_df[['country', 'brand']].apply(lambda x: df_country_brand.loc[tuple(x), 'target_max'] if tuple(x) in df_country_brand.index else np.nan, axis=1)

train_df['mean_brand_drug_id'] = train_df[['brand', 'drug_id']].apply(lambda x: df_brand_drug_id.loc[tuple(x), 'target_mean'] if tuple(x) in df_brand_drug_id.index else np.nan, axis=1)
train_df['median_brand_drug_id'] = train_df[['brand', 'drug_id']].apply(lambda x: df_brand_drug_id.loc[tuple(x), 'target_median'] if tuple(x) in df_brand_drug_id.index else np.nan, axis=1)
train_df['std_brand_drug_id'] = train_df[['brand', 'drug_id']].apply(lambda x: df_brand_drug_id.loc[tuple(x), 'target_std'] if tuple(x) in df_brand_drug_id.index else np.nan, axis=1)
train_df['min_brand_drug_id'] = train_df[['brand', 'drug_id']].apply(lambda x: df_brand_drug_id.loc[tuple(x), 'target_min'] if tuple(x) in df_brand_drug_id.index else np.nan, axis=1)
train_df['max_brand_drug_id'] = train_df[['brand', 'drug_id']].apply(lambda x: df_brand_drug_id.loc[tuple(x), 'target_max'] if tuple(x) in df_brand_drug_id.index else np.nan, axis=1)

train_df['mean_country_drug_id'] = train_df[['country', 'drug_id']].apply(lambda x: df_country_drug_id.loc[tuple(x), 'target_mean'] if tuple(x) in df_country_drug_id.index else np.nan, axis=1)
train_df['median_country_drug_id'] = train_df[['country', 'drug_id']].apply(lambda x: df_country_drug_id.loc[tuple(x), 'target_median'] if tuple(x) in df_country_drug_id.index else np.nan, axis=1)
train_df['std_country_drug_id'] = train_df[['country', 'drug_id']].apply(lambda x: df_country_drug_id.loc[tuple(x), 'target_std'] if tuple(x) in df_country_drug_id.index else np.nan, axis=1)
train_df['min_country_drug_id'] = train_df[['country', 'drug_id']].apply(lambda x: df_country_drug_id.loc[tuple(x), 'target_min'] if tuple(x) in df_country_drug_id.index else np.nan, axis=1)
train_df['max_country_drug_id'] = train_df[['country', 'drug_id']].apply(lambda x: df_country_drug_id.loc[tuple(x), 'target_max'] if tuple(x) in df_country_drug_id.index else np.nan, axis=1)

train_df['mean_country_brand_drug_id'] = train_df[['country', 'brand', 'drug_id']].apply(lambda x: df_country_brand_drug_id.loc[tuple(x), 'target_mean'] if tuple(x) in df_country_brand_drug_id.index else np.nan, axis=1)
train_df['median_country_brand_drug_id'] = train_df[['country', 'brand', 'drug_id']].apply(lambda x: df_country_brand_drug_id.loc[tuple(x), 'target_median'] if tuple(x) in df_country_brand_drug_id.index else np.nan, axis=1)
train_df['std_country_brand_drug_id'] = train_df[['country', 'brand', 'drug_id']].apply(lambda x: df_country_brand_drug_id.loc[tuple(x), 'target_std'] if tuple(x) in df_country_brand_drug_id.index else np.nan, axis=1)
train_df['min_country_brand_drug_id'] = train_df[['country', 'brand', 'drug_id']].apply(lambda x: df_country_brand_drug_id.loc[tuple(x), 'target_min'] if tuple(x) in df_country_brand_drug_id.index else np.nan, axis=1)
train_df['max_country_brand_drug_id'] = train_df[['country', 'brand', 'drug_id']].apply(lambda x: df_country_brand_drug_id.loc[tuple(x), 'target_max'] if tuple(x) in df_country_brand_drug_id.index else np.nan, axis=1)

# Print the final dataframe with all aggregated features
print(train_df.head())

  train_df['mean_country'] = train_df['country'].map(df_country['target_mean'])
  train_df['median_country'] = train_df['country'].map(df_country['target_median'])
  train_df['std_country'] = train_df['country'].map(df_country['target_std'])
  train_df['min_country'] = train_df['country'].map(df_country['target_min'])
  train_df['max_country'] = train_df['country'].map(df_country['target_max'])
  train_df['mean_brand'] = train_df['brand'].map(df_brand['target_mean'])
  train_df['median_brand'] = train_df['brand'].map(df_brand['target_median'])
  train_df['std_brand'] = train_df['brand'].map(df_brand['target_std'])
  train_df['min_brand'] = train_df['brand'].map(df_brand['target_min'])
  train_df['max_brand'] = train_df['brand'].map(df_brand['target_max'])
  train_df['mean_drug_id'] = train_df['drug_id'].map(df_drug_id['target_mean'])
  train_df['median_drug_id'] = train_df['drug_id'].map(df_drug_id['target_median'])
  train_df['std_drug_id'] = train_df['drug_id'].map(df_drug_id['target

   brand  che_pc_usd  che_perc_gdp               cluster_nl  corporation  \
0    113    1.209114      1.665879  BRAND_354E_COUNTRY_88A3          116   
1    223    1.472378      1.753338  BRAND_626D_COUNTRY_8B47            0   
2    155    1.209114      1.665879  BRAND_45D9_COUNTRY_88A3           28   
3    489    1.851280      2.051770  BRAND_D724_COUNTRY_445D           55   
4    161    1.791199      2.059130  BRAND_4887_COUNTRY_D8B0           34   

   country launch_date       date  drug_id ind_launch_date  ...  \
0       28  2014-06-01 2014-06-01      121             NaT  ...   
1       30  2014-06-01 2014-06-01      223      2014-09-01  ...   
2       28  2014-06-01 2014-06-01      236             NaT  ...   
3       13  2014-06-01 2014-06-01       25             NaT  ...   
4       43  2014-06-01 2014-06-01      149             NaT  ...   

  mean_country_drug_id  median_country_drug_id  std_country_drug_id  \
0             1.033013                1.034500             0.010762  

  train_df['max_country_brand_drug_id'] = train_df[['country', 'brand', 'drug_id']].apply(lambda x: df_country_brand_drug_id.loc[tuple(x), 'target_max'] if tuple(x) in df_country_brand_drug_id.index else np.nan, axis=1)


In [27]:
train_data = train_df_rolling[train_df_rolling['date'] < '2022-01-01']
test_data = train_df_rolling[train_df_rolling['date'] >= '2022-01-01']

## Split Data

In [28]:
X_train = train_df.drop(['target', 'cluster_nl', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)
# X_train = train_data.drop(['target', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)
y_train = train_df['target']

X_test = test_data.drop(['target', 'cluster_nl', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)
# X_test = test_data.drop(['target', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)
y_test = test_data['target']

# Define Model Architecture
Define the architecture of the model using a suitable neural network for time series prediction.

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AE_MLP(nn.Module):
    def __init__(self, num_columns, num_labels, hidden_units, dropout_rates):
        super(AE_MLP, self).__init__()
        
        # Initial batch normalization
        self.batch_norm0 = nn.BatchNorm1d(num_columns)
        
        # Encoder
        self.encoder_noise = nn.Dropout(dropout_rates[0])
        self.encoder_dense = nn.Linear(num_columns, hidden_units[0])
        self.encoder_batch_norm = nn.BatchNorm1d(hidden_units[0])
        
        # Decoder
        self.decoder_dropout = nn.Dropout(dropout_rates[1])
        self.decoder_dense = nn.Linear(hidden_units[0], num_columns)
        
        # AE branch
        self.x_ae_dense = nn.Linear(num_columns, hidden_units[1])
        self.x_ae_batch_norm = nn.BatchNorm1d(hidden_units[1])
        self.x_ae_dropout = nn.Dropout(dropout_rates[2])
        self.out_ae_dense = nn.Linear(hidden_units[1], num_labels)
        
        # Concatenation and main branch
        concat_input_dim = num_columns + hidden_units[0]
        self.concat_batch_norm = nn.BatchNorm1d(concat_input_dim)
        self.concat_dropout = nn.Dropout(dropout_rates[3])
        
        # Adjusted hidden layers
        self.hidden_layers = nn.ModuleList()
        input_dim = concat_input_dim  # Start with concatenated dimension
        for i in range(2, len(hidden_units)):
            self.hidden_layers.append(nn.Linear(input_dim, hidden_units[i]))
            self.hidden_layers.append(nn.BatchNorm1d(hidden_units[i]))
            self.hidden_layers.append(nn.Dropout(dropout_rates[i + 2]))
            input_dim = hidden_units[i]  # Update input_dim for next layer
        
        # Output layer
        self.out_dense = nn.Linear(input_dim, num_labels)
        
    def forward(self, x):
        x0 = self.batch_norm0(x)
        
        # Encoder
        encoder = self.encoder_noise(x0)
        encoder = self.encoder_dense(encoder)
        encoder = self.encoder_batch_norm(encoder)
        encoder = F.silu(encoder)
        
        # Decoder
        decoder = self.decoder_dropout(encoder)
        decoder = self.decoder_dense(decoder)
        
        # AE branch
        x_ae = self.x_ae_dense(decoder)
        x_ae = self.x_ae_batch_norm(x_ae)
        x_ae = F.silu(x_ae)
        x_ae = self.x_ae_dropout(x_ae)
        out_ae = torch.sigmoid(self.out_ae_dense(x_ae))
        
        # Main branch
        x_concat = torch.cat([x0, encoder], dim=1)
        x = self.concat_batch_norm(x_concat)
        x = self.concat_dropout(x)
        
        for layer in self.hidden_layers:
            if isinstance(layer, nn.Linear):
                x = layer(x)
                x = F.silu(x)
            else:
                x = layer(x)
                
        out = torch.sigmoid(self.out_dense(x))
        
        return decoder, out_ae, out

# Example usage:
# model = AE_MLP(num_columns, num_labels, hidden_units, dropout_rates)
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# criterion_decoder = nn.MSELoss()
# criterion_ae_action = nn.BCEWithLogitsLoss()
# criterion_action = nn.BCEWithLogitsLoss()

# Train the Model
Train the model on the entire dataset without using cross-validation. Save the best model using ModelCheckpoint.

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [30]:
model = XGBRegressor(n_estimators=2000, n_jobs=-1, random_state=33)
model.fit(X_train, y_train, verbose=True) # Change verbose to True if you want to see it train

In [31]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error: 0.0003
R² Score: 0.9999


## Evaluate Model

In [32]:
from pathlib import Path
from typing import Tuple

def _CYME(df: pd.DataFrame) -> float:
    """ Compute the CYME metric, that is 1/2(median(yearly error) + median(monthly error))"""

    yearly_agg = df.groupby("cluster_nl")[["target", "prediction"]].sum().reset_index()
    yearly_error = abs((yearly_agg["target"] - yearly_agg["prediction"])/yearly_agg["target"]).median()

    monthly_error = abs((df["target"] - df["prediction"])/df["target"]).median()

    return 1/2*(yearly_error + monthly_error)


def _metric(df: pd.DataFrame) -> float:
    """Compute metric of submission.

    :param df: Dataframe with target and 'prediction', and identifiers.
    :return: Performance metric
    """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    # Split 0 actuals - rest
    zeros = df[df["zero_actuals"] == 1]
    recent = df[df["zero_actuals"] == 0]

    # weight for each group
    zeros_weight = len(zeros)/len(df)
    recent_weight = 1 - zeros_weight

    # Compute CYME for each group
    return round(recent_weight*_CYME(recent) + zeros_weight*min(1,_CYME(zeros)), 8)


def compute_metric(submission: pd.DataFrame) -> Tuple[float, float]:
    """Compute metric.

    :param submission: Prediction. Requires columns: ['cluster_nl', 'date', 'target', 'prediction']
    :return: Performance metric.
    """

    submission["date"] = pd.to_datetime(submission["date"])
    submission = submission[['cluster_nl', 'date', 'target', 'prediction', 'zero_actuals']]

    return _metric(submission)

In [35]:
validation = test_data.copy()

validation["prediction"] = model.predict(validation[X_train.columns])

# Assign column ["zero_actuals"] in the depending if in your
# split the cluster_nl has already had actuals on train or not
existing_clusters = train_data['cluster_nl'].unique()
validation['zero_actuals'] = (~validation['cluster_nl'].isin(existing_clusters)).astype(int)

print("Performance:", compute_metric(validation))

Performance: 0.00486111


# Save the Model
Evaluate the model's performance on a validation set or using other suitable metrics.

In [None]:
test_df = pd.read_csv('SUBMISSION/Data Files/submission_data.csv')
test_df.replace(-1.0, np.nan, inplace=True)
test_df.replace('-1', np.nan, inplace=True)

(0.0, 1.0, 8)

In [None]:
test_df[numeric_cols] = test_df[numeric_cols].fillna(test_df[numeric_cols].median())

In [None]:
date_columns = ['launch_date', 'date', 'ind_launch_date']
for col in date_columns:
    test_df[col] = pd.to_datetime(test_df[col], errors='coerce')

In [None]:
test_df['launch_year'] = test_df['launch_date'].dt.year
test_df['launch_month'] = test_df['launch_date'].dt.month
test_df['date_year'] = test_df['date'].dt.year
test_df['date_month'] = test_df['date'].dt.month

In [None]:
for col in label_enc_columns:
    le = label_encoders[col]
    test_df[col] = le.fit_transform(test_df[col].astype(str))
    label_encoders[col] = le

In [None]:
X_test = test_df.drop(['target', 'cluster_nl', 'launch_date', 'date', 'ind_launch_date', 'indication'], axis=1)

In [None]:
y_pred = model.predict(X_test)

In [None]:
test_df['date_str'] = test_df['date'].astype(str)

In [None]:
submission_data = pd.DataFrame({'date_str':test_df['date_str'], 'cluster_nl':test_df['cluster_nl'], 'prediction': y_pred})

In [None]:
submission_csv = pd.read_csv('SUBMISSION/submission_template.csv')

In [None]:
submission_csv.drop('prediction', axis=1, inplace=True)

In [None]:
submission_csv['date']

In [None]:
submission_csv = pd.merge(submission_csv, submission_data, left_on=['date', 'cluster_nl'], right_on=['date_str', 'cluster_nl'], how='left')