In [28]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.optimizers import Adam


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from typing import Tuple

epochs = 10
batch_size=64



In [18]:
def _CYME(df: pd.DataFrame) -> float:
    """ Compute the CYME metric, that is 1/2(median(yearly error) + median(monthly error))"""

    yearly_agg = df.groupby("cluster_nl")[["target", "prediction"]].sum().reset_index()
    yearly_error = abs((yearly_agg["target"] - yearly_agg["prediction"])/yearly_agg["target"]).median()

    monthly_error = abs((df["target"] - df["prediction"])/df["target"]).median()

    return 1/2*(yearly_error + monthly_error)


def _metric(df: pd.DataFrame) -> float:
    """Compute metric of submission.

    :param df: Dataframe with target and 'prediction', and identifiers.
    :return: Performance metric
    """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    # Split 0 actuals - rest
    zeros = df[df["zero_actuals"] == 1]
    recent = df[df["zero_actuals"] == 0]

    # weight for each group
    zeros_weight = len(zeros)/len(df)
    recent_weight = 1 - zeros_weight

    # Compute CYME for each group
    return round(recent_weight*_CYME(recent) + zeros_weight*min(1,_CYME(zeros)),8)


def compute_metric(submission: pd.DataFrame) -> Tuple[float, float]:
    """Compute metric.

    :param submission: Prediction. Requires columns: ['cluster_nl', 'date', 'target', 'prediction']
    :return: Performance metric.
    """

    submission["date"] = pd.to_datetime(submission["date"])
    submission = submission[['cluster_nl', 'date', 'target', 'prediction', 'zero_actuals']]

    return _metric(submission)

In [19]:
# Define file paths
input_path = "dataset"
print("Files in dataset directory:")
print(os.path.dirname(os.getcwd()))
print(os.listdir(os.path.join(os.path.dirname(os.getcwd()), input_path)))

features_cols = [
    "brand", 
    "che_pc_usd", 
    "che_perc_gdp", 
    "corporation", 
    "country", 
    "launch_date", 
    "drug_id", 
    "ind_launch_date", 
    "indication", 
    "insurance_perc_che", 
    "population", 
    "prev_perc", 
    "price_month", 
    "price_unit", 
    "public_perc_che", 
    "therapeutic_area",
]
target_col = "target"
id_col = ["date","cluster_nl"]

base_dir = os.path.join(os.path.dirname(os.getcwd()), input_path)
# Load datasets
# data = pd.read_csv(f"{base_dir}/train_data.csv", usecols=features_cols + [target_col] + id_col)
data = pd.read_csv(f"{base_dir}/train_data_TRY1.csv", usecols=features_cols + [target_col] + id_col)
test_data = pd.read_csv(f"{base_dir}/submission_data.csv", usecols=features_cols + id_col)

y = data[target_col]

# Display dataset informations
print(f"Training data info: {data.info()}")
print(data.info(memory_usage="deep"))

Files in dataset directory:
/home/ferrandf/novartis-datathon
['submission_data.csv', 'train_data.csv', 'First_Clean_train_data.csv', 'train_data_TRY1.csv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118906 entries, 0 to 118905
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   brand               118906 non-null  object 
 1   che_pc_usd          118906 non-null  float64
 2   che_perc_gdp        118906 non-null  float64
 3   cluster_nl          118906 non-null  object 
 4   corporation         118906 non-null  object 
 5   country             118906 non-null  object 
 6   launch_date         118906 non-null  object 
 7   date                118906 non-null  object 
 8   drug_id             118906 non-null  object 
 9   ind_launch_date     118906 non-null  object 
 10  indication          118906 non-null  object 
 11  insurance_perc_che  118906 non-null  float64
 12  population          118906 n

In [20]:
# Separate numeric and categorical features for imputation
numeric_features = data.select_dtypes(include=['float64']).drop(columns=[target_col], errors='ignore').columns
categorical_features = data.select_dtypes(include=['object']).columns

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")


Numeric features: Index(['che_pc_usd', 'che_perc_gdp', 'insurance_perc_che', 'population',
       'prev_perc', 'price_month', 'price_unit', 'public_perc_che'],
      dtype='object')
Categorical features: Index(['brand', 'cluster_nl', 'corporation', 'country', 'launch_date', 'date',
       'drug_id', 'ind_launch_date', 'indication', 'therapeutic_area'],
      dtype='object')


In [21]:
# Drop unnecessary columns
X = data.drop(columns=[target_col]+id_col)
X_test = test_data.drop(columns=id_col)

# Preprocessing pipeline
def preprocess_data(X, preprocessor=None, fit=True):
    numerical_features = X.select_dtypes(include=['float64']).columns
    categorical_features = X.select_dtypes(include=['category']).columns

    if preprocessor is None:
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

    if fit:
        X_transformed = preprocessor.fit_transform(X)
    else:
        X_transformed = preprocessor.transform(X)
    
    X_transformed = np.array(X_transformed)

    return X_transformed, preprocessor

# Preprocess data
X_transformed, preprocessor = preprocess_data(X, fit=True)
X_test_transformed, _ = preprocess_data(X_test, preprocessor=preprocessor, fit=False)

print(X_transformed)


[[-0.72931342  0.1093513   0.84690232 ...  0.06633365 -0.21597011
   0.24331455]
 [-5.76038552 -4.8459021  -1.9239651  ... -1.35219885  0.11488193
  -9.10311876]
 [-0.72931342  0.1093513   0.84690232 ... -1.35219885  0.93443557
   0.24331455]
 ...
 [-0.91518872 -0.21468219  0.78305745 ...  0.07397957 -0.21352666
   0.54315001]
 [ 0.78612113  0.76809541 -1.9239651  ...  0.74117745  0.58141327
   0.73777625]
 [-0.91518872 -0.21468219  0.78305745 ... -1.35219885 -0.22195944
   0.54315001]]


In [26]:
# Split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_transformed, y, test_size=0.05, random_state=42)

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['accuracy', 'mae', 'mse'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [27]:
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_valid, y_valid), verbose=1)

Epoch 1/100
[1m3530/3530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 7ms/step - accuracy: 0.0028 - loss: 1.3278 - mae: 0.5128 - mse: 1.3278 - val_accuracy: 0.0024 - val_loss: 0.8726 - val_mae: 0.3426 - val_mse: 0.8726
Epoch 2/100
[1m3530/3530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step - accuracy: 0.0028 - loss: 1.0287 - mae: 0.3994 - mse: 1.0287 - val_accuracy: 0.0024 - val_loss: 0.8444 - val_mae: 0.3783 - val_mse: 0.8444
Epoch 3/100
[1m3530/3530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - accuracy: 0.0029 - loss: 0.9497 - mae: 0.3750 - mse: 0.9497 - val_accuracy: 0.0024 - val_loss: 0.7338 - val_mae: 0.3476 - val_mse: 0.7338
Epoch 4/100
[1m3530/3530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 7ms/step - accuracy: 0.0027 - loss: 0.8851 - mae: 0.3707 - mse: 0.8851 - val_accuracy: 0.0024 - val_loss: 0.6755 - val_mae: 0.3273 - val_mse: 0.6755
Epoch 5/100
[1m3530/3530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m

<keras.src.callbacks.history.History at 0x7fa7584ea750>

In [29]:
results = model.evaluate(X_valid, y_valid)
print(results)

[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0026 - loss: 0.4073 - mae: 0.2742 - mse: 0.4073
[0.42726361751556396, 0.0023545240983366966, 0.28378382325172424, 0.42726361751556396]


In [31]:
# Predict on test data
predictions = model.predict(X_test_transformed)


result = pd.DataFrame({
    id_col[0]: pd.to_datetime(test_data[id_col[0]]).dt.strftime("%m/%d/%Y"),
    id_col[1]: test_data[id_col[1]],
    # "target": data[target_col],
    "prediction": predictions.flatten()
})

result.to_csv('result.csv', index=False)
print("Result file saved as 'result.csv'")

result.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Result file saved as 'result.csv'
Submission file saved as 'submission.csv'


In [None]:
import matplotlib.pyplot as plt

# Plot training history
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.legend()
plt.title("Training and Validation Loss")
plt.show()

# Compare predictions with actual values
plt.scatter(y_test, predictions)
plt.xlabel("Actual Sales")
plt.ylabel("Predicted Sales")
plt.title("Actual vs. Predicted Sales")
plt.show()


In [None]:
model2 = Sequential()
model2.add(LSTM(8, input_shape=[None, 1], return_sequences=True))
model2.add(LSTM(4, input_shape=[None, 1]))
model2.add(Dense(1))
model2.compile(loss="huber_loss", optimizer='adam')