In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


from pathlib import Path
from typing import Tuple

In [2]:
def _CYME(df: pd.DataFrame) -> float:
    """ Compute the CYME metric, that is 1/2(median(yearly error) + median(monthly error))"""

    yearly_agg = df.groupby("cluster_nl")[["target", "prediction"]].sum().reset_index()
    yearly_error = abs((yearly_agg["target"] - yearly_agg["prediction"])/yearly_agg["target"]).median()

    monthly_error = abs((df["target"] - df["prediction"])/df["target"]).median()

    return 1/2*(yearly_error + monthly_error)


def _metric(df: pd.DataFrame) -> float:
    """Compute metric of submission.

    :param df: Dataframe with target and 'prediction', and identifiers.
    :return: Performance metric
    """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    # Split 0 actuals - rest
    zeros = df[df["zero_actuals"] == 1]
    recent = df[df["zero_actuals"] == 0]

    # weight for each group
    zeros_weight = len(zeros)/len(df)
    recent_weight = 1 - zeros_weight

    # Compute CYME for each group
    return round(recent_weight*_CYME(recent) + zeros_weight*min(1,_CYME(zeros)),8)


def compute_metric(submission: pd.DataFrame) -> Tuple[float, float]:
    """Compute metric.

    :param submission: Prediction. Requires columns: ['cluster_nl', 'date', 'target', 'prediction']
    :return: Performance metric.
    """

    submission["date"] = pd.to_datetime(submission["date"])
    submission = submission[['cluster_nl', 'date', 'target', 'prediction', 'zero_actuals']]

    return _metric(submission)

In [4]:
# Define file paths

features_cols = [
    "brand",
    "che_pc_usd",
    "che_perc_gdp",
    "corporation",
    "country",
    "launch_date",
    "drug_id",
    "ind_launch_date",
    "indication",
    "insurance_perc_che",
    "population",
    "prev_perc",
    "price_month",
    "price_unit",
    "public_perc_che",
    "therapeutic_area",
    "Country_Group",
    "Price_Group",
    "indication_number"
]
target_col = "target"
id_col = ["date","cluster_nl"]

base_dir = os.path.join(os.path.dirname(os.getcwd()), "dataset")
therap = os.path.join(os.path.dirname(os.getcwd()), "dataset", "therapeutic_area")
# Load datasets
# data = pd.read_csv(f"{base_dir}/train_data.csv", usecols=features_cols + [target_col] + id_col)
data = {}
y = {}
therapeutic_areas = os.listdir(therap)
extracted_parts = [name.split('_')[1].split('.')[0] for name in therapeutic_areas]
print(therapeutic_areas)

for i in extracted_parts:
    data[i] = pd.read_csv(f"{therap}/subset_{i}.csv", usecols=features_cols + [target_col] + id_col)
    print(f"Data loaded for therapeutic area {i}")
    y[i] = data[i][target_col]

test_data = pd.read_csv(f"{base_dir}/submission_data_added.csv", usecols=features_cols + id_col)



['subset_032C.csv', 'subset_22ED.csv', 'subset_CD59.csv', 'subset_4BA5.csv', 'subset_645F.csv', 'subset_8E53.csv', 'subset_644A.csv', 'subset_66C5.csv', 'subset_96D7.csv', 'subset_6CEE.csv', 'subset_051D.csv', 'subset_980E.csv']
Data loaded for therapeutic area 032C
Data loaded for therapeutic area 22ED
Data loaded for therapeutic area CD59
Data loaded for therapeutic area 4BA5
Data loaded for therapeutic area 645F
Data loaded for therapeutic area 8E53
Data loaded for therapeutic area 644A
Data loaded for therapeutic area 66C5
Data loaded for therapeutic area 96D7
Data loaded for therapeutic area 6CEE
Data loaded for therapeutic area 051D
Data loaded for therapeutic area 980E


In [5]:
numeric_features = {}
categorical_features = {}
for i in extracted_parts:
    # convert int64 to float64
    data[i] = data[i].astype({"Country_Group": "float64"})
    data[i] = data[i].astype({"Price_Group": "float64"})
    data[i] = data[i].astype({"indication_number": "float64"})
    numeric_features[i] = data[i].select_dtypes(include=['float64']).drop(columns=[target_col], errors='ignore').columns
    categorical_features[i] = data[i].select_dtypes(include=['object']).columns

test_data = test_data.astype({"Country_Group": "float64"})
test_data = test_data.astype({"Price_Group": "float64"})
test_data = test_data.astype({"indication_number": "float64"})

# Separate numeric and categorical features for imputation


In [6]:
X = {}
for i in extracted_parts:
    # Drop unnecessary columns
    X[i] = data[i].drop(columns=[target_col]+["cluster_nl"])

X_test = test_data.drop(columns=["cluster_nl"])

# Preprocessing pipeline
def preprocess_data(X, preprocessor=None, fit=True):
    numerical_features = X.select_dtypes(include=['float64']).columns
    categorical_features = X.select_dtypes(include=['category']).columns

    if preprocessor is None:
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

    if fit:
        X_transformed = preprocessor.fit_transform(X)
    else:
        X_transformed = preprocessor.transform(X)

    X_transformed = np.array(X_transformed)

    return X_transformed, preprocessor

# Preprocess data
X_transformed = {}
for i in extracted_parts:
    X_transformed[i], preprocessor = preprocess_data(X[i], fit=True)


X_test_transformed = preprocess_data(X_test, preprocessor=preprocessor, fit=False)

In [7]:
X_train = {}
X_valid = {}
y_train = {}
y_valid = {}
for i in extracted_parts:
    print(f"X_transformed shape for {i}: {X_transformed[i].shape}")
    X_train[i], X_valid[i], y_train[i], y_valid[i] = train_test_split(X_transformed[i], y[i], test_size=0.05, random_state=42)


X_transformed shape for 032C: (2011, 11)
X_transformed shape for 644A: (7579, 11)
X_transformed shape for 980E: (20298, 11)
X_transformed shape for CD59: (4578, 11)
X_transformed shape for 645F: (589, 11)
X_transformed shape for 66C5: (22024, 11)
X_transformed shape for 22ED: (112, 11)
X_transformed shape for 96D7: (45858, 11)
X_transformed shape for 4BA5: (1628, 11)
X_transformed shape for 8E53: (1523, 11)
X_transformed shape for 051D: (846, 11)
X_transformed shape for 6CEE: (11871, 11)


In [None]:
# Let's train different models for different therapeutic areas
models = {}
for i in extracted_parts:
    models[f'RandomForest_{i}'] =  RandomForestRegressor(n_estimators=100, random_state=42)

print(models)
results = {}
for i in extracted_parts:
    results[i] = 0
for name, model in models.items():
    model.fit(X_train[i], y_train[i])
    scores = cross_val_score(model, X_train[i], y_train[i], scoring='neg_root_mean_squared_error', cv=5)
    results[name.split('_')[1].split('.')[0]] = -scores.mean()

{'RandomForest_032C': RandomForestRegressor(random_state=42), 'RandomForest_644A': RandomForestRegressor(random_state=42), 'RandomForest_980E': RandomForestRegressor(random_state=42), 'RandomForest_CD59': RandomForestRegressor(random_state=42), 'RandomForest_645F': RandomForestRegressor(random_state=42), 'RandomForest_66C5': RandomForestRegressor(random_state=42), 'RandomForest_22ED': RandomForestRegressor(random_state=42), 'RandomForest_96D7': RandomForestRegressor(random_state=42), 'RandomForest_4BA5': RandomForestRegressor(random_state=42), 'RandomForest_8E53': RandomForestRegressor(random_state=42), 'RandomForest_051D': RandomForestRegressor(random_state=42), 'RandomForest_6CEE': RandomForestRegressor(random_state=42)}


In [16]:
print(results)

{'032C': 0.24356115253075936, '644A': 0.24356115253075936, '980E': 0.24356115253075936, 'CD59': 0.24356115253075936, '645F': 0.24356115253075936, '66C5': 0.24356115253075936, '22ED': 0.24356115253075936, '96D7': 0.24356115253075936, '4BA5': 0.24356115253075936, '8E53': 0.24356115253075936, '051D': 0.24356115253075936, '6CEE': 0.24356115253075936}


In [48]:
X_test_transformed = preprocess_data(X_test, preprocessor=preprocessor, fit=False)
print(X_test_transformed[0])

# Ensure indices align between test_data and X_test_transformed
X_test_transformed = pd.DataFrame(X_test_transformed[0], index=test_data.index)  # Specify column names if necessary

def extract_part(text):
    return text.split('_')[2].split('.')[0]

X_test_transformed['therapeutic_area'] = test_data['therapeutic_area'].apply(extract_part)
print(X_test_transformed)

# Initialize a DataFrame to store the results
results_final = pd.DataFrame()

# Iterate over each therapeutic area
for area in models.keys():
    keyy = area.split('_')[1]
    print(keyy)
    # Filter test data for the current therapeutic area
    area_indices = X_test_transformed[X_test_transformed['therapeutic_area'] == keyy].index
    X_area = X_test_transformed.loc[area_indices].drop('therapeutic_area', axis=1)

    # Get the corresponding model
    model = models[area]
    print(models)

    # Make predictions
    if not X_area.empty:
      predictions = model.predict(X_area)

      # Prepare the result DataFrame
      test_area = test_data.loc[area_indices]
      area_results = pd.DataFrame({
          'date': pd.to_datetime(test_area['date']).dt.strftime("%m/%d/%Y"),
          'cluster_nl': test_area['cluster_nl'],
          'prediction': predictions
      })

      # Append to the results
      results_final = pd.concat([results_final, area_results], ignore_index=True)

results_final.to_csv('result.csv', index=False)
print("Result file saved as 'result.csv'")

[[ 1.01567531 -0.39652544  0.12659212 ...  1.10610031  1.7231267
  -0.22723853]
 [ 1.01567531 -0.39652544  0.12659212 ...  1.10610031  1.7231267
  -0.22723853]
 [-1.33456047 -1.10284558  0.32783526 ...  1.10610031  1.7231267
   3.71078869]
 ...
 [-0.96082442 -0.34642948  0.85039814 ...  1.10610031  1.7231267
  -0.22723853]
 [-0.96082442 -0.34642948  0.85039814 ...  1.10610031  1.7231267
  -0.22723853]
 [-1.33321338 -1.06392118  0.19474396 ...  1.10610031  1.7231267
  -0.22723853]]
             0         1         2         3          4         5         6  \
0     1.015675 -0.396525  0.126592 -0.347339   0.485704  0.043409 -0.094785   
1     1.015675 -0.396525  0.126592 -0.347339   2.578480 -1.452104 -0.088241   
2    -1.334560 -1.102846  0.327835  1.660868   0.619854 -1.452104 -0.095205   
3    -0.953909 -0.303391  0.823770 -0.313311  -0.523229 -1.452104 -0.079308   
4    -1.333213 -1.063921  0.194744  0.439429  -0.601617 -1.452104 -0.031373   
...        ...       ...       ...      