In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


from pathlib import Path
from typing import Tuple

In [3]:
def _CYME(df: pd.DataFrame) -> float:
    """ Compute the CYME metric, that is 1/2(median(yearly error) + median(monthly error))"""

    yearly_agg = df.groupby("cluster_nl")[["target", "prediction"]].sum().reset_index()
    yearly_error = abs((yearly_agg["target"] - yearly_agg["prediction"])/yearly_agg["target"]).median()

    monthly_error = abs((df["target"] - df["prediction"])/df["target"]).median()

    return 1/2*(yearly_error + monthly_error)


def _metric(df: pd.DataFrame) -> float:
    """Compute metric of submission.

    :param df: Dataframe with target and 'prediction', and identifiers.
    :return: Performance metric
    """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    # Split 0 actuals - rest
    zeros = df[df["zero_actuals"] == 1]
    recent = df[df["zero_actuals"] == 0]

    # weight for each group
    zeros_weight = len(zeros)/len(df)
    recent_weight = 1 - zeros_weight

    # Compute CYME for each group
    return round(recent_weight*_CYME(recent) + zeros_weight*min(1,_CYME(zeros)),8)


def compute_metric(submission: pd.DataFrame) -> Tuple[float, float]:
    """Compute metric.

    :param submission: Prediction. Requires columns: ['cluster_nl', 'date', 'target', 'prediction']
    :return: Performance metric.
    """

    submission["date"] = pd.to_datetime(submission["date"])
    submission = submission[['cluster_nl', 'date', 'target', 'prediction', 'zero_actuals']]

    return _metric(submission)

In [4]:
# Define file paths
input_path = "dataset"
print("Files in dataset directory:")
print(os.path.dirname(os.getcwd()))
print(os.listdir(os.path.join(os.path.dirname(os.getcwd()), input_path)))

features_cols = [
    "brand", 
    "che_pc_usd", 
    "che_perc_gdp", 
    "corporation", 
    "country", 
    "launch_date", 
    "drug_id", 
    "ind_launch_date", 
    "indication", 
    "insurance_perc_che", 
    "population", 
    "prev_perc", 
    "price_month", 
    "price_unit", 
    "public_perc_che", 
    "therapeutic_area",
]
target_col = "target"
id_col = ["cluster_nl","date"]

base_dir = os.path.join(os.path.dirname(os.getcwd()), input_path)
# Load datasets
# data = pd.read_csv(f"{base_dir}/train_data.csv", usecols=features_cols + [target_col] + id_col)
data = pd.read_csv(f"{base_dir}/train_data_TRY1.csv", usecols=features_cols + [target_col] + id_col)
test_data = pd.read_csv(f"{base_dir}/submission_data.csv", usecols=features_cols + id_col)

y = data[target_col]

# Display dataset informations
print(f"Training data info: {data.info()}")
print(data.info(memory_usage="deep"))

Files in dataset directory:
/home/ferrandf/novartis-datathon
['submission_data.csv', 'train_data.csv', 'First_Clean_train_data.csv', 'train_data_TRY1.csv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118906 entries, 0 to 118905
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   brand               118906 non-null  object 
 1   che_pc_usd          118906 non-null  float64
 2   che_perc_gdp        118906 non-null  float64
 3   cluster_nl          118906 non-null  object 
 4   corporation         118906 non-null  object 
 5   country             118906 non-null  object 
 6   launch_date         118906 non-null  object 
 7   date                118906 non-null  object 
 8   drug_id             118906 non-null  object 
 9   ind_launch_date     118906 non-null  object 
 10  indication          118906 non-null  object 
 11  insurance_perc_che  118906 non-null  float64
 12  population          118906 n

In [5]:
# Separate numeric and categorical features for imputation
numeric_features = data.select_dtypes(include=['float64']).drop(columns=[target_col], errors='ignore').columns
categorical_features = data.select_dtypes(include=['object']).columns

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")


Numeric features: Index(['che_pc_usd', 'che_perc_gdp', 'insurance_perc_che', 'population',
       'prev_perc', 'price_month', 'price_unit', 'public_perc_che'],
      dtype='object')
Categorical features: Index(['brand', 'cluster_nl', 'corporation', 'country', 'launch_date', 'date',
       'drug_id', 'ind_launch_date', 'indication', 'therapeutic_area'],
      dtype='object')


In [6]:
# Drop unnecessary columns
X = data.drop(columns=[target_col]+id_col)
X_test = test_data.drop(columns=id_col)

# Preprocessing pipeline
def preprocess_data(X, preprocessor=None, fit=True):
    numerical_features = X.select_dtypes(include=['float64']).columns
    categorical_features = X.select_dtypes(include=['category']).columns

    if preprocessor is None:
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

    if fit:
        X_transformed = preprocessor.fit_transform(X)
    else:
        X_transformed = preprocessor.transform(X)
    
    X_transformed = np.array(X_transformed)

    return X_transformed, preprocessor

# Preprocess data
X_transformed, preprocessor = preprocess_data(X, fit=True)
X_test_transformed, _ = preprocess_data(X_test, preprocessor=preprocessor, fit=False)

print(X_transformed)


[[-0.72931342  0.1093513   0.84690232 ...  0.06633365 -0.21597011
   0.24331455]
 [-5.76038552 -4.8459021  -1.9239651  ... -1.35219885  0.11488193
  -9.10311876]
 [-0.72931342  0.1093513   0.84690232 ... -1.35219885  0.93443557
   0.24331455]
 ...
 [-0.91518872 -0.21468219  0.78305745 ...  0.07397957 -0.21352666
   0.54315001]
 [ 0.78612113  0.76809541 -1.9239651  ...  0.74117745  0.58141327
   0.73777625]
 [-0.91518872 -0.21468219  0.78305745 ... -1.35219885 -0.22195944
   0.54315001]]


In [21]:
# Split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_transformed, y, test_size=0.05, random_state=42)

# Define models
models = {
    'RandomForest100': RandomForestRegressor(n_estimators=100, random_state=42),
}
# Evaluate models using cross-validation
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
    results[name] = -scores.mean()



In [8]:
validation = X_valid.copy()

# Ensure y_valid is a pandas Series
if not isinstance(y_valid, pd.Series):
    y_valid = pd.Series(y_valid)
print(len(y_valid), len(X_valid))
# Check if the length of y_valid matches the number of rows in X_valid
if len(y_valid) != len(X_valid):
    
    raise ValueError("Length of y_valid does not match the number of rows in X_valid")

print(y_valid)

y_valid.drop(y_valid.index[:1], inplace=True)

validation['target'] = y_valid



validation['prediction'] = model.predict(X_valid[features_cols])

23782 23782
41674     1.000273
108204    1.556673
77650     2.509877
36913     1.010462
13619     1.011719
            ...   
42758     1.003945
34976     1.017688
41932     1.154981
21926     1.053061
96697     1.272071
Name: target, Length: 23782, dtype: float64


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# Optionally check performance
print("Performance:", compute_metric(validation))

In [89]:

# Prepare submission
submission_data = pd.read_parquet(f"{base_dir}/submission_data.csv")
submission = pd.read_csv(f"{base_dir}/submission_template.csv")

submission['prediction'] = model.predict(submission_data[features_cols])

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [22]:
# Display RMSE results
results_df = pd.DataFrame(results.items(), columns=['Model', 'RMSE']).sort_values(by='RMSE')
results_df['RMSE'] = results_df['RMSE'].map("{:.5f}".format)
results_df

best_model_name = results_df.iloc[0]['Model']
best_model = models[best_model_name]
# Make predictions on the test set
train_predictions = best_model.predict(X_test_transformed)

result = pd.DataFrame({
    id_col[1]: pd.to_datetime(test_data[id_col[1]]).dt.strftime("%m/%d/%Y"),
    id_col[0]: test_data[id_col[0]],
    # "target": data[target_col],
    "prediction": train_predictions
})

result.to_csv('result.csv', index=False)
print("Result file saved as 'result.csv'")

result.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")
results_df


Result file saved as 'result.csv'
Submission file saved as 'submission.csv'


Unnamed: 0,Model,RMSE
0,RandomForest100,0.25852


In [81]:
best_model_name = results_df.iloc[0]['Model']
best_model = models[best_model_name]
best_model.fit(X_transformed, y)
print(f"Selected Model: {best_model_name}")

Selected Model: RandomForest


In [None]:
for name, model in models.items():
    # Make predictions on the test set
    test_predictions = np.expm1(model.predict(X_test_transformed))

    result = pd.DataFrame({
    # In the provided CSV the longitude and latitude are inverted
        "Location.GIS.Latitude": test_data["Location.GIS.Longitude"].astype(float),
        "Location.GIS.Logitude": test_data["Location.GIS.Latitude"].astype(float),
        id_col: test_data[id_col].astype(str),
        target_col: test_predictions.astype(float)
    })
    
    result_name = f"result_{name}.csv"
    result.to_csv(result_name, index=False)
    print(f"Result file saved as '{result_name}'")

    submission_name = "submission_{name}.csv"
    result[[id_col, target_col]].to_csv(submission_name, index=False)
    print(f"Submission file saved as '{submission_name}'")

In [83]:
test_predictions = np.expm1(best_model.predict(X_test_transformed))

result = pd.DataFrame({
    # In the provided CSV the longitude and latitude are inverted
    "Location.GIS.Latitude": test_data["Location.GIS.Longitude"].astype(float),
    "Location.GIS.Logitude": test_data["Location.GIS.Latitude"].astype(float),
    id_col: test_data[id_col],
    target_col: test_predictions
})

result.to_csv('result_test.csv', index=False)
print("Result file saved as 'result_test.csv'")

result[[id_col, target_col]].to_csv('submission_test.csv', index=False)
print("Submission file saved as 'submission_test.csv'")

Result file saved as 'result_test.csv'
Submission file saved as 'submission_test.csv'
