In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from transformers import BertTokenizer, BertModel
import torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import joblib
from flask import Flask, request, jsonify
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#loading the dataset
Drug = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Drug.csv")

### Data Preprocessing

In [None]:
Drug.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2219 entries, 0 to 2218
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Condition     2219 non-null   object 
 1   Drug          2219 non-null   object 
 2   Indication    2219 non-null   object 
 3   Type          2219 non-null   object 
 4   Reviews       2219 non-null   object 
 5   Effective     2219 non-null   float64
 6   EaseOfUse     2219 non-null   float64
 7   Satisfaction  2219 non-null   float64
 8   Information   2219 non-null   object 
dtypes: float64(3), object(6)
memory usage: 156.1+ KB


### Data Pre-processing

In [None]:
#removing all duplicate columns
Drug = Drug.drop_duplicates()

In [None]:
#Dropping the type  column
Drug.drop("Type", axis = 1,inplace = True)

In [None]:
Drug.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1753 entries, 0 to 2218
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Condition     1753 non-null   object 
 1   Drug          1753 non-null   object 
 2   Indication    1753 non-null   object 
 3   Reviews       1753 non-null   object 
 4   Effective     1753 non-null   float64
 5   EaseOfUse     1753 non-null   float64
 6   Satisfaction  1753 non-null   float64
 7   Information   1753 non-null   object 
dtypes: float64(3), object(5)
memory usage: 123.3+ KB


In [None]:
#Removing the post fix from the reviews
Drug['Reviews'] = Drug['Reviews'].str.replace(' Reviews', '', regex=False)
#converting reviews to a numerical value
Drug['Reviews'] = Drug['Reviews'].astype(int)

In [None]:
Drug.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1753 entries, 0 to 2218
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Condition     1753 non-null   object 
 1   Drug          1753 non-null   object 
 2   Indication    1753 non-null   object 
 3   Reviews       1753 non-null   int64  
 4   Effective     1753 non-null   float64
 5   EaseOfUse     1753 non-null   float64
 6   Satisfaction  1753 non-null   float64
 7   Information   1753 non-null   object 
dtypes: float64(3), int64(1), object(4)
memory usage: 123.3+ KB


In [None]:
#No missing values were found, hence no imputation

In [None]:
 #Declaring the label encoder
 label_encoders = {}
 categorical_features = ['Condition', 'Drug', 'Indication']

In [None]:
#Encoding the categorical variables
for feature in categorical_features:
    label_en = LabelEncoder()
    Drug[feature] = label_en.fit_transform(Drug[feature])
    label_encoders[feature] = label_en

### Feature Engineering

In [None]:
# Declaring the standard scaler
scaler = StandardScaler()

In [None]:
#Scaling the reviews feature
Drug[['Reviews']] = scaler.fit_transform(Drug[['Reviews']])

In [None]:
joblib.dump(label_encoders, '/content/drive/MyDrive/Shared Folder/label_encoders.joblib')
joblib.dump(scaler, '/content/drive/MyDrive/Shared Folder/scaler.joblib')

['/content/drive/MyDrive/Shared Folder/scaler.joblib']

In [None]:
Drug.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1753 entries, 0 to 2218
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Condition     1753 non-null   int64  
 1   Drug          1753 non-null   int64  
 2   Indication    1753 non-null   int64  
 3   Reviews       1753 non-null   float64
 4   Effective     1753 non-null   float64
 5   EaseOfUse     1753 non-null   float64
 6   Satisfaction  1753 non-null   float64
 7   Information   1753 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 123.3+ KB


In [None]:
# Extracting target variables
X = Drug[['Condition', 'Drug', 'Indication', 'Reviews', 'Satisfaction', 'Information']]
y_effective = Drug['Effective']
y_ease_of_use = Drug['EaseOfUse']

In [None]:
# Splitting data into training and validation sets
X_train, X_val, y_train_effective, y_val_effective, y_train_ease, y_val_ease = train_test_split(
    X, y_effective, y_ease_of_use, test_size=0.2, random_state=42)


In [None]:
# Tokenize 'Information' column using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
#defining a function to tockenize the texts
def tokenize_texts(texts, tokenizer, max_len=512):
    return tokenizer.batch_encode_plus(
        texts,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

In [None]:
#Tockenizing the information feature in bith training and validation sets
train_encodings = tokenize_texts(X_train['Information'].tolist(), tokenizer)
val_encodings = tokenize_texts(X_val['Information'].tolist(), tokenizer)

In [None]:
#Extracting the pre-trained model from bert
bert_model = BertModel.from_pretrained('bert-base-uncased')


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
#Function to extract the important features
def extract_bert_features(encodings, model):
    with torch.no_grad():
        outputs = model(**encodings)
    return outputs.last_hidden_state.mean(dim=1).numpy()


In [None]:
#Extracting the most important features for the training and validation set
X_train_bert = extract_bert_features(train_encodings, bert_model)
X_val_bert = extract_bert_features(val_encodings, bert_model)


In [None]:
# Combine BERT features with other features
X_train_combined = pd.concat([X_train.drop(columns='Information').reset_index(drop=True),
                              pd.DataFrame(X_train_bert)], axis=1)
X_val_combined = pd.concat([X_val.drop(columns='Information').reset_index(drop=True),
                            pd.DataFrame(X_val_bert)], axis=1)

### Training the Models

In [None]:
# Defining the neural network model using the sequential API
input_size = X_train_combined.shape[1]
model_effective = nn.Sequential(
    nn.Linear(input_size, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 1)
)

model_ease_of_use = nn.Sequential(
    nn.Linear(input_size, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 1)
)

In [None]:
# Preparing the dataset
X_train_tensor = torch.tensor(X_train_combined.values, dtype=torch.float32)
y_train_effective_tensor = torch.tensor(y_train_effective.values, dtype=torch.float32).unsqueeze(1)
y_train_ease_tensor = torch.tensor(y_train_ease.values, dtype=torch.float32).unsqueeze(1)

X_val_tensor = torch.tensor(X_val_combined.values, dtype=torch.float32)
y_val_effective_tensor = torch.tensor(y_val_effective.values, dtype=torch.float32).unsqueeze(1)
y_val_ease_tensor = torch.tensor(y_val_ease.values, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, y_train_effective_tensor, y_train_ease_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_effective_tensor, y_val_ease_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [None]:
# Initializing the loss function and optimizer
criterion = nn.MSELoss()
optimizer_effective = optim.Adam(model_effective.parameters(), lr=0.001)
optimizer_ease_of_use = optim.Adam(model_ease_of_use.parameters(), lr=0.001)


In [None]:
# Trainninh the models
num_epochs = 50

def train_model(model, optimizer, train_loader, val_loader, criterion, target_index):
    model.train()
    for epoch in range(num_epochs):
        for X_batch, y_batch_effective, y_batch_ease in train_loader:
            y_batch = y_batch_effective if target_index == 0 else y_batch_ease
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch_effective, y_batch_ease in val_loader:
                y_batch = y_batch_effective if target_index == 0 else y_batch_ease
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss/len(val_loader):.4f}')
        model.train()

In [None]:
train_model(model_effective, optimizer_effective, train_loader, val_loader, criterion, target_index=0)
train_model(model_ease_of_use, optimizer_ease_of_use, train_loader, val_loader, criterion, target_index=1)

Epoch [1/50], Loss: 1.1182, Val Loss: 1.1155
Epoch [2/50], Loss: 1.1394, Val Loss: 1.2582
Epoch [3/50], Loss: 0.8636, Val Loss: 0.9414
Epoch [4/50], Loss: 0.6758, Val Loss: 0.7669
Epoch [5/50], Loss: 0.4654, Val Loss: 0.6051
Epoch [6/50], Loss: 0.6292, Val Loss: 0.7454
Epoch [7/50], Loss: 0.3814, Val Loss: 0.5191
Epoch [8/50], Loss: 0.3528, Val Loss: 0.4868
Epoch [9/50], Loss: 0.1886, Val Loss: 0.3965
Epoch [10/50], Loss: 0.2327, Val Loss: 0.4020
Epoch [11/50], Loss: 0.3654, Val Loss: 0.6281
Epoch [12/50], Loss: 0.1860, Val Loss: 0.3739
Epoch [13/50], Loss: 0.1595, Val Loss: 0.3655
Epoch [14/50], Loss: 0.1954, Val Loss: 0.4263
Epoch [15/50], Loss: 0.1732, Val Loss: 0.3662
Epoch [16/50], Loss: 0.2402, Val Loss: 0.4108
Epoch [17/50], Loss: 0.1807, Val Loss: 0.3690
Epoch [18/50], Loss: 0.1716, Val Loss: 0.3842
Epoch [19/50], Loss: 0.1825, Val Loss: 0.3671
Epoch [20/50], Loss: 0.1583, Val Loss: 0.3699
Epoch [21/50], Loss: 0.2033, Val Loss: 0.3774
Epoch [22/50], Loss: 0.3099, Val Loss: 0.44

### Evaluation and optimization

In [None]:
#Evaluating the models performance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import torch

def evaluate_model(model, X_val, y_val, model_type='pytorch'):
    if model_type == 'pytorch':
        model.eval()
        with torch.no_grad():
            predictions = model(X_val).numpy()
    else:  # For sklearn models
        with torch.no_grad():
            predictions = model.predict(X_val)

    mse = mean_squared_error(y_val, predictions)
    mae = mean_absolute_error(y_val, predictions)
    r2 = r2_score(y_val, predictions)


    return mse, mae, r2

# Convert validation data to tensor for PyTorch models
X_val_tensor = torch.tensor(X_val_combined.values, dtype=torch.float32)


In [None]:
# Evaluate the deep learning models
mse_effective, mae_effective, r2_effective = evaluate_model(model_effective, X_val_tensor, y_val_effective_tensor, model_type='pytorch')
print(f'Effectiveness Model - MSE: {mse_effective:.4f}, MAE: {mae_effective:.4f}, R-squared: {r2_effective:.4f}')

mse_ease_of_use, mae_ease_of_use, r2_ease_of_use = evaluate_model(model_ease_of_use, X_val_tensor, y_val_ease_tensor, model_type='pytorch')
print(f'Ease of Use Model - MSE: {mse_ease_of_use:.4f}, MAE: {mae_ease_of_use:.4f}, R-squared: {r2_ease_of_use:.4f}')


Effectiveness Model - MSE: 0.5017, MAE: 0.5451, R-squared: 0.5844
Ease of Use Model - MSE: 0.6153, MAE: 0.5445, R-squared: 0.4308


### Optimizing the features using the Grid search

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error

# Define the function to train and validate the model
def train_and_evaluate_model(params, X_train, y_train, X_val, y_val):
    input_size = X_train.shape[1]
    hidden_size1 = params['hidden_size1']
    hidden_size2 = params['hidden_size2']
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    # Define the neural network
    model = nn.Sequential(
        nn.Linear(input_size, hidden_size1),
        nn.ReLU(),
        nn.Linear(hidden_size1, hidden_size2),
        nn.ReLU(),
        nn.Linear(hidden_size2, 1)
    )

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Prepare the dataset
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Train the model
    model.train()
    for epoch in range(num_epochs):
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

    # Evaluate on the validation set
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

    return val_loss.item()


In [None]:
# Define the grid of hyperparameters
param_grid = {
    'hidden_size1': [32, 64, 128],
    'hidden_size2': [16, 32, 64],
    'learning_rate': [0.001, 0.01, 0.1],
    'num_epochs': [20, 50, 100]
}


In [None]:
# Convert datasets to tensors
X_train_tensor = torch.tensor(X_train_combined.values, dtype=torch.float32)
y_train_effective_tensor = torch.tensor(y_train_effective.values, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val_combined.values, dtype=torch.float32)
y_val_effective_tensor = torch.tensor(y_val_effective.values, dtype=torch.float32).unsqueeze(1)

In [None]:
# Perform grid search
best_params = None
best_mse = float('inf')

for params in ParameterGrid(param_grid):
    mse = train_and_evaluate_model(params, X_train_tensor, y_train_effective_tensor, X_val_tensor, y_val_effective_tensor)
    print(f"Params: {params}, MSE: {mse:.4f}")

    if mse < best_mse:
        best_mse = mse
        best_params = params

print(f"Best Params: {best_params}, Best MSE: {best_mse:.4f}")


Params: {'hidden_size1': 32, 'hidden_size2': 16, 'learning_rate': 0.001, 'num_epochs': 20}, MSE: 0.3843
Params: {'hidden_size1': 32, 'hidden_size2': 16, 'learning_rate': 0.001, 'num_epochs': 50}, MSE: 0.3705
Params: {'hidden_size1': 32, 'hidden_size2': 16, 'learning_rate': 0.001, 'num_epochs': 100}, MSE: 0.3572
Params: {'hidden_size1': 32, 'hidden_size2': 16, 'learning_rate': 0.01, 'num_epochs': 20}, MSE: 0.3665
Params: {'hidden_size1': 32, 'hidden_size2': 16, 'learning_rate': 0.01, 'num_epochs': 50}, MSE: 0.3467
Params: {'hidden_size1': 32, 'hidden_size2': 16, 'learning_rate': 0.01, 'num_epochs': 100}, MSE: 0.3421
Params: {'hidden_size1': 32, 'hidden_size2': 16, 'learning_rate': 0.1, 'num_epochs': 20}, MSE: 0.4032
Params: {'hidden_size1': 32, 'hidden_size2': 16, 'learning_rate': 0.1, 'num_epochs': 50}, MSE: 0.3713
Params: {'hidden_size1': 32, 'hidden_size2': 16, 'learning_rate': 0.1, 'num_epochs': 100}, MSE: 1.2074
Params: {'hidden_size1': 32, 'hidden_size2': 32, 'learning_rate': 0.00

Re-training model based on grid search results

In [None]:
# updating model parameters to match best parameters per the grid search
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Updated model architecture based on best parameters
input_size = X_train_combined.shape[1]
hidden_size1 = 128
hidden_size2 = 16

# Define the models directly
model_effective = nn.Sequential(
    nn.Linear(input_size, hidden_size1),
    nn.ReLU(),
    nn.Linear(hidden_size1, hidden_size2),
    nn.ReLU(),
    nn.Linear(hidden_size2, 1)
)

model_ease_of_use = nn.Sequential(
    nn.Linear(input_size, hidden_size1),
    nn.ReLU(),
    nn.Linear(hidden_size1, hidden_size2),
    nn.ReLU(),
    nn.Linear(hidden_size2, 1)
)

# Loss function and optimizer
criterion = nn.MSELoss()
learning_rate = 0.001
optimizer_effective = optim.Adam(model_effective.parameters(), lr=learning_rate)
optimizer_ease_of_use = optim.Adam(model_ease_of_use.parameters(), lr=learning_rate)

# Convert training and validation data to tensors
X_train_tensor = torch.tensor(X_train_combined.values, dtype=torch.float32)
y_train_effective_tensor = torch.tensor(y_train_effective.values, dtype=torch.float32).view(-1, 1)
y_train_ease_tensor = torch.tensor(y_train_ease.values, dtype=torch.float32).view(-1, 1)

X_val_tensor = torch.tensor(X_val_combined.values, dtype=torch.float32)
y_val_effective_tensor = torch.tensor(y_val_effective.values, dtype=torch.float32).view(-1, 1)
y_val_ease_tensor = torch.tensor(y_val_ease.values, dtype=torch.float32).view(-1, 1)

# Create DataLoader for training and validation sets
train_dataset = TensorDataset(X_train_tensor, y_train_effective_tensor, y_train_ease_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_effective_tensor, y_val_ease_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Number of epochs
num_epochs = 100

def train_model(model, optimizer, train_loader, val_loader, criterion, target_index):
    model.train()
    for epoch in range(num_epochs):
        for X_batch, y_batch_effective, y_batch_ease in train_loader:
            y_batch = y_batch_effective if target_index == 0 else y_batch_ease
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch_effective, y_batch_ease in val_loader:
                y_batch = y_batch_effective if target_index == 0 else y_batch_ease
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss/len(val_loader):.4f}')
        model.train()

# Train the models
print("Training model for effectiveness...")
train_model(model_effective, optimizer_effective, train_loader, val_loader, criterion, target_index=0)

print("Training model for ease of use...")
train_model(model_ease_of_use, optimizer_ease_of_use, train_loader, val_loader, criterion, target_index=1)

Training model for effectiveness...
Epoch [1/100], Loss: 1.0210, Val Loss: 1.0991
Epoch [2/100], Loss: 1.2969, Val Loss: 1.2265
Epoch [3/100], Loss: 0.8431, Val Loss: 0.9185
Epoch [4/100], Loss: 0.7607, Val Loss: 0.8404
Epoch [5/100], Loss: 0.7008, Val Loss: 0.8159
Epoch [6/100], Loss: 0.4992, Val Loss: 0.6382
Epoch [7/100], Loss: 0.3340, Val Loss: 0.5297
Epoch [8/100], Loss: 0.3204, Val Loss: 0.4736
Epoch [9/100], Loss: 0.2539, Val Loss: 0.4257
Epoch [10/100], Loss: 0.2116, Val Loss: 0.3967
Epoch [11/100], Loss: 0.1708, Val Loss: 0.3786
Epoch [12/100], Loss: 0.4143, Val Loss: 0.5561
Epoch [13/100], Loss: 0.1821, Val Loss: 0.3969
Epoch [14/100], Loss: 0.2200, Val Loss: 0.3950
Epoch [15/100], Loss: 0.1814, Val Loss: 0.3738
Epoch [16/100], Loss: 0.1610, Val Loss: 0.3678
Epoch [17/100], Loss: 0.1610, Val Loss: 0.3712
Epoch [18/100], Loss: 0.2799, Val Loss: 0.4326
Epoch [19/100], Loss: 0.1680, Val Loss: 0.3660
Epoch [20/100], Loss: 0.3208, Val Loss: 0.4611
Epoch [21/100], Loss: 0.1538, Val

In [None]:
# Evaluate the deep learning models
mse_effective, mae_effective, r2_effective = evaluate_model(model_effective, X_val_tensor, y_val_effective_tensor, model_type='pytorch')
print(f'Effectiveness Model - MSE: {mse_effective:.4f}, MAE: {mae_effective:.4f}, R-squared: {r2_effective:.4f}')

mse_ease_of_use, mae_ease_of_use, r2_ease_of_use = evaluate_model(model_ease_of_use, X_val_tensor, y_val_ease_tensor, model_type='pytorch')
print(f'Ease of Use Model - MSE: {mse_ease_of_use:.4f}, MAE: {mae_ease_of_use:.4f}, R-squared: {r2_ease_of_use:.4f}')


Effectiveness Model - MSE: 0.3853, MAE: 0.4560, R-squared: 0.6808
Ease of Use Model - MSE: 0.6137, MAE: 0.5444, R-squared: 0.4322


### Saving the models

In [None]:
import joblib

# Define the paths to save the models
model_effective_path = '/content/drive/MyDrive/Shared Folder/model_effective.joblib'
model_ease_of_use_path = '/content/drive/MyDrive/Shared Folder/model_ease_of_use.joblib'

# Save the models
joblib.dump(model_effective, model_effective_path)
joblib.dump(model_ease_of_use, model_ease_of_use_path)

print(f'Model for effectiveness saved to {model_effective_path}')
print(f'Model for ease of use saved to {model_ease_of_use_path}')

Model for effectiveness saved to /content/drive/MyDrive/Shared Folder/model_effective.joblib
Model for ease of use saved to /content/drive/MyDrive/Shared Folder/model_ease_of_use.joblib


### Training Ensemble models(XGBoost and Random Forest Regressor)

In [None]:
#Importing relevant libraries
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Initializing the models
#Using a Random Forest Regressor
rf_model_effective = RandomForestRegressor(random_state=42)
rf_model_ease_of_use = RandomForestRegressor(random_state=42)

#Using an XGBoost Regressor
xgb_model_effective = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model_ease_of_use = XGBRegressor(objective='reg:squarederror', random_state=42)


In [None]:
# Define parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define parameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

In [None]:
# Perform GridSearchCV for Random Forest
rf_grid_search_effective = GridSearchCV(estimator=rf_model_effective, param_grid=rf_param_grid,
                                        cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
X_train_combined.columns = X_train_combined.columns.astype(str)
rf_grid_search_effective.fit(X_train_combined, y_train_effective)
best_rf_model_effective = rf_grid_search_effective.best_estimator_

rf_grid_search_ease_of_use = GridSearchCV(estimator=rf_model_ease_of_use, param_grid=rf_param_grid,
                                          cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
rf_grid_search_ease_of_use.fit(X_train_combined, y_train_ease)
best_rf_model_ease_of_use = rf_grid_search_ease_of_use.best_estimator_

In [None]:
# Perform GridSearchCV for XGBoost
xgb_grid_search_effective = GridSearchCV(estimator=xgb_model_effective, param_grid=xgb_param_grid,
                                         cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
xgb_grid_search_effective.fit(X_train_combined, y_train_effective)
best_xgb_model_effective = xgb_grid_search_effective.best_estimator_

xgb_grid_search_ease_of_use = GridSearchCV(estimator=xgb_model_ease_of_use, param_grid=xgb_param_grid,
                                           cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
xgb_grid_search_ease_of_use.fit(X_train_combined, y_train_ease)
best_xgb_model_ease_of_use = xgb_grid_search_ease_of_use.best_estimator_

### Evaluating performance of the ensemble models

In [None]:
#Evaluating the performance of the ensemble models
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import torch

def evaluate_model(model, X_val, y_val, model_type='pytorch'):
    if model_type == 'pytorch':
        model.eval()
        with torch.no_grad():
            predictions = model(X_val).numpy()
    else:  # For sklearn models
        predictions = model.predict(X_val)

    mse = mean_squared_error(y_val, predictions)
    mae = mean_absolute_error(y_val, predictions)
    r2 = r2_score(y_val, predictions)

    return mse, mae, r2

X_val_combined.columns = X_val_combined.columns.astype(str)
# Evaluate Random Forest model for effectiveness
rf_mse_effective, rf_mae_effective, rf_r2_effective = evaluate_model(
    best_rf_model_effective, X_val_combined, y_val_effective, model_type='sklearn'
)

# Evaluate Random Forest model for ease of use
rf_mse_ease, rf_mae_ease, rf_r2_ease = evaluate_model(
    best_rf_model_ease_of_use, X_val_combined, y_val_ease, model_type='sklearn'
)

# Print results
print("Random Forest - Effectiveness:")
print(f"MSE: {rf_mse_effective:.4f}")
print(f"MAE: {rf_mae_effective:.4f}")
print(f"R2 Score: {rf_r2_effective:.4f}")
print()

print("Random Forest - Ease of Use:")
print(f"MSE: {rf_mse_ease:.4f}")
print(f"MAE: {rf_mae_ease:.4f}")
print(f"R2 Score: {rf_r2_ease:.4f}")

Random Forest - Effectiveness:
MSE: 0.3191
MAE: 0.3702
R2 Score: 0.7356

Random Forest - Ease of Use:
MSE: 0.5466
MAE: 0.5132
R2 Score: 0.4943


### Saving the models

In [None]:
#Saving the random forest model
import joblib

# Define the paths to save the Random Forest models
rf_model_effective_path = '/content/drive/MyDrive/Shared Folder/rf_model_effective.joblib'
rf_model_ease_of_use_path = '/content/drive/MyDrive/Shared Folder/rf_model_ease_of_use.joblib'

# Save the Random Forest models
joblib.dump(best_rf_model_effective, rf_model_effective_path)
joblib.dump(best_rf_model_ease_of_use, rf_model_ease_of_use_path)

print(f'Random Forest model for effectiveness saved to {rf_model_effective_path}')
print(f'Random Forest model for ease of use saved to {rf_model_ease_of_use_path}')

Random Forest model for effectiveness saved to /content/drive/MyDrive/Shared Folder/rf_model_effective.joblib
Random Forest model for ease of use saved to /content/drive/MyDrive/Shared Folder/rf_model_ease_of_use.joblib


### Training another ensemble regressor with one dependent feature(concat of ease of use and effectiveness)

In [None]:
#Importing relevant libraries
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
import numpy as np

In [None]:
# Combine the target variables
y = np.column_stack((y_effective, y_ease_of_use))

In [None]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Combine BERT features with other features
X_train_combined = pd.concat([X_train.drop(columns='Information').reset_index(drop=True),
                              pd.DataFrame(X_train_bert)], axis=1)
X_val_combined = pd.concat([X_val.drop(columns='Information').reset_index(drop=True),
                            pd.DataFrame(X_val_bert)], axis=1)

In [None]:
# Initialize the models
xgb_model = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror', random_state=42))

In [None]:
# Define parameter grid for XGBoost
xgb_param_grid = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [3, 6, 9],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__subsample': [0.8, 1.0]
}

In [None]:
# Perform GridSearchCV for XGBoost
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid,
                               cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
xgb_grid_search.fit(X_train_combined, y_train)
best_xgb_model = xgb_grid_search.best_estimator_

In [None]:
# Make predictions on validation set
xgb_predictions = best_xgb_model.predict(X_val_combined)

In [None]:
# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
    r2 = r2_score(y_true, y_pred, multioutput='raw_values')

    print(f"{model_name} Performance:")
    print(f"RMSE: Effectiveness = {rmse[0]:.4f}, Ease of Use = {rmse[1]:.4f}")
    print(f"MAE: Effectiveness = {mae[0]:.4f}, Ease of Use = {mae[1]:.4f}")
    print(f"R2 Score: Effectiveness = {r2[0]:.4f}, Ease of Use = {r2[1]:.4f}")
    print()

In [None]:
evaluate_model(y_val, xgb_predictions, "XGBoost")

XGBoost Performance:
RMSE: Effectiveness = 0.5816, Ease of Use = 0.7555
MAE: Effectiveness = 0.4170, Ease of Use = 0.5549
R2 Score: Effectiveness = 0.7197, Ease of Use = 0.4719



### Using the random forest regressor to predict the effectiveness of a drug, since it had the best evaluation score