In [28]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)
import os
import random
import numpy as np
import pandas as pd
import xgboost as xgb
from FRUFS import FRUFS
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.cluster import KMeans
from scipy.stats import mode
from sklearn.neighbors import NearestNeighbors
SEED = 721991
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)    
    tf.random.set_seed(seed)

In [29]:
# Reading the data
df_train = pd.read_csv('./train_features.csv')
df_train_targets_scored = pd.read_csv('./train_targets_scored.csv')
df_train_targets_nonscored = pd.read_csv('./train_targets_nonscored.csv')
# Identifying target features
target_features_scored = list(df_train_targets_scored.columns[1:])
target_features_nonscored = list(df_train_targets_nonscored.columns[1:])
# Type conversion for optimization
df_train_targets_scored[target_features_scored] = df_train_targets_scored[target_features_scored].astype(np.uint8)
df_train_targets_nonscored[target_features_nonscored] = df_train_targets_nonscored[target_features_nonscored].astype(np.uint8)
df_train = df_train.merge(df_train_targets_scored, on='sig_id', how='left')
# Dropping 'sig_id' column
df_train = df_train.drop(columns=['sig_id'])
# One-hot encoding for 'cp_type', 'cp_time', 'cp_dose'
one_hot_encoder = OneHotEncoder(sparse=False)
cp_time_encoder = LabelEncoder()
cp_dose_encoder = LabelEncoder()
encoded_columns = one_hot_encoder.fit_transform(df_train[['cp_type']])
column_names = one_hot_encoder.get_feature_names_out(['cp_type'])
df_train['cp_time'] = cp_time_encoder.fit_transform(df_train['cp_time'])
df_train['cp_dose'] = cp_dose_encoder.fit_transform(df_train['cp_dose'])
# Create a DataFrame with the encoded columns
encoded_df = pd.DataFrame(encoded_columns, columns=column_names)
# Drop original columns and concat the new encoded columns
df_train = df_train.drop(['cp_type'], axis=1)
df_train = pd.concat([df_train, encoded_df], axis=1)
print(f'Training Set Shape = {df_train.shape}')
print(f'Training Set Memory Usage = {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')

Training Set Shape = (23814, 1082)
Training Set Memory Usage = 163.74 MB




In [30]:
# Identify the one-hot encoded features for cp_type, cp_time, and cp_dose
one_hot_features = [feature for feature in df_train.columns if feature.startswith('cp_type_')]
g_features = [feature for feature in df_train.columns if feature.startswith('g-')]
c_features = [feature for feature in df_train.columns if feature.startswith('c-')]
other_features = [feature for feature in df_train.columns if feature not in g_features and 
                                                             feature not in c_features and 
                                                             feature not in target_features_scored and
                                                             feature not in target_features_nonscored]

# Combine all feature lists to create the X dataset
feature_columns = g_features + c_features + one_hot_features + ['cp_time', 'cp_dose']
X = df_train[feature_columns]

# For Y, use the columns that are not in the features list
Y = df_train.drop(feature_columns, axis=1)

print(f'Number of g- Features: {len(g_features)}')
print(f'Number of c- Features: {len(c_features)}')
print(f'Number of One Hot Features: {len(one_hot_features)} ({one_hot_features})')

Number of g- Features: 772
Number of c- Features: 100
Number of One Hot Features: 2 (['cp_type_ctl_vehicle', 'cp_type_trt_cp'])


In [31]:
from skmultilearn.model_selection import iterative_train_test_split

# Store the column names before the split
X_columns = X.columns
Y_columns = Y.columns

# Perform the split with numpy arrays
X_np, Y_np, X_test_orig_np, y_test_orig_np = iterative_train_test_split(X.values, Y.values, test_size=0.2)

# Convert numpy arrays back to pandas DataFrames
X = pd.DataFrame(X_np, columns=X_columns)
Y = pd.DataFrame(Y_np, columns=Y_columns)

X_test_orig = pd.DataFrame(X_test_orig_np, columns=X_columns)
y_test_orig = pd.DataFrame(y_test_orig_np, columns=Y_columns)

In [32]:
def get_tail_label(df: pd.DataFrame, ql=[0.05, 1.]) -> list:
    """
    Find the underrepresented targets.
    Underrepresented targets are those which are observed less than the median occurance.
    Targets beyond a quantile limit are filtered.
    """
    irlbl = df.sum(axis=0)
    irlbl = irlbl[(irlbl > irlbl.quantile(ql[0])) & ((irlbl < irlbl.quantile(ql[1])))]  # Filtering
    irlbl = irlbl.max() / irlbl
    threshold_irlbl = irlbl.median()
    tail_label = irlbl[irlbl > threshold_irlbl].index.tolist()
    return tail_label

def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=[0.05, 1.]):
    """
    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    """
    tail_labels = get_tail_label(y, ql=ql)
    index = y[y[tail_labels].apply(lambda x: (x == 1).any(), axis=1)].index.tolist()
    
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub

def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
    """
    Give index of 10 nearest neighbor of all the instance
    
    args
    X: np.array, array whose nearest neighbor has to find
    
    return
    indices: list of list, index of 5 NN of each element in X
    """
    nbs = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree').fit(X)
    euclidean, indices = nbs.kneighbors(X)
    return indices

def MLSMOTE(X, y, n_sample, neigh=5, categorical_features=[]):
    """
    Give the augmented data using MLSMOTE algorithm
    
    args
    X: pandas.DataFrame, input vector DataFrame
    y: pandas.DataFrame, feature vector dataframe
    n_sample: int, number of newly generated sample
    categorical_features: list, list of categorical feature names
    
    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data
    """
    indices2 = nearest_neighbour(X, neigh=5)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0, n-1)
        neighbor = random.choice(indices2[reference, 1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val > 0 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbor,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)

        # Handle categorical features
        for feature in categorical_features:
            mode_value = X.loc[all_point, feature].mode()
            if mode_value.empty:  # If mode cannot be computed (all values are different)
                mode_value = [X.loc[reference, feature]]  # Use the value of the reference sample
            new_X[i, X.columns.get_loc(feature)] = mode_value[0]

    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target

# X_sub, y_sub = get_minority_samples(X, Y)  # Getting minority samples of that datframe
# X_res, y_res = MLSMOTE(X_sub, y_sub, 2000, 5)  # Applying MLSMOTE to augment the dataframe

In [43]:
print(Y['acat_inhibitor'].value_counts())
print(y_res['acat_inhibitor'].value_counts())

acat_inhibitor
0.0    72699
1.0     1425
Name: count, dtype: int64
acat_inhibitor
0.0    9753
1.0     247
Name: count, dtype: int64


In [34]:
# Apply MLSMOTE on continuous features
X_sub, y_sub = get_minority_samples(X, Y)  # Getting minority samples of that dataframe
X_res, y_res = MLSMOTE(X_sub, y_sub, 20000, 5, categorical_features=['cp_type_ctl_vehicle','cp_type_trt_cp','cp_time','cp_dose'])  # Applying MLSMOTE to augment the dataframe

# Iterate over each target in y_res
for target in y_res.columns:
    # Get the indices of the rows where the target is 1
    target_indices = y_res[y_res[target] == 1].index
    
    # Get the corresponding rows from X_res
    X_res_target = X_res.loc[target_indices]
    
    # Get the corresponding rows from y_res
    y_res_target = y_res.loc[target_indices]
    
    # Append these rows to X and Y
    X = pd.concat([X, X_res_target], ignore_index=True)
    Y = pd.concat([Y, y_res_target], ignore_index=True)

In [35]:
model = MultiOutputClassifier(xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')) # F1 Score = 0.2747
model.fit(X, Y)

In [36]:
y_pred = model.predict(X_test_orig)

In [44]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss

score = f1_score(y_test_orig, y_pred, average='micro')
print('F1 Score:', score)

# Compute accuracy
accuracy = accuracy_score(y_test_orig, y_pred)
print('Accuracy:', accuracy)

# Compute precision
precision = precision_score(y_test_orig, y_pred, average='micro')
print('Precision:', precision)

# Compute recall
recall = recall_score(y_test_orig, y_pred, average='micro')
print('Recall:', recall)


F1 Score: 0.3162055335968379
Accuracy: 0.47386101196724756
Precision: 0.9302325581395349
Recall: 0.19047619047619047
Entropy Loss: 3.4742554578043108


In [45]:
cross_entropy_losses = []
for i in range(y_test_orig.shape[1]):
    # Check if y_test.iloc[:, i] contains only one class
    if len(np.unique(y_test_orig.iloc[:, i])) == 1:
        # If predictions are the same as the constant label, the loss is 0
        if np.unique(y_test_orig.iloc[:, i])[0] == y_pred[:, i].all():
            cross_entropy_losses.append(0)
        else:
            # If predictions do not match the constant label, the loss is maximal
            cross_entropy_losses.append(np.log(2))
    else:
        # If there are both classes present, compute log loss as usual
        cross_entropy_losses.append(log_loss(y_test_orig.iloc[:, i], y_pred[:, i], labels=[0, 1]))

average_loss = np.mean(cross_entropy_losses)

In [46]:
average_loss

0.10168270444412379