In [0]:
# Importing Relevant Libraries

import math
import numpy as np
import pandas as pd
from pyspark.sql import functions as F
from pyspark.ml.feature import *
from pyspark.sql.types import *
from pyspark.ml.linalg import *
from pyspark.sql.functions import *
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Reading files

# Source path and storage format of the training features
file_location = "/FileStore/tables/MIE1628_Project/Data/train_features.csv"
file_type = "csv"

# CSV parsing settings
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options are CSV-specific; for other file types they will be ignored.
train_feats_df = (
    spark.read.format(file_type)
    .options(
        inferSchema=infer_schema,
        header=first_row_is_header,
        sep=delimiter,
    )
    .load(file_location)
)

In [0]:
# Source path and storage format of the scored training targets
file_location = "/FileStore/tables/MIE1628_Project/Data/train_targets_scored.csv"
file_type = "csv"

# CSV parsing settings
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options are CSV-specific; for other file types they will be ignored.
train_targets_df = (
    spark.read.format(file_type)
    .options(
        inferSchema=infer_schema,
        header=first_row_is_header,
        sep=delimiter,
    )
    .load(file_location)
)

In [0]:
# Keep only the rows whose cp_type equals 'trt_cp'
is_trt_cp = train_feats_df["cp_type"] == "trt_cp"
train_feats_df = train_feats_df.where(is_trt_cp)


In [0]:
# Splitting data into train and test

# Fixed seed so the same rows land in the held-out set on every run
# (the previous unseeded sample() produced a different split each time).
SPLIT_SEED = 42

# randomSplit assigns every row to exactly one of the two outputs, so the
# train set no longer has to be rebuilt with a `NOT IN` subquery (which
# returns no rows at all if any sig_id in the subquery is NULL).
X_train_df, X_test_df = train_feats_df.randomSplit([0.8, 0.2], seed = SPLIT_SEED)

# Temp views are still registered under their original names for SQL access.
X_test_df.createOrReplaceTempView('X_test_table')
train_feats_df.createOrReplaceTempView('train_feats_df_table')

# Assuring No Data Leakage

X_train_df.createOrReplaceTempView('X_train_table')
# Count sig_ids present in both splits; any overlap means leakage.
intersection = spark.sql('select t1.sig_id from X_train_table as t1 where t1.sig_id in (select t2.sig_id from X_test_table as t2)').count()
assert intersection == 0, f'{intersection} sig_id values appear in both train and test'

In [0]:
# Attach the target columns to the test features, matching rows on sig_id
same_sig_id = X_test_df.sig_id == train_targets_df.sig_id
X_test_joined = X_test_df.join(train_targets_df, same_sig_id)
# Drop the targets-side sig_id so the key column appears only once
X_test_df = X_test_joined.drop(train_targets_df.sig_id)

In [0]:
# Every (cp_time, cp_dose) pairing, ordered D1 first then D2, each with 24/48/72
combinations = [(hours, dose) for dose in ('D1', 'D2') for hours in (24, 48, 72)]

target_names = train_targets_df.columns # All column names of the targets frame
feature_names = train_feats_df.columns # All column names of the features frame

# Gene and cell feature names
gene_feature_names = [f'g-{idx}' for idx in range(772)] # 'g-0' ... 'g-771'
cell_feature_names = [f'c-{idx}' for idx in range(100)] # 'c-0' ... 'c-99'

In [0]:
# loss for the case when there are no activated MoAs for a particular label

def logloss(predictedProb, trueLabel, eps=1e-8):
    """Mean binary cross-entropy between predicted probabilities and 0/1 labels.

    Args:
        predictedProb: array-like of predicted probabilities for the positive class.
        trueLabel: array-like of true labels (0 or 1), same length as predictedProb.
        eps: probabilities are clipped to [eps, 1 - eps] before taking logs.

    Returns:
        The mean log loss as a float (nan for empty input, as np.mean of an
        empty array is nan).
    """
    # Clip on BOTH sides: the earlier version only guarded log(p), so p == 1
    # made log(1 - p) = -inf and the result nan/inf.
    prob = np.clip(np.asarray(predictedProb, dtype=float), eps, 1 - eps)
    label = np.asarray(trueLabel, dtype=float)
    loss = (-label*np.log(prob) - (1-label)*np.log(1-prob)).mean()
    return loss

In [0]:
# Trains, for every (cp_time, cp_dose) combination and every target label:
#   1. a random forest on all gene + cell features, used only to pick the
#      features with non-zero importance;
#   2. a logistic regression on those features, fitted on a class-balanced
#      (positives oversampled) training set.
# Labels with no positive training sample get the placeholder classifier 0
# and are scored as a constant prediction of probability 0.

MODEL_SEED = 42 # fixed seed so forests, oversampling and shuffling are reproducible

all_feature_names = gene_feature_names + cell_feature_names
# The first targets column is skipped, as before (presumably sig_id - it is the join key).
label_names = train_targets_df.columns[1:]

# The full-feature assembler does not depend on the label, so build it once.
full_assembler = VectorAssembler(inputCols = all_feature_names, outputCol = 'features')

clf_dict = {} # combination -> (per-label classifiers, per-label selected features)
losses = [] # mean per-label log loss of each combination, in `combinations` order
for combination in combinations:
  cp_time, cp_dose = combination
  label_clfs_container = [] # classifier (or 0) for each target label
  label_feats_container = [] # selected feature names for each target label; same index as the classifiers

  # Training features having the given combination of cp_time and cp_dose
  features_df = X_train_df.filter(X_train_df.cp_time == cp_time).filter(X_train_df.cp_dose == cp_dose)

  # Training features and labels having the given combination of cp_time and cp_dose
  X_train_combn_df = features_df.join(train_targets_df, features_df.sig_id == train_targets_df.sig_id).drop(train_targets_df.sig_id)

  # Score each combination's models only on test rows of the same combination;
  # previously the whole test set was used, mixing in rows the models were
  # never meant to handle.
  X_test_combn_df = X_test_df.filter(X_test_df.cp_time == cp_time).filter(X_test_df.cp_dose == cp_dose)

  loss = 0
  for i, label in enumerate(label_names):
    # Count positives first: without any, neither model below is needed.
    positive_count = X_train_combn_df.filter(X_train_combn_df[label] == 1).count()

    if positive_count > 0:
      negative_count = X_train_combn_df.filter(X_train_combn_df[label] == 0).count()
      # Oversampling below assumes the positive class is the minority.
      assert positive_count < negative_count, f'{label}: label 1 count is not below label 0 count'

      # Running RF model on all features to get feature importance
      X_train_feat_imp_df = full_assembler.transform(X_train_combn_df.select(all_feature_names + [label]))
      randomForest = RandomForestClassifier(labelCol = label, seed = MODEL_SEED)
      print('Training Random Forest Model')
      randomForestModel = randomForest.fit(X_train_feat_imp_df)
      print('Random Forest Model Trained')
      feature_importance = randomForestModel.featureImportances.toArray()

      # Selecting features whose importance != 0
      selected_features = [name for name, importance in zip(all_feature_names, feature_importance) if importance != 0]

      X_train_label_df = X_train_combn_df.select(selected_features + [label]) # Feature selection on train set
      test_df = X_test_combn_df.select(selected_features + [label]) # Feature selection on test set

      # Oversample the positives (with replacement, on the driver) until they
      # match the number of negatives.
      positives_pdf = X_train_label_df.filter(X_train_label_df[label] == 1).toPandas()
      oversampled_pdf = positives_pdf.sample(n = negative_count, replace = True, random_state = MODEL_SEED)
      oversampled_df = spark.createDataFrame(oversampled_pdf)

      # Balanced train set: all negatives + oversampled positives, shuffled
      train_df = X_train_label_df.filter(X_train_label_df[label] == 0).union(oversampled_df).orderBy(F.rand(seed = MODEL_SEED))

      # Assembling the selected features
      selected_assembler = VectorAssembler(inputCols = selected_features, outputCol = 'features')
      train_df = selected_assembler.transform(train_df)
      test_df = selected_assembler.transform(test_df)

      logisticRegression = LogisticRegression(labelCol = label) # Logistic Regression Model
      print('Training Logistic Regression Model')
      logisticRegressionModel = logisticRegression.fit(train_df)
      print('Logistic Model Trained')

      # Exactly one entry per label in each container keeps them index-aligned.
      label_clfs_container.append(logisticRegressionModel)
      label_feats_container.append(selected_features)

      # Calculating logloss error on the test rows
      pred_df = logisticRegressionModel.transform(test_df)
      evaluator = MulticlassClassificationEvaluator(labelCol=label, metricName='logLoss')
      loss += evaluator.evaluate(pred_df)

    else: # No activation found for this label: store 0 as the classifier
      label_clfs_container.append(0)
      label_feats_container.append([])
      # Constant prediction of probability 0 for every test row
      true_label = np.array(X_test_combn_df.select(label).toPandas())[:,0]
      prob = np.zeros(len(true_label))
      loss += logloss(prob, true_label)
    print(f'Loss after {i}: {label} = {loss/(i+1)}')
  mean_loss = loss/len(label_names) # average over all labels of this combination
  print(f'Loss the combination {combination}: {mean_loss}')
  losses.append(mean_loss)
  clf_dict[combination] = (label_clfs_container, label_feats_container)

print('Losses', losses)
   
    
    
    