In [1]:
# Importing Relevant Libraries

import math
import numpy as np
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.ml.feature import *
from pyspark.sql.types import *
from pyspark.ml.linalg import *
from pyspark.sql.functions import *
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession

In [2]:
# Loading the training features

# Where the file lives and which reader format to use
file_location = "./lish-moa/train_features.csv"
file_type = "csv"

# Reader settings (these only apply to CSV input)
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Format and options are passed to a single load() call instead of a chain of .option() calls.
train_feats_df = spark.read.load(
    file_location,
    format=file_type,
    inferSchema=infer_schema,
    header=first_row_is_header,
    sep=delimiter,
)

In [3]:
# Loading the scored training targets

# Where the file lives and which reader format to use
file_location = "./lish-moa/train_targets_scored.csv"
file_type = "csv"

# Reader settings (these only apply to CSV input)
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Format and options are passed to a single load() call instead of a chain of .option() calls.
train_targets_df = spark.read.load(
    file_location,
    format=file_type,
    inferSchema=infer_schema,
    header=first_row_is_header,
    sep=delimiter,
)

In [4]:
# Keep only the rows whose cp_type column equals 'trt_cp'
train_feats_df = train_feats_df.where(F.col('cp_type') == 'trt_cp')


In [5]:
# Rescaling cp_time to the range [0, 2] with a min-max scaler fitted on this frame.
# The smallest observed value maps to 0 and the largest to 2
# (24:0, 48:1, 72:2 when those are the only values present).

# The three transformers used below, declared up front
assembler = VectorAssembler(inputCols = ['cp_time'], outputCol = 'cp_time_vector')
mm_scaler = MinMaxScaler(inputCol = 'cp_time_vector', outputCol = 'cp_time_scaled_vector', min = 0, max = 2)
firstElement = udf(lambda v: float(v[0]), FloatType())  # pulls the single value back out of the vector

# MinMaxScaler works on vector columns, so wrap cp_time in a one-element vector first
train_feats_df = assembler.transform(train_feats_df).drop('cp_time')

# Fit the scaler on the data, apply it, and discard the unscaled vector
train_feats_df = (
    mm_scaler.fit(train_feats_df)
    .transform(train_feats_df)
    .drop('cp_time_vector')
)

# Unwrap the scaled vector into a plain float column named cp_time again
train_feats_df = (
    train_feats_df
    .withColumn('cp_time', firstElement(F.col('cp_time_scaled_vector')))
    .drop('cp_time_scaled_vector')
)

In [6]:
# One hot encoding cp_dose feature
#
# cp_dose is indexed, one-hot encoded (dropLast=False, so every category keeps its own slot),
# and the resulting vector is expanded into scalar columns D1, D2, ... in index order.
# The expansion happens inside Spark: the previous version collected the whole frame to the
# driver, rebuilt it through pandas and joined it back on sig_id, which is slow, memory hungry
# and only worked for exactly two categories.
string_indexer = StringIndexer(inputCol = 'cp_dose', outputCol = 'cp_dose_indexed')
dose_indexer_model = string_indexer.fit(train_feats_df)
train_feats_df = dose_indexer_model.transform(train_feats_df).drop('cp_dose')

one_hot_encoder = OneHotEncoder(inputCol = 'cp_dose_indexed', outputCol = 'cp_dose', dropLast = False)
train_feats_df = one_hot_encoder.fit(train_feats_df).transform(train_feats_df).drop('cp_dose_indexed')


def _vector_component(position):
    """Return a UDF that extracts element `position` of an ML vector as a double."""
    return F.udf(lambda v: float(v[position]), DoubleType())


# One output column per indexed category; with two doses this yields D1 and D2 as before.
for position in range(len(dose_indexer_model.labels)):
    train_feats_df = train_feats_df.withColumn(
        'D' + str(position + 1),
        _vector_component(position)(F.col('cp_dose')),
    )

train_feats_df = train_feats_df.drop('cp_dose')

In [7]:
# Column names of the targets frame and of the (already preprocessed) features frame
target_names = train_targets_df.columns
feature_names = train_feats_df.columns

# Gene columns are named g-0 ... g-771, cell columns c-0 ... c-99
gene_feature_names = [f'g-{idx}' for idx in range(772)]
cell_feature_names = [f'c-{idx}' for idx in range(100)]

In [8]:
# Logloss for the case when all MoAs are 0 for the label
def logloss(predictedProb, trueLabel):
    """Mean binary cross-entropy between predicted probabilities and 0/1 labels.

    Parameters
    ----------
    predictedProb : array-like of float
        Predicted probability of the positive class for every sample.
    trueLabel : array-like of {0, 1}
        Ground-truth labels, same length as `predictedProb`.

    Returns
    -------
    float
        Average of -y*log(p) - (1-y)*log(1-p) over all samples.

    Notes
    -----
    Probabilities are clipped to [eps, 1 - eps] before taking logs. The previous
    version only smoothed the positive term, so a prediction of exactly 1 produced
    log(0) = -inf and, for a true label of 1, 0 * -inf = NaN.
    """
    eps = 10e-9  # same smoothing constant as before (numerically 1e-8)
    prob = np.clip(np.asarray(predictedProb, dtype=float), eps, 1 - eps)
    truth = np.asarray(trueLabel, dtype=float)
    loss = (-truth * np.log(prob) - (1 - truth) * np.log(1 - prob)).mean()
    return loss

In [9]:
# Per-label evaluation: for every target column, build a stratified train/test split,
# over-sample the activated class, select features with a random forest, fit a logistic
# regression on the selected features and accumulate its test log loss.
test_size = 0.2
seed = 42  # one seed for the split, the over-sampling, the shuffle and the random forest
losses =[]
fraction = {0: test_size, 1: test_size}
loss = 0
# Joining features and targets
data = train_feats_df.join(train_targets_df, train_feats_df.sig_id == train_targets_df.sig_id).drop(train_targets_df.sig_id)
target_columns = train_targets_df.columns
n = len(target_columns) # total number of target columns; the loop below skips index 0 (presumably sig_id)

# Inputs fed to the models (identifier / cp_type columns are excluded)
model_features = gene_feature_names + cell_feature_names + ['cp_time', 'D1', 'D2']

# Looping over every label. A plain range is enough here; no Spark job is needed to count.
for i in range(1, n):
  label = target_columns[i]
  # Selecting the feature columns and the label
  label_df = data.select(feature_names + [label])

  # Splitting into train and test (stratified on the label)
  test_df = label_df.sampleBy(label, fractions = fraction, seed = seed)
  label_df.createOrReplaceTempView('label_df_table')
  test_df.createOrReplaceTempView('test_df_table')

  # Removing the samples that went to test_df from train_df
  train_df = spark.sql('select * from label_df_table as t1 where t1.sig_id not in (select t2.sig_id from test_df_table as t2)')
  train_df.createOrReplaceTempView('train_df_table')

  one_count_test = test_df.groupBy(label).count().count() # = 2 if has both 1 and 0
                                                          # = 1 if has only one class (expected: only 0's)

  one_count_train = train_df.filter(train_df[label] == 1).count() # Number of train samples with label = 1

  # If the test data has no activated sample but the train data has at least two,
  # move one activated sample from train to test.
  if one_count_test == 1 and one_count_train >= 2:
    # Ordering by sig_id makes limit(1) pick the same row every time this lazy plan is re-evaluated.
    temp_df = train_df.filter(train_df[label] == 1).orderBy('sig_id').limit(1)
    test_df = test_df.union(temp_df)
    test_df.createOrReplaceTempView('test_df_table')
    train_df = spark.sql('select * from train_df_table as t1 where t1.sig_id not in (select t2.sig_id from test_df_table as t2)')

  # Ensuring no data leakage
  train_df.createOrReplaceTempView('train_df_table')
  intersection = spark.sql('select t1.sig_id from train_df_table as t1 where t1.sig_id in (select t2.sig_id from test_df_table as t2)').count()
  assert intersection == 0

  ############### Dealing with class imbalance by over sampling the activated samples ##########################
  if one_count_train > 0:
    positives_df = train_df.filter(train_df[label] == 1)
    negatives_df = train_df.filter(train_df[label] == 0)
    n_negatives = negatives_df.count()
    assert positives_df.count() < n_negatives

    # Draw (with replacement) as many activated samples as there are non-activated ones
    temp_df = positives_df.toPandas()
    temp_df = spark.createDataFrame(temp_df.sample(n = n_negatives, replace = True, random_state = seed))
    train_df = negatives_df.union(temp_df)

    X_train = train_df.select(model_features + [label])
    X_train = X_train.orderBy(F.rand(seed)) # Shuffling X_train
    X_test = test_df.select(model_features + [label])

    # Assembling features
    va = VectorAssembler(inputCols = model_features, outputCol = 'features')
    X_train = va.transform(X_train).rdd.repartition(1000).toDF()

    # Training Random Forest for feature selection
    randomForest = RandomForestClassifier(labelCol = label, seed = seed)
    print('Training Random Forest Model')
    randomForestModel = randomForest.fit(X_train)
    print('Random Forest Model Trained')
    feature_importance = randomForestModel.featureImportances.toArray()
    # Features with importance != 0
    selected_features = [name for name, importance in zip(model_features, feature_importance) if importance != 0]
    if not selected_features:
      # Degenerate forest (no split used any feature): fall back to all features
      selected_features = model_features
    va = VectorAssembler(inputCols = selected_features, outputCol = 'features')

    # Feature Selection (the select drops the old 'features' column before re-assembling)
    X_train = va.transform(X_train.select(selected_features + [label]))
    X_test = va.transform(X_test.select(selected_features + [label]))

    # Training Logistic Regression Model
    logisticRegression = LogisticRegression(labelCol = label)
    print('Training Logistic Regression Model')
    logisticRegressionModel = logisticRegression.fit(X_train)
    print('Logistic Regression Trained')
    pred_df = logisticRegressionModel.transform(X_test)
    evaluator = MulticlassClassificationEvaluator(labelCol=label, metricName='logLoss')
    loss += evaluator.evaluate(pred_df)

    # These names only exist in this branch; deleting them after the else branch raised NameError.
    del X_train, X_test

  else: # No activated sample in train: always predict 0
    true_label = np.array(test_df.select(label).toPandas())[:,0]
    prob = np.zeros(len(true_label)) # separate name so the label count `n` is not overwritten
    loss += logloss(prob, true_label)

  # The loop starts at 1, so exactly i labels have been evaluated at this point.
  print(f'Loss after {i}: {label} = {loss/i}')
  del train_df, test_df

# Mean loss over all evaluated labels (nothing to record when there are no label columns)
if n > 1:
  losses.append(loss/(n - 1))

print('Losses', losses)
  

Training Random Forest Model


KeyboardInterrupt: 