# Mechanisms of Action Project Submission

Team Members: Abhi Gandhi, Jeff Won, Shashank Saurav, Shengebo Zhang, Faraz Khoshbakhtian

This document outlines the steps taken by our group to approach the Mechanisms of Action competition. This document presents are original approach whereas another document presents the challanger approach. 

## Importing the needed libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
from operator import add

### This part is to run pyspark locally
import findspark  # Get rid of this in DataBricks
# findspark.init('/opt/spark-3.0.1')  # Get rid of this in DataBricks #faraz: you can remove the parameter. it only worked like this for me
findspark.init()
########################################

import pyspark
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F 
from pyspark.sql.functions import explode, col, udf, mean as _mean, stddev as _stddev, log, log10, sqrt
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.functions import vector_to_array


from tqdm import tqdm

from pyspark.ml.regression import DecisionTreeRegressor
from scv import StratifiedCrossValidator

from pyspark.ml.classification import DecisionTreeClassifier
import os

In [2]:
sc = pyspark.SparkContext(appName="Pi")
spark = SparkSession(sc)


## Seeting up spark, contexts and configurations

In [2]:
config = SparkConf().setAll([('spark.executor.memory', '50g'), ('spark.executor.cores', '8'), ('spark.cores.max', '8'), ('spark.driver.memory','8g')])
config.setAppName("proj")
config.set("spark.dynamicAllocation.minExecutors", "1");
config.set("spark.dynamicAllocation.maxExecutors", "8");
config.set("spark.dynamicAllocation.initialExecutors", "2"); # the number must be between the min and max
sc = SparkContext(conf=config)  # start a new sc with the current config
spark = SparkSession(sc)
sqlc=SQLContext(sc)
print(sc.getConf().getAll())  # print all the configuration

[('spark.dynamicAllocation.minExecutors', '1'), ('spark.driver.port', '41451'), ('spark.dynamicAllocation.initialExecutors', '2'), ('spark.executor.id', 'driver'), ('spark.driver.host', '192.168.176.164'), ('spark.app.id', 'local-1607963848311'), ('spark.rdd.compress', 'True'), ('spark.driver.memory', '8g'), ('spark.serializer.objectStreamReset', '100'), ('spark.cores.max', '8'), ('spark.master', 'local[*]'), ('spark.submit.pyFiles', ''), ('spark.submit.deployMode', 'client'), ('spark.dynamicAllocation.maxExecutors', '8'), ('spark.executor.cores', '8'), ('spark.app.name', 'proj'), ('spark.ui.showConsoleProgress', 'true'), ('spark.executor.memory', '50g')]


## Reading and Preprocessing the Train Data

### Reading and Joining dataframes

First, we need to read the features and labels for the training data, get rid of controll cases and join all the training data into a singular Spark dataframe.

In [3]:
# This is for being able to store the data in github and concatenation on local computer
!cat train_features_*.csv > train_feats.csv

In [3]:
# Train Features Dataset
df_train = spark.read.csv('train_feats.csv', header='true', inferSchema= 'true')   # path in HDFS file system

In [4]:
# Labels Dataset
df_label = spark.read.csv('train_targets_scored.csv', header='true', inferSchema= 'true')

In [5]:
# join all data together
df = df_train.join(df_label, on=['sig_id'], how='left_outer')  

In [6]:
# Dropping all the control cases since the test data also comes with control/case flag
df = df.filter(df.cp_type == 'trt_cp')
df = df.drop('cp_type')

### One-Hot Encoding for Categorical features

Second, we need to **One-Hot Encode** our categorical features. 

In [7]:
# String indexer for cp_dose
indexer = StringIndexer(inputCol="cp_dose", outputCol="cp_dose_cat")
# index cp_dose in data
df1 = indexer.fit(df).transform(df)

# String indexer for cp_time
indexer = StringIndexer(inputCol="cp_time", outputCol="cp_time_cat")
# index cp_time in data
df1 = indexer.fit(df1).transform(df1)
df1 = df1.drop('cp_dose')
df1 = df1.drop('cp_time')

# One-hot enocder 
encoder = OneHotEncoder(inputCols=["cp_time_cat", "cp_dose_cat"],
                        outputCols=["cp_time_onehot", "cp_dose_onehot"])

model = encoder.fit(df1)
df1 = model.transform(df1)

# Keeping n-1 dummy variables for each feature. (dummy variables have degree of fredom n-1)
df1 = df1.withColumn("cp_time_cols", vector_to_array("cp_time_onehot")).select(df1.columns + [col("cp_time_cols")[i] for i in range(2)])
df1 = df1.withColumn("cp_dose_cols", vector_to_array("cp_dose_onehot")).select(df1.columns + [col("cp_dose_cols")[i] for i in range(1)])

# drop leftover cols
df1 = df1.drop('cp_dose_cat',
 'cp_time_cat',
 'cp_time_onehot',
 'cp_dose_onehot',
)


### Feature Engineering  

Third, we create some features that describes the statistics of each row. We look at the row-wise min, max, mean, standard deviation for gene and cell information. We also use KMeans clustering to classify the data into 3 cluster classes based on Euclidean distance and use the cluster results as an additional engineered feature.

In [8]:
gene_feature_names = [name for name in df1.columns if 'g-' in name]
cell_feature_names =  [name for name in df1.columns if 'c-' in name]

df2 = df1.withColumn("gene_max", F.greatest(*gene_feature_names))
df2 = df2.withColumn("gene_min", F.least(*gene_feature_names))
df2 = df2.withColumn("cell_max", F.greatest(*cell_feature_names))
df2 = df2.withColumn("cell_min", F.least(*cell_feature_names))


df2 = df2.withColumn("gene_mean", reduce(lambda x,y: x+y, (col(x) for x in gene_feature_names)) / len(gene_feature_names))
df2 = df2.withColumn("cell_mean", reduce(lambda x,y: x+y, (col(x) for x in cell_feature_names)) / len(cell_feature_names))

'''
gene_std = sqrt(
    reduce(add, ((col(x) - col("gene_mean")) ** 2 for x in gene_feature_names)) / (len(gene_feature_names) - 1)
)

cell_std = sqrt(
    reduce(add, ((col(x) - col("cell_mean")) ** 2 for x in cell_feature_names)) / (len(cell_feature_names) - 1)
)


df2 = df2.withColumn("gene_std", gene_std)
df2 = df2.withColumn("cell_std", cell_std)
# df2 = df2.withColumn("gene_std", sqrt(reduce(lambda x,y: x-col('gene_mean') + y-col('gene_mean'), (col(x) for x in gene_feature_names))**2 / len(gene_feature_names)))
# df2 = df2.withColumn("cell_std", sqrt(reduce(lambda x,y: x-col('cell_mean') + y-col('cell_mean'), (col(x) for x in cell_feature_names))**2 / len(cell_feature_names)))


df2 = df2.withColumn("gene_sum", reduce(lambda x,y: x+y, (col(x) for x in gene_feature_names)))
df2 = df2.withColumn("cell_sum", reduce(lambda x,y: x+y, (col(x) for x in cell_feature_names)))


'''

'''
KMeans clustering - engineer new feature based on cluster results
Use K=3 to fit the assembled features

from pyspark.ml.clustering import KMeans

feature_label_assembler = VectorAssembler(inputCols=df1.columns[1:], outputCol="assemebled")
cluster_df = feature_label_assembler.transform(df1)

kmeans = KMeans(k=3, featuresCol='assemebled', 
                predictionCol='clusterClassPrediction', distanceMeasure='euclidean',)

model = kmeans.fit(cluster_df)
transformed = model.transform(cluster_df).select("sig_id", "clusterClassPrediction")

df2 = df2.join(transformed, on=['sig_id'], how='inner')
df2.select("clusterClassPrediction").show()
'''

'\nKMeans clustering - engineer new feature based on cluster results\nUse K=3 to fit the assembled features\n\nfrom pyspark.ml.clustering import KMeans\n\nfeature_label_assembler = VectorAssembler(inputCols=df1.columns[1:], outputCol="assemebled")\ncluster_df = feature_label_assembler.transform(df1)\n\nkmeans = KMeans(k=3, featuresCol=\'assemebled\', \n                predictionCol=\'clusterClassPrediction\', distanceMeasure=\'euclidean\',)\n\nmodel = kmeans.fit(cluster_df)\ntransformed = model.transform(cluster_df).select("sig_id", "clusterClassPrediction")\n\ndf2 = df2.join(transformed, on=[\'sig_id\'], how=\'inner\')\ndf2.select("clusterClassPrediction").show()\n'

### General Purpose Feature Selection Based on Correlation

In our initial step for feature selection we use correlation meassure to reduce the number of features.

## Machine Learning Pipeline 

The task for Mechanisms of Action is formally considred a Multi-label classification problem. We are employing the Binary-Relevance approach for the problem at hand. For each of the possible binary labels we first use a **Random Forrest CLassifier** to achive another level of feature selection based on the feature importance meassure. Subsequently, we train one **logistic regression** model and a **naive bayes** model for each label.

We have use upsampling the minority class and downsampling the majority class to overcome issues of imbalanced data. 

In [9]:
list_of_label_names = df_label.columns
list_of_feature_names = list(set(df2.columns) - set(df_label.columns))


In [None]:
list_of_feature_names

In [None]:
list_of_label_names

In [10]:
df2.cache()
df2.show()

+------------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--

In [11]:
# This function is responsible for running our ml pipeline

#df is the train dataset, label_name is the label we want to do the training on
def train_individual_label(df, label_name, seed = 43, models=['dt', 'lr']):

    try:
    # we need at least 2 positive instances for this particular label_name
        if df.filter(df[label_name] == 1).count() >= 2:

            # dataframe consisting only of the train features and the label name
            temp_df = df.select(*list_of_feature_names, label_name)

            # stratify split of the dataframe for train-test split
            fractions = {1: 0.8, 0: 0.8}
            train_df = temp_df.stat.sampleBy(label_name, fractions, )
            test_df =  temp_df.subtract(train_df)
            
            high_corr_names = get_high_corr_features(train_df, gene_feature_names, cell_feature_names )
            train_df, test_df  =  train_test_vectorize(train_df, test_df, high_corr_names,  label_name)

    

            # another layer of feature selection using random forrest before training the ml models
            clf = RandomForestClassifier(numTrees=20, maxDepth=5, featuresCol='features',  seed=42)
            model = clf.fit(train_df)
            feature_importance = model.featureImportances.toArray()

            # indeces of only the top 5% of features
            important_feature_idx = feature_importance.argsort()[-int(0.05 * len(feature_importance)):]  



            # Now, after getting the index, filter the feature vector based on the above feature importance index
            slicer = VectorSlicer(inputCol="features", outputCol="sub_features", indices=important_feature_idx)
            final_train_df_sub_feats =  slicer.transform(train_df).drop('features')
            final_test_df_sub_feats = slicer.transform(test_df).drop('features')

            final_train_df_sub_feats = final_train_df_sub_feats\
                                 .select(F.col('label'), F.col('sub_features').alias('features'))  # Just renaming these columns

            final_test_df_sub_feats = final_test_df_sub_feats\
                                 .select(F.col('label'), F.col('sub_features').alias('features'))  # Just renaming these columns




            # Use CV to train a logistic regression model
            if('lr' in models):
                lr = LogisticRegression(maxIter=10)
                lr_paramGrid = ParamGridBuilder() \
                                .addGrid(lr.regParam, [ 0.1, 0.01]) \
                                .addGrid(lr.elasticNetParam, [1,  0])\
                                .build()


                lr_evaluator = MulticlassClassificationEvaluator(metricName='logLoss')
#                 print("starting training")
                lr_crossval = StratifiedCrossValidator(estimator=lr,
                                      estimatorParamMaps=lr_paramGrid,
                                      evaluator=lr_evaluator,
                                      numFolds=3,
                                      parallelism=4)  

                lr_cvModel = lr_crossval.fit(final_train_df_sub_feats)
#                 print("finish training")

                lr_prediction_df = lr_cvModel.transform(final_test_df_sub_feats)

                lr_cvModel.save(f"./logistics_new/{label_name}.model")
                lr_prediction_df.write.save(f"./logistics_new/{label_name}_prediction_df.parquet", format="parquet")
                final_train_df_sub_feats.write.save(f"./logistics_new/{label_name}_train_df.parquet", format="parquet")

                lr_log_loss = lr_evaluator.evaluate(lr_prediction_df)

                with open(f"./logistics_new/log.log", 'a') as f:
                    f.write('logistics, '+ str(label_name) + ', ' + str(lr_log_loss) + '\n') 

            ##########

            # Use CV to train a naive-bayes classifer
            if('dt' in models):
                dt = DecisionTreeClassifier(featuresCol='features')





                dt_paramGrid = ParamGridBuilder() \
                                                .addGrid(dt.maxDepth, [5, 10])\
                                                .addGrid(dt.maxBins, [16, 32])\
                                                .build()

                dt_evaluator = MulticlassClassificationEvaluator(metricName='logLoss')
                dt_crossval = StratifiedCrossValidator(estimator=dt,
                                      estimatorParamMaps=dt_paramGrid,
                                      evaluator=dt_evaluator,
                                      numFolds=3,
                                      parallelism=4)  

                dt_cvModel = dt_crossval.fit(final_train_df_sub_feats)
                dt_prediction_df = dt_cvModel.transform(final_test_df_sub_feats)

                dt_cvModel.save(f"./decision-tree/{label_name}.model")
                dt_prediction_df.write.save(f"./decision-tree/{label_name}_prediction_df.parquet", format="parquet")
                final_train_df_sub_feats.write.save(f"./decision-tree/{label_name}_train_df.parquet", format="parquet")

                dt_log_loss = dt_evaluator.evaluate(dt_prediction_df)

                with open(f"./decision-tree/log.log", 'a') as f:
                    f.write('decision-tree, '+ str(label_name) + ', ' + str(dt_log_loss) + '\n') 



#             return True
        else:
            if('lr' in models): 
                with open(f"./logistics_new/log.log", 'a') as f:
                    f.write('logistics, '+ str(label_name) + ', ' + 'False' + '\n') 
            if('dt' in models): 
                with open(f"./decision-tree/log.log", 'a') as f:
                    f.write('decision-tree, '+ str(label_name) + ', ' + 'False' + '\n') 


#             return False

    except Exception as e:
        if('lr' in models): 
            with open(f"./logistics_new/log.log", 'a') as f:
                f.write('logistics, '+ str(label_name) + ', ' + 'errored' + '\n') 
            print(traceback.format_exc())
        if('dt' in models): 
            with open(f"./decision-tree/log.log", 'a') as f:
                f.write('decision-tree, '+ str(label_name) + ', ' + 'errored' + '\n') 

#         print("label name " + label_name + 'errored')

In [12]:
import traceback


Stratify 
ref: https://stackoverflow.com/questions/47637760/stratified-sampling-with-pyspark/47672336

# Training

In [17]:
names1 = ['histamine_receptor_agonist', 'histamine_receptor_antagonist',
        'histone_lysine_demethylase_inhibitor',
        'histone_lysine_methyltransferase_inhibitor', 'hiv_inhibitor',
        'hmgcr_inhibitor', 'hsp_inhibitor', 'igf-1_inhibitor', 'ikk_inhibitor',
        'imidazoline_receptor_agonist', 'immunosuppressant',
        'insulin_secretagogue', 'insulin_sensitizer', 'integrin_inhibitor',
        'jak_inhibitor', 'kit_inhibitor', 'laxative', 'leukotriene_inhibitor',
        'leukotriene_receptor_antagonist', 'lipase_inhibitor',
        'lipoxygenase_inhibitor', 'lxr_agonist', 'mdm_inhibitor',
        'mek_inhibitor', 'membrane_integrity_inhibitor',
        'mineralocorticoid_receptor_antagonist',
        'monoacylglycerol_lipase_inhibitor', 'monoamine_oxidase_inhibitor',
        'monopolar_spindle_1_kinase_inhibitor', 'mtor_inhibitor',
        'mucolytic_agent', 'neuropeptide_receptor_antagonist', 'nfkb_inhibitor',
        'nicotinic_receptor_agonist', 'nitric_oxide_donor',
        'nitric_oxide_production_inhibitor', 'nitric_oxide_synthase_inhibitor',
        'norepinephrine_reuptake_inhibitor', 'nrf2_activator',
        'opioid_receptor_agonist', 'opioid_receptor_antagonist',
        'orexin_receptor_antagonist', 'p38_mapk_inhibitor',
        'p-glycoprotein_inhibitor', 'parp_inhibitor', 'pdgfr_inhibitor',
        'pdk_inhibitor', 'phosphodiesterase_inhibitor',
        'phospholipase_inhibitor', 'pi3k_inhibitor', 'pkc_inhibitor']


names2 = ['potassium_channel_activator', 'potassium_channel_antagonist',
        'ppar_receptor_agonist', 'ppar_receptor_antagonist',
        'progesterone_receptor_agonist', 'progesterone_receptor_antagonist',
        'prostaglandin_inhibitor', 'prostanoid_receptor_antagonist',
        'proteasome_inhibitor', 'protein_kinase_inhibitor',
        'protein_phosphatase_inhibitor', 'protein_synthesis_inhibitor',
        'protein_tyrosine_kinase_inhibitor', 'radiopaque_medium',
        'raf_inhibitor', 'ras_gtpase_inhibitor', 'retinoid_receptor_agonist',
        'retinoid_receptor_antagonist', 'rho_associated_kinase_inhibitor',
        'ribonucleoside_reductase_inhibitor', 'rna_polymerase_inhibitor',
        'serotonin_receptor_agonist', 'serotonin_receptor_antagonist',
        'serotonin_reuptake_inhibitor', 'sigma_receptor_agonist',
        'sigma_receptor_antagonist', 'smoothened_receptor_antagonist',
        'sodium_channel_inhibitor', 'sphingosine_receptor_agonist',
        'src_inhibitor', 'steroid', 'syk_inhibitor', 'tachykinin_antagonist',
        'tgf-beta_receptor_inhibitor', 'thrombin_inhibitor',
        'thymidylate_synthase_inhibitor', 'tlr_agonist', 'tlr_antagonist',
        'tnf_inhibitor', 'topoisomerase_inhibitor',
        'transient_receptor_potential_channel_antagonist',
        'tropomyosin_receptor_kinase_inhibitor', 'trpv_agonist',
        'trpv_antagonist', 'tubulin_inhibitor', 'tyrosine_kinase_inhibitor',
        'ubiquitin_specific_protease_inhibitor', 'vegfr_inhibitor', 'vitamin_b',
        'vitamin_d_receptor_agonist', 'wnt_inhibitor']


temp_dict = {}
for name in tqdm(['laxative']):
    model = 'lr'
#     print(os.path.exists(f'./logistics/{name}.model'), name)
    if not os.path.exists(f'./logistics_new/{name}.model'):
        train_individual_label(df2,  name, models = [model] )


100%|██████████| 1/1 [23:16<00:00, 1396.82s/it]


In [18]:
test_df1.cache()

DataFrame[label: int, features: vector]

In [None]:
test_df1.count()

In [17]:
train_df1.cache()

DataFrame[label: int, features: vector]

In [24]:
test_df1.count()

4002

In [23]:
train_df1.take(1)

[Row(label=0, features=DenseVector([-0.1821, -0.5239, 0.694, -2.504, 0.7352, 0.4564, -1.526, -0.4542, -0.581, 0.198, -0.4803, 0.4647, -0.7395, 0.2965, 0.1475, 1.634, 1.324, 0.7007, -0.3423, -0.011, -0.5369, -0.8976, 0.207, 0.1789, -1.154, 0.9574, 1.267, 0.7017, 1.242, -0.4386, 0.1778, -0.2031, -0.0491, 1.082, 0.2416, -0.0452, -0.1076, -0.008, -0.1537, -0.1026, -4.041, -0.1411, 0.0236, 0.6145, -0.2457, -0.5659, -0.1634, 0.505, 2.072, -0.5155, 0.4572, 0.2318, -0.1816, 0.293, -0.8744, -1.85, -0.0072, -0.8311, 0.9586, 0.2706, -0.26, 0.4683, -0.9041, -1.968, 0.7657, 0.2554, -0.0202, -1.798, -1.559, -0.6972, 0.1019, 0.1143, 0.9209, 0.2421, 0.9369, 0.6799, -0.7682, -2.972, 0.0917, 0.0, -0.3313, -0.2598, -0.2066, 0.7621, -0.0539, 0.6061, -0.0324, 0.8088, -0.357, -0.0452, 3.394, 0.3008, -0.5688, 0.4357, -1.747, 0.7828, -0.2966, 0.2655, -0.0828, 0.7226, 1.446, 1.099, 0.659, 0.6781, -0.6593, 0.289, 0.0652, 0.4673, 0.4969, -0.2075, 0.4233, -1.143, -1.001, -0.2473, 0.1489, -0.1803, 0.6063, -0.6129,

In [13]:
def get_high_corr_features(df, gene_feature_names, cell_feature_names ):
    
    # Keeping track of the feature names
    feature_columns = gene_feature_names + cell_feature_names 

    # Creating the feature vector
    vectorAssembler = VectorAssembler(inputCols = feature_columns, outputCol = 'feats' )
    feature_vector = vectorAssembler.transform(df).select("feats")

    # Computing the correlations with pySpark
    corr_matrix = Correlation.corr(feature_vector, "feats").head()[0]

    # Convert the correlation desne matrix and apply mask and to get the indicies where high correlations are observed
    # In here, I convert the correlation matrix to numpy, and then use numpy's mask to obtain the lower traingle of the
    # matrix. I used numpy becasue pyspark does not have mask.

    # Detection highly correlated features
    corr_Array = corr_matrix.toArray()
    masked_corr = np.ma.masked_where(np.triu(np.ones_like(corr_Array, dtype=bool)), corr_Array, copy=True)  
    idx_high_corr_feats = set(np.argwhere(abs(masked_corr) > 0.60)[:,0])  # Set threshold to 90%
    # Identify the column to drop 
    features_to_drop = np.array(feature_columns)[list(idx_high_corr_feats)].tolist()


    return features_to_drop

In [14]:
def train_test_vectorize(train, test, drop_features,  label_name):
    train  = train.drop(*drop_features)
    test = test.drop(*drop_features)
    # Getting the final feature set
    final_feature_names = list(set(train.columns) - set((label_name, )))

    # Create Feature vector
    vectorAssembler = VectorAssembler(inputCols = final_feature_names, outputCol = 'features' )

    # drop all unnecessary features
    train = vectorAssembler.transform(train).drop(*final_feature_names)
    test = vectorAssembler.transform(test).drop(*final_feature_names)
    
    
    
    train = train.select(F.col(label_name).alias('label'), F.col('features'))  # Just renaming these columns
    test = test.select(F.col(label_name).alias('label'), F.col('features')) # Just renaming these columns
    
    return train, test

# SCRAP

In [2]:
from pyspark.ml.regression import DecisionTreeRegressor
obj = DecisionTreeRegressor()


obj.

SyntaxError: invalid syntax (<ipython-input-2-0a09355b97ea>, line 5)

In [23]:
DecisionTreeClassifier().

DecisionTreeClassifier_8acc7bf55f18

In [60]:
my_set = {'Geeks', 'for', 'geeks'} 
  
s = list(my_set)

TypeError: 'list' object is not callable

In [62]:
type(list)

list

In [None]:
cell_feature_names

In [32]:
df.select("cp_type").distinct().show()

+-------+
|cp_type|
+-------+
| trt_cp|
+-------+



In [None]:
fractions = df.select("x1").distinct().withColumn("fraction", lit(0.8)).rdd.collectAsMap()


In [6]:
from pyspark.sql.functions import lit


In [7]:
lit(0.8)

Column<b'0.8'>

In [8]:
from pyspark.sql.functions import lit
list = [(2147481832,23355149,1),(2147481832,973010692,1),(2147481832,2134870842,1),(2147481832,541023347,1),(2147481832,1682206630,1),(2147481832,1138211459,1),(2147481832,852202566,1),(2147481832,201375938,1),(2147481832,486538879,1),(2147481832,919187908,1),(214748183,919187908,1),(214748183,91187908,1)]
df = spark.createDataFrame(list, ["x1","x2","x3"])

In [9]:
df

DataFrame[x1: bigint, x2: bigint, x3: bigint]

In [10]:
fractions = df.select("x1").distinct().withColumn("fraction", lit(0.8)).rdd.collectAsMap()


In [12]:
fractions

{2147481832: 0.8, 214748183: 0.8}

In [15]:
df.select("x1").distinct().withColumn("fraction", lit(0.8)).show()

+----------+--------+
|        x1|fraction|
+----------+--------+
|2147481832|     0.8|
| 214748183|     0.8|
+----------+--------+



In [None]:
temp_df = features_and_targets.withColumn('target_vector', (vector_to_string(array([features_and_targets[col] for col in target_names])))).select(['sig_id', 'target_vector'])
string_indexer = StringIndexer(inputCol = 'target_vector', outputCol = 'target')
string_indexer_model = string_indexer.fit(temp_df)
temp_df = string_indexer_model.transform(temp_df).drop('target_vector')

data = features_and_targets.join(temp_df, features_and_targets.sig_id == temp_df.sig_id, how = 'inner').drop(temp_df.sig_id)