Main contributor: Jeff Won

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
from operator import add

### This part is to run pyspark locally
import findspark  # Get rid of this in DataBricks
# findspark.init('/opt/spark-3.0.1')  # Get rid of this in DataBricks #faraz: you can remove the parameter. it only worked like this for me
findspark.init()
########################################

import pyspark
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F 
from pyspark.sql.functions import explode, col, udf, mean as _mean, stddev as _stddev, log, log10, sqrt
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.functions import vector_to_array


from tqdm import tqdm

from pyspark.ml.regression import DecisionTreeRegressor
from scv import StratifiedCrossValidator

from pyspark.ml.classification import DecisionTreeClassifier
import os

In [None]:
# Databricks predefines `spark`, but a local run (findspark) does not, so the
# session is obtained explicitly. getOrCreate() hands back the already-active
# session when one exists, so this is safe in both environments.
spark = SparkSession.builder.getOrCreate()

# Load the feature table and the scored targets, then join them on sig_id.
df_train = spark.read.csv('train_feats.csv', header=True, inferSchema=True)   # path in HDFS file system
df_label = spark.read.csv('train_targets_scored.csv', header=True, inferSchema=True)
df = df_train.join(df_label, on=['sig_id'], how='left_outer')

# Keep only the 'trt_cp' rows; cp_type is constant afterwards, so drop it.
df = df.filter(df.cp_type == 'trt_cp').drop('cp_type')

# NOTE(review): gene_feature_names / cell_feature_names were used below without
# being defined anywhere earlier in the notebook. They are rebuilt here from
# the feature file's header, assuming gene columns are prefixed "g-" and cell
# columns "c-" -- TODO confirm against train_feats.csv.
gene_feature_names = [name for name in df_train.columns if name.startswith('g-')]
cell_feature_names = [name for name in df_train.columns if name.startswith('c-')]
for group_label, group_cols in (('gene', gene_feature_names), ('cell', cell_feature_names)):
    # F.greatest / F.least need at least two columns; fail early with a clear
    # message instead of an opaque Spark analysis error.
    if len(group_cols) < 2:
        raise ValueError(
            f"Expected at least two {group_label} feature columns in train_feats.csv, "
            f"found {len(group_cols)}"
        )

# Index the two categorical columns (cp_dose first, then cp_time) into
# <name>_cat columns, then drop the raw columns.
df1 = df
for source_col in ('cp_dose', 'cp_time'):
    indexer = StringIndexer(inputCol=source_col, outputCol=f"{source_col}_cat")
    df1 = indexer.fit(df1).transform(df1)
df1 = df1.drop('cp_dose', 'cp_time')

# One-hot encode both indexed columns.
encoder = OneHotEncoder(inputCols=["cp_time_cat", "cp_dose_cat"],
                        outputCols=["cp_time_onehot", "cp_dose_onehot"])
model = encoder.fit(df1)
df1 = model.transform(df1)

# Keep n-1 dummy variables per feature (dummy variables have n-1 degrees of
# freedom): expand each one-hot vector into scalar columns named
# "cp_time_cols[0]", "cp_time_cols[1]" and "cp_dose_cols[0]".
df1 = df1.withColumn("cp_time_cols", vector_to_array("cp_time_onehot")).select(
    df1.columns + [col("cp_time_cols")[i] for i in range(2)]
)
df1 = df1.withColumn("cp_dose_cols", vector_to_array("cp_dose_onehot")).select(
    df1.columns + [col("cp_dose_cols")[i] for i in range(1)]
)

# Drop the intermediate index / one-hot columns.
df1 = df1.drop('cp_dose_cat', 'cp_time_cat', 'cp_time_onehot', 'cp_dose_onehot')

# Row-wise max / min over the gene and cell feature groups.
df2 = df1.withColumn("gene_max", F.greatest(*gene_feature_names))
df2 = df2.withColumn("gene_min", F.least(*gene_feature_names))
df2 = df2.withColumn("cell_max", F.greatest(*cell_feature_names))
df2 = df2.withColumn("cell_min", F.least(*cell_feature_names))

# Row-wise mean: sum the group's columns and divide by the column count.
df2 = df2.withColumn(
    "gene_mean",
    reduce(add, (col(name) for name in gene_feature_names)) / len(gene_feature_names),
)
df2 = df2.withColumn(
    "cell_mean",
    reduce(add, (col(name) for name in cell_feature_names)) / len(cell_feature_names),
)



# K-Means

In [None]:

# KMeans clustering - engineer a new feature from the cluster assignment.
# K=3 is fitted on the assembled columns of df1, and the predicted cluster id
# is joined back onto df2 as "clusterClassPrediction".
from pyspark.ml.clustering import KMeans

# Assemble every df1 column except the first (presumably sig_id, the join key)
# into a single vector column.
# NOTE(review): df1 still carries the columns joined in from
# train_targets_scored.csv, so the clustering input contains those columns as
# well as the features. Confirm this is intended -- a cluster feature built
# this way could not be reproduced for rows that have no target columns.
feature_label_assembler = VectorAssembler(inputCols=df1.columns[1:], outputCol="assemebled")
cluster_df = feature_label_assembler.transform(df1)

# A fixed seed keeps the centroid initialisation, and therefore the cluster
# ids used as a feature, reproducible from one run to the next.
kmeans = KMeans(
    k=3,
    featuresCol='assemebled',
    predictionCol='clusterClassPrediction',
    distanceMeasure='euclidean',
    seed=42,
)

model = kmeans.fit(cluster_df)
transformed = model.transform(cluster_df).select("sig_id", "clusterClassPrediction")

# Attach the cluster id to the engineered feature table and preview it.
df2 = df2.join(transformed, on=['sig_id'], how='inner')
df2.select("clusterClassPrediction").show()
