#### Import Libraries

In [None]:
import findspark
# findspark.init() is called before pyspark is imported (it is expected to
# make the local Spark installation importable -- confirm for your setup).
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session with two worker threads.
spark = (
    SparkSession.builder
    .appName("l7")
    .master('local[2]')
    .getOrCreate()
)
# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

#### Import Data

In [None]:
# Read the RNA CSV: the first line is treated as the header and column types
# are inferred from the data.
spark_df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv("file:///root/code/datasets/PBMC_16k_RNA.csv")
)

In [None]:
# Read the label CSV with a header row and inferred column types.
label_reader = spark.read.options(inferSchema='true', header='true')
label_df = label_reader.csv("file:///root/code/datasets/PBMC_16k_RNA_label.csv")

#### Preprocess

**Column Name**

In [None]:
# Clean every column name: trim surrounding whitespace and delete all dots.
# (Presumably because dotted names are awkward to reference in Spark -- confirm.)
transformed_columns = [name.strip().replace('.', '') for name in spark_df.columns]

# Rebuild the frame with the cleaned names, keeping the column order.
spark_df = spark_df.toDF(*transformed_columns)

**Vector Assemble**

In [None]:
# Merge the features into one vector
from pyspark.ml.feature import VectorAssembler

# Every column except 'index' is used as an input feature.
feature_cols = spark_df.drop('index').columns
feature_assembler = VectorAssembler(outputCol='features', inputCols=feature_cols)

# Assemble the feature columns into one vector column, then keep only the
# row identifier and the assembled vector.
assembled_df = feature_assembler.transform(spark_df)
feature_df = assembled_df.select('index', 'features')
feature_df.show(5)

**Standardize**

In [None]:
from pyspark.ml.feature import StandardScaler

# No withMean / withStd arguments are passed, so the scaler runs with its
# default settings.
scaler = StandardScaler(outputCol='standardized_features', inputCol='features')

# Fit the scaling statistics on the full feature set, then apply them to it.
scaler_model = scaler.fit(feature_df)
scaled_df = scaler_model.transform(feature_df)
standardized_df = scaled_df.select('index', 'standardized_features')
standardized_df.show(5)


**PCA Upon Standardized Vector with More Principal Components**

In [None]:
# Number of columns in the renamed input frame (this count includes 'index').
len(spark_df.columns)

In [None]:
from pyspark.ml.feature import PCA

# Number of principal components to keep. The later cell that flattens
# pca_features into PC1..PC5 columns assumes this is 5.
n_principal_components = 5

pca = PCA(
    k=n_principal_components,
    inputCol='standardized_features',
    outputCol='pca_features',
)
# Fit on the standardized vectors and add the projected 'pca_features' column.
pca_model = pca.fit(standardized_df)
pca_df = pca_model.transform(standardized_df)

**Explained Variance**

In [None]:
# Running total of the fitted model's per-component explained variance.
pca_model.explainedVariance.cumsum()

In [None]:
# Attach the labels to the PCA output by matching rows on 'index'.
# An inner join (the default) keeps only rows present in both frames.
labeled_pca_df = pca_df.join(label_df, on='index', how='inner')

In [None]:
# Preview the first five rows of the joined PCA + label frame.
labeled_pca_df.show(5)

#### Non-A cells

In [None]:
# Row count per distinct CITEsort value, sorted by count in ascending order.
labeled_pca_df.groupBy('CITEsort').count().orderBy("count").show()

**Divide Dataframe**

In [None]:
# List the column names available after the join.
labeled_pca_df.columns

In [None]:
# Expose the labeled frame to Spark SQL under the view name "data".
labeled_pca_df.createOrReplaceTempView("data")

# Split the PCA vectors by label: rows labeled 'CD4+ T' versus all other labels.
# NOTE: rows whose CITEsort is NULL (if any) satisfy neither predicate and are
# therefore absent from both frames.
t_cell_query = "SELECT pca_features FROM data WHERE CITEsort == 'CD4+ T'"
non_t_cell_query = "SELECT pca_features FROM data WHERE CITEsort != 'CD4+ T'"

t_cell_df = spark.sql(t_cell_query)
non_t_cell_df = spark.sql(non_t_cell_query)
t_cell_df.show(5)

**Label Data: change the label**

In [None]:
from pyspark.mllib.regression import LabeledPoint

# Turn each single-column (pca_features) row into an mllib LabeledPoint.
# Label convention in this cell: 0 = 'CD4+ T' rows, 1 = every other row.
#
# x[0] is the row's pca_features vector. It is converted with toArray() so the
# LabeledPoint receives a flat array of component values. Passing [x[0]]
# instead would hand over a one-element list that *contains* the vector,
# i.e. a nested structure rather than a flat feature vector.
t_cell_rdd = t_cell_df.rdd.map(lambda x: LabeledPoint(0, x[0].toArray()))
non_t_cell_rdd = non_t_cell_df.rdd.map(lambda x: LabeledPoint(1, x[0].toArray()))

# Materialize a few points to sanity-check the label and feature layout.
t_cell_rdd.take(5)

**Union and Split Data with a Fixed Random Seed**

In [None]:
# Recombine both labeled RDDs, then split 70% / 30% into training and test
# sets. The seed is fixed so the split is repeatable.
split_weights = [0.7, 0.3]
data = t_cell_rdd.union(non_t_cell_rdd)
training_data, test_data = data.randomSplit(split_weights, seed=22)

**Preprocess**

In [None]:
# (Re-)register the labeled frame as the SQL view "data".
labeled_pca_df.createOrReplaceTempView("data")

# Keep the PCA vector plus an integer target y: 1 when CITEsort is 'CD4+ T',
# otherwise 0. Note this is the opposite polarity from the LabeledPoint RDDs
# built earlier in the notebook, where 'CD4+ T' rows get label 0.
binary_label_query = "SELECT pca_features, cast (CITEsort == 'CD4+ T' as int) as y FROM data"
cell_df = spark.sql(binary_label_query)

In [None]:
# Preview the first five rows of the (pca_features, y) frame.
cell_df.show(5)

In [None]:
from pyspark.sql import Row


def _flatten_pca_row(record):
    """Expand one (pca_features, y) record into scalar PC1..PC5 columns plus y."""
    components = record[0]
    target = record[1]
    return Row(
        PC1=float(components[0]),
        PC2=float(components[1]),
        PC3=float(components[2]),
        PC4=float(components[3]),
        PC5=float(components[4]),
        y=target,
    )


# Replace the vector column with five plain float columns. This reassigns
# cell_df from itself, so the cell is meant to be run once per kernel session.
cell_df = cell_df.rdd.map(_flatten_pca_row).toDF()

In [None]:
# Export the flattened frame as CSV with a header row. repartition(1) collapses
# the data into one partition before writing.
# mode="overwrite" replaces the output of a previous run; with the default save
# mode the write raises when the target path already exists, which breaks
# "Restart & Run All" on the second execution.
cell_df.repartition(1).write.csv(
    "file:///root/code/res",
    encoding="utf-8",
    header=True,
    mode="overwrite",
)