In [0]:
from pyspark.sql.functions import col, sum, when, mean, countDistinct
from pyspark.sql import functions as F
from pyspark.ml.feature import PCA
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.window import Window
import pyspark.pandas as ps

In [0]:
# Point file path
path = '/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv'

# Load Data
df = spark.read.csv(path, header=True, inferSchema= True)
df = df.filter( col('y') < 30)

# Get only our selected columns
cols = [ 'carat', 'table', 'depth']

df_num = df.select(F.log1p('carat').alias('carat'), 
                   F.log1p('table').alias('table'), 
                   F.log1p('depth').alias('depth'))

# df_num = df.select(F.log1p('x').alias('x'), 
#                    F.log1p('y').alias('y'), 
#                    F.log1p('z').alias('z'), 
#                    F.log1p('table').alias('table'), 
#                    F.log1p('depth').alias('depth'))

In [0]:
df_num.limit(3).display()

carat,table,depth
0.2070141693843261,4.02535169073515,4.135166556742356
0.1906203596086497,4.127134385045092,4.107589788972121
0.2070141693843261,4.189654742026425,4.05871738457895


### Vectorize the Dataset
The first step to be done when working with MLlib is to vectorize the dataset, as that is the format how MLlib algorithms require the data.

In [0]:
# Transform all features into a vector using VectorAssembler
# We removed price, as this would be our target variable
assembler = VectorAssembler(inputCols= cols, outputCol="features")
prepared_df = assembler.transform(df_num)

In [0]:
display(prepared_df.select('features').limit(2))

features
"Map(vectorType -> dense, length -> 3, values -> List(0.20701416938432615, 4.02535169073515, 4.135166556742356))"
"Map(vectorType -> dense, length -> 3, values -> List(0.1906203596086497, 4.127134385045092, 4.107589788972121))"


### Running PCA
Now that we have the vectorized version of our data, let's run PCA

In [0]:
from scipy import stats as scs
scs.bartlett( df_num.toPandas()['carat'], 
             df_num.toPandas()['depth'],
             df_num.toPandas()['table'],
            #  df_num.toPandas()['x'], 
            #  df_num.toPandas()['y'],
            #  df_num.toPandas()['z'] 
            )

BartlettResult(statistic=284679.93937348214, pvalue=0.0)

In [0]:
# First, we will run it with all the possible components
pca = PCA(k= len(df_num.columns), inputCol="features")
pca.setOutputCol("pca_features")

# Fit
model = pca.fit(prepared_df)

In [0]:
# See explained variance of the PCs
df_var = ps.DataFrame(model.explainedVariance, columns=['explained_var'])
df_var.insert(0, 'Component', value= ['PC'+str(n) for n in range(1,len(df_num.columns)+1)])
display(df_var)

Component,explained_var
PC1,0.969359536218893
PC2,0.0233908282360098
PC3,0.0072496355450972


Databricks visualization. Run in Databricks to view.

In [0]:
# Get transformed Output
model.setOutputCol("output")
pca_transformed = model.transform(prepared_df).select('output')

In [0]:
display(pca_transformed)

output
"Map(vectorType -> dense, length -> 3, values -> List(-0.337854542043682, -2.7279272511201453, 5.078403368525109))"
"Map(vectorType -> dense, length -> 3, values -> List(-0.32446613567339955, -2.833791365368773, 5.079981934273924))"
"Map(vectorType -> dense, length -> 3, values -> List(-0.3426183667407044, -2.906836771193466, 5.049966452562161))"
"Map(vectorType -> dense, length -> 3, values -> List(-0.3870652924383873, -2.772813774386362, 5.1059660041188275))"
"Map(vectorType -> dense, length -> 3, values -> List(-0.4024765682415171, -2.768508812545547, 5.1193580291946725))"
"Map(vectorType -> dense, length -> 3, values -> List(-0.3470528920960793, -2.7557749568923597, 5.107740557594817))"
"Map(vectorType -> dense, length -> 3, values -> List(-0.3470342664094098, -2.757934928841296, 5.100175016362643))"
"Map(vectorType -> dense, length -> 3, values -> List(-0.3619562129785228, -2.725494136953833, 5.084283911928863))"
"Map(vectorType -> dense, length -> 3, values -> List(-0.3328907410349031, -2.810613510014358, 5.160262668967488))"
"Map(vectorType -> dense, length -> 3, values -> List(-0.3408368476058274, -2.835139680786452, 5.073462128678454))"


In [0]:
temp = spark.createDataFrame(pca_transformed.collect())
temp = temp.select(col('output').cast('string'))

# split columns
df_transformed = (
    temp
    .select(
        F.split('output', ',')[0][2:15].alias('PC1').cast('float'),
        F.split('output', ',')[1].cast('float').alias('PC2'),
                            #F.split('output', ',')[2].cast('float').alias('PC3'),
                            #F.split('output', ',')[3].cast('float').alias('PC4'),
                            #F.split('output', ',')[4].cast('float').alias('PC5'),
        F.split('output', ',')[2][0:14].alias('PC3').cast('float')       ) 
        .fillna(0)
        )

In [0]:
display(temp.limit(2))

output
"[-0.337854542043682,-2.7279272511201453,5.078403368525109]"
"[-0.32446613567339955,-2.833791365368773,5.079981934273924]"


In [0]:
display(df_transformed.limit(3))

PC1,PC2,PC3
-0.33785453,-2.7279272,5.0784035
-0.32446614,-2.8337913,5.079982
-0.34261838,-2.9068367,5.0499663


In [0]:
# Explained variance array
expl_var = model.explainedVariance

df_transformed = (
        df_transformed
        .withColumn('_c0', F.row_number().over(Window.partitionBy().orderBy(F.lit(1))))
        .withColumn('score', (col('PC1') * expl_var[0]) + (col('PC2') * expl_var[1]) + (col('PC3') * expl_var[2]) ) #+ (col('PC4') * expl_var[3]) + (col('PC5') * expl_var[4]))
        .withColumn('rank', F.dense_rank().over(Window.partitionBy().orderBy('score')) )
        .sort('_c0')
        )

df_transformed.count()

53938

In [0]:
display(df
        .join(df_transformed.select('_c0', 'rank'),
              on='_c0',
              how='inner')
        .sort(col('rank'))
        )

_c0,carat,cut,color,clarity,depth,table,price,x,y,z,rank
27415,2.05,Ideal,G,SI2,61.6,56.0,18017,8.11,8.16,5.01,1
27630,2.22,Ideal,I,VS2,61.3,56.0,18531,8.45,8.36,5.15,2
27130,2.04,Ideal,J,IF,61.5,58.0,17327,8.11,8.14,5.0,3
25999,4.01,Premium,I,I1,61.0,61.0,15223,10.14,10.1,6.17,4
25998,1.01,Very Good,D,IF,63.4,59.0,15219,6.39,6.26,4.01,5
26444,2.0,Very Good,H,SI1,63.2,58.0,15984,7.95,7.93,5.02,6
26534,2.02,Very Good,I,SI1,59.8,59.0,16192,8.21,8.3,4.94,7
23645,3.65,Fair,H,I1,67.1,53.0,11668,9.53,9.48,6.38,8
27679,2.02,Ideal,G,VS2,62.0,57.0,18700,8.1,8.05,5.01,9
24328,1.5,Premium,E,VS2,59.4,62.0,12587,7.48,7.4,4.42,10
