In [3]:
import os

import findspark

# Point findspark at the Spark installation. An explicit SPARK_HOME from the
# environment takes precedence so the notebook is not tied to one machine's
# layout; "/spark" remains the fallback, which keeps the previous behaviour
# wherever SPARK_HOME is unset or empty.
findspark.init(os.environ.get("SPARK_HOME") or "/spark")

# Imported only after findspark.init() has run, as in the original cell.
from pyspark.sql import SparkSession

# Build the session used by every later cell; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession
    .builder
    .master('local[4]')
    .appName('example')
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

## Create DataFrame from Vectors

In [4]:
from pyspark.ml.linalg import Vectors

In [5]:
# Build a DataFrame from ML vectors: a mix of sparse and dense vectors, all of
# length 4. Sparse vectors are given as (size, indices, values).
feature_vectors = [
    Vectors.sparse(4, [0, 3], [1.0, -2.0]),
    Vectors.dense([4.0, 5.0, 0.0, 3.0]),
    Vectors.dense([6.0, 7.0, 0.0, 8.0]),
    Vectors.sparse(4, [0, 3], [9.0, 1.0]),
]

# Each row is a 1-tuple so that the vector lands in a single column.
data = [(vector,) for vector in feature_vectors]

df = spark.createDataFrame(data, ["features"])

In [8]:
# Print the DataFrame as a text table to inspect the "features" column;
# sparse vectors are rendered as (size,[indices],[values]).
df.show()

+--------------------+
|            features|
+--------------------+
|(4,[0,3],[1.0,-2.0])|
|   [4.0,5.0,0.0,3.0]|
|   [6.0,7.0,0.0,8.0]|
| (4,[0,3],[9.0,1.0])|
+--------------------+



## Correlation

In [20]:
from pyspark.ml.stat import Correlation

# Pearson correlation matrix over the entries of the "features" vectors.
# The method is named explicitly; "pearson" is also what corr() applies when
# no method is passed.
r1 = Correlation.corr(df, "features", "pearson")

# The result is a single-row DataFrame holding the matrix; show() abbreviates
# the cell, so the full matrix is printed separately.
r1.show()


+--------------------+
|   pearson(features)|
+--------------------+
|1.0              ...|
+--------------------+



In [21]:
# Take the matrix out of the single result row and print it in full.
# Entries involving the third feature are nan because that feature is 0.0 in
# every input row, so it has no variance to correlate.
print(r1.head()[0])

DenseMatrix([[1.        , 0.05564149,        nan, 0.40047142],
             [0.05564149, 1.        ,        nan, 0.91359586],
             [       nan,        nan, 1.        ,        nan],
             [0.40047142, 0.91359586,        nan, 1.        ]])


In [22]:
# Spearman (rank) correlation over the same "features" column.
r2 = Correlation.corr(df, "features", method="spearman")

# Print the full matrix from the single result row; the all-zero third
# feature again produces nan entries.
print(r2.head()[0])

DenseMatrix([[1.        , 0.10540926,        nan, 0.4       ],
             [0.10540926, 1.        ,        nan, 0.9486833 ],
             [       nan,        nan, 1.        ,        nan],
             [0.4       , 0.9486833 ,        nan, 1.        ]])
