In [None]:
import glow
from glow import linear_regression_gwas, expand_struct
import numpy as np
from pyspark.ml.linalg import DenseMatrix
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as F

# Build (or reuse) the Spark session for this notebook, requesting the
# Glow 0.5.0 artifact for Scala 2.11 through spark.jars.packages.
# NOTE(review): a later cell reports pyspark 2.4.1, while
# "spark.sql.execution.arrow.pyspark.enabled" is the Spark 3.x name of this
# option (2.4 uses "spark.sql.execution.arrow.enabled") -- confirm whether
# Arrow is actually in effect here.
spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'io.projectglow:glow_2.11:0.5.0')
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# Register Glow with the session (presumably what makes the Glow functions
# used below available -- see the Glow docs).
glow.register(spark)

In [None]:
# Toy regression data: 4 samples, one genotype vector `g`, a 4x2 covariate
# matrix `x`, and phenotypes y = g + x.b + small Gaussian noise.
np.random.seed(0)  # fixed seed so the noise term is reproducible

g = np.array([0., 1., 2., 0.])
# Row k of the covariate matrix is [k, -k] for k = 1..4 (float64, C order).
x = np.array([[k, -k] for k in (1., 2., 3., 4.)])
b = np.array([0., 1.])
y = g + x @ b + np.random.normal(scale=.01, size=g.size)

In [26]:
# Record which PySpark version this notebook was executed with.
import pyspark
pyspark.__version__

'2.4.1'

In [24]:
# Demonstrate how F.lit() encodes a 2-D numpy array as a Spark DenseMatrix.
# The demo matrix gets its own name (`x_demo`) so that it does not overwrite
# the covariate matrix `x` that the regression cells depend on; rebinding `x`
# here would change their results when the notebook is run top to bottom.
import pandas as pd  # only needed for this cell's one-row DataFrame

x_demo = np.column_stack([np.arange(4), -np.arange(4, dtype=float)])
print(x_demo)

# Attach the array as a literal column named 'x' and pull the single row back.
r = spark.createDataFrame(pd.DataFrame({'i': [0]})).withColumn('x', F.lit(x_demo)).limit(1).collect()[0]
print(r)
# If the round trip preserved the layout this would print the same matrix as
# above; compare the two printed matrices to see whether it does.
print(r.x.toArray())

[[ 0. -0.]
 [ 1. -1.]
 [ 2. -2.]
 [ 3. -3.]]
Row(i=0, x=DenseMatrix(4, 2, [0.0, -0.0, 1.0, -1.0, 2.0, -2.0, 3.0, -3.0], False))
[[ 0.  2.]
 [-0. -2.]
 [ 1.  3.]
 [-1. -3.]]


In [20]:
# Show the 'x' matrix of the collected row `r` as a numpy array.
# NOTE(review): the saved output of this cell does not match the matrix
# printed by the cell that builds `r`, so it appears to come from an earlier
# definition of `r` -- re-run to refresh.
r.x.toArray()

array([[0., 4.],
       [1., 5.],
       [2., 6.],
       [3., 7.]])

In [5]:
# Experiment: convert the covariates to Fortran (column-major) memory order
# before handing them to F.lit().
# NOTE(review): the saved output has the same beta as the C-order DenseMatrix
# run ("Version 2") further down, which suggests the array's memory order does
# not change how F.lit() encodes the matrix -- confirm.
(
    spark.createDataFrame([Row(genotypes=g.tolist(), phenotypes=y.tolist())])
    .select(expand_struct(linear_regression_gwas('genotypes', 'phenotypes', F.lit(np.asfortranarray(x)))))
    .show()
)

+------------------+-------------------+--------------------+
|              beta|      standardError|              pValue|
+------------------+-------------------+--------------------+
|-2.382793056158103|0.20221757585071454|0.053898082087957454|
+------------------+-------------------+--------------------+



In [1]:
def show_gwas(covariates):
    """Run Glow's linear-regression GWAS on the toy data and show the result.

    The input is a one-row DataFrame built from the notebook-level `g`
    (genotypes) and `y` (phenotypes).

    Parameters
    ----------
    covariates : DenseMatrix or numpy.ndarray
        A ``DenseMatrix`` is stored in the row as a ``covariates`` column and
        referenced by name. Anything else is passed to the regression as a
        literal column via ``F.lit``.
    """
    fields = dict(genotypes=g.tolist(), phenotypes=y.tolist())
    if isinstance(covariates, DenseMatrix):
        fields['covariates'] = covariates
        covariate_col = 'covariates'
    else:
        covariate_col = F.lit(covariates)
    spark.createDataFrame([Row(**fields)])\
        .select(expand_struct(linear_regression_gwas('genotypes', 'phenotypes', covariate_col)))\
        .show()


HR = '-' * 50

print(HR)
print('Version 1')
# Correct version: DenseMatrix takes its values in column-major order, so the
# numpy array is flattened in Fortran order. The assertion checks the round trip.
dm = DenseMatrix(numRows=x.shape[0], numCols=x.shape[1], values=x.ravel(order='F').tolist())
np.testing.assert_equal(x, dm.toArray())
print(dm.toArray())
show_gwas(dm)

print(HR)
print('Version 2')
# Wrong: same explicit matrix column, but the values are flattened in C
# (row-major) order, so the printed matrix no longer equals x.
dm = DenseMatrix(numRows=x.shape[0], numCols=x.shape[1], values=x.ravel(order='C').tolist())
print(dm.toArray())
show_gwas(dm)

print(HR)
print('Version 3')
# Wrong: version like the demo notebook, passing the numpy array directly as a
# literal column.
show_gwas(x)

print(HR)
print('Version 4')
# Correct version using a numpy literal column: pre-shuffle x so that its
# row-major flattening equals the column-major flattening of the original.
x_weird = x.ravel(order='F').reshape(x.shape)
np.testing.assert_equal(x_weird.ravel(order='C'), x.ravel(order='F'))
print(x_weird)
show_gwas(x_weird)

--------------------------------------------------
Version 1
[[ 1. -1.]
 [ 2. -2.]
 [ 3. -3.]
 [ 4. -4.]]
+------------------+--------------------+--------------------+
|              beta|       standardError|              pValue|
+------------------+--------------------+--------------------+
|0.9948866298677059|0.001300287478677...|8.320427901051293E-4|
+------------------+--------------------+--------------------+

--------------------------------------------------
Version 2
[[ 1.  3.]
 [-1. -3.]
 [ 2.  4.]
 [-2. -4.]]
+------------------+-------------------+--------------------+
|              beta|      standardError|              pValue|
+------------------+-------------------+--------------------+
|-2.382793056158103|0.20221757585071454|0.053898082087957454|
+------------------+-------------------+--------------------+

--------------------------------------------------
Version 3
+------------------+-------------------+--------------------+
|              beta|      standardErro