In [1]:
import pandas as pd
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession 
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA

# Obtain the SparkSession, the entry point into all Spark functionality.
# The builder is configured one step at a time; getOrCreate() hands back an
# already-running session if there is one instead of starting a second.
session_builder = SparkSession.builder
session_builder = session_builder.master('local[4]')  # local mode, 4 worker threads
session_builder = session_builder.appName('PCA')
session_builder = session_builder.config(conf=SparkConf())
spark = session_builder.getOrCreate()

In [3]:
# Toy long-format table: one row per (hostid, itemname) observation.
rows = [
    (1, 'A', 10),
    (1, 'B', 3),
    (2, 'A', 9),
    (2, 'C', 40),
]

# `data` is the pandas frame; `df` is the Spark DataFrame built from it.
data = pd.DataFrame(rows, columns=['hostid', 'itemname', 'itemvalue'])
df = spark.createDataFrame(data)
df.show()

+------+--------+---------+
|hostid|itemname|itemvalue|
+------+--------+---------+
|     1|       A|       10|
|     1|       B|        3|
|     2|       A|        9|
|     2|       C|       40|
+------+--------+---------+



[Stack Overflow: pivoting rows into columns in MySQL](https://stackoverflow.com/questions/1241178/mysql-rows-to-columns/9668036#9668036)

In [11]:
# Pivot, step 1: expose `df` to Spark SQL as the temp view `history`, then
# spread `itemvalue` across one column per item name.
# Each CASE has no ELSE branch, so a row fills only the column that matches its
# itemname and leaves the other two NULL. The row count is unchanged here.
# String literals use single quotes (standard SQL); double-quoted tokens are
# parsed as identifiers when ANSI double-quoted identifiers are enabled.
df.createOrReplaceTempView("history")
sql_df1 = spark.sql(
    """
    SELECT
        hostid,
        CASE WHEN itemname = 'A' THEN itemvalue END AS A,
        CASE WHEN itemname = 'B' THEN itemvalue END AS B,
        CASE WHEN itemname = 'C' THEN itemvalue END AS C
    FROM
        history
    """
)
sql_df1.show()

+------+----+----+----+
|hostid|   A|   B|   C|
+------+----+----+----+
|     1|  10|null|null|
|     1|null|   3|null|
|     2|   9|null|null|
|     2|null|null|  40|
+------+----+----+----+



In [21]:
# Pivot, step 2: register `sql_df1` as the temp view `history_extended` and
# collapse it to one row per hostid.
# MIN skips NULLs, so it picks up the single populated value per item column.
# When a host has no row at all for an item, every value in the group is NULL
# and MIN itself returns NULL; COALESCE maps that to 0 so the result does not
# depend on which earlier cell last assigned `sql_df1`.
sql_df1.createOrReplaceTempView('history_extended')
sql_df2 = spark.sql(
    """
    SELECT
        hostid,
        COALESCE(MIN(A), 0) AS A,
        COALESCE(MIN(B), 0) AS B,
        COALESCE(MIN(C), 0) AS C
    FROM
        history_extended
    GROUP BY
        hostid
    """
)
sql_df2.show()

+------+---+---+---+
|hostid|  A|  B|  C|
+------+---+---+---+
|     1| 10|  3|  0|
|     2|  9|  0| 40|
+------+---+---+---+



In [20]:
# Alternative pivot in a single query against the `history` temp view:
# for each hostid, sum itemvalue where itemname matches the target column and
# contribute 0 otherwise, so items a host lacks come out as 0.
# NOTE: this rebinds `sql_df1`, replacing whatever frame the name held before.
pivot_query = """
    SELECT
        hostid, 
        SUM( IF(itemname = 'A', itemvalue, 0) ) AS A,  
        SUM( IF(itemname = 'B', itemvalue, 0) ) AS B, 
        SUM( IF(itemname = 'C', itemvalue, 0) ) AS C 
    FROM 
        history
    GROUP BY
        hostid
    """
sql_df1 = spark.sql(pivot_query)
sql_df1.show()

+------+---+---+---+
|hostid|  A|  B|  C|
+------+---+---+---+
|     1| 10|  3|  0|
|     2|  9|  0| 40|
+------+---+---+---+

