In [7]:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import LongType
from pyspark.sql.functions import col, pandas_udf


In [None]:
spark = SparkSession.builder.appName("chaper-5").getOrCreate()

### Spark UDF

In [4]:
def cubed(s):
    return s ** 3

spark.udf.register("cubed", cubed, LongType())

spark.range(1, 9).createOrReplaceTempView("udf_test")

In [None]:
spark.sql(
    """
        SELECT   id
               , cubed(id) AS id_cubed
          FROM   udf_test
    """
).show()

In [None]:
spark.sql(
    """
        SELECT   s
          FROM   test1
         WHERE   s IS NOT NULL
           AND   strlen(s) > 1
    """
)

In [5]:
def cubed(a: pd.Series) -> pd.Series:
    return a * a * a

cubed_udf = pandas_udf(cubed, returnType=LongType())

In [None]:
df = spark.range(1, 9)
df.select("id", cubed_udf(col("id")).alias("cubed_id")).show()

### External Data Sources

* PostgreSQL
* MySQL
* Snowflake

In [None]:
# postgresql

# read1
jdbc_df1 = (
    spark
    .read
    .format("jdbc")
    .option("url", "jdbc:postgresql://[DBSERVER]")
    .option("dbtable", "[SCHEMA].[TABLENAME]")
    .option("user", "[USERNAME]")
    .option("password", "[PASSWORD]")
    .load()
)

# read2
jdbc_df2 = (
    spark
    .read
    .jdbc("jdbc:postgresql://[DBSERVER]", "[SCHEMA]:[TABLENAME]", properties={"user": "[USERNAME]", "password": ["PASSWORD"]})
)

# write1
(
    jdbc_df1
    .write
    .format("jdbc")
    .option("url", "jdbc:postgresql://[DBSERVER]")
    .option("dbtable", "[SCHEMA].[TABLENAME]")
    .option("user", "[USERNAME]")
    .option("password", "[PASSWORD]")
    .save()
)

# write2
(
    jdbc_df2
    .write
    .jdbc("jdbc:postgresql://[DBSERVER]", "[SCHEMA]:[TABLENAME]", properties={"user": "[USERNAME]", "password": ["PASSWORD"]})
)


In [None]:
# mysql

# read
jdbc_df = (
    spark
    .read
    .format("jdbc")
    .option("url", "jdbc:mysql://[DBSERVER]:3306/[DATABASE]")
    .option("driver", "com.mysql.jdbcDriver")
    .option("dbtable", "[TABLENAME]")
    .option("user", "[USERNAME]")
    .option("password", "[PASSWORD]")
    .load()
)

# write
(
    jdbc_df
    .write
    .format("jdbc")
    .option("url", "jdbc:mysql://[DBSERVER]:3306/[DATABASE]")
    .option("driver", "com.mysql.jdbcDriver")
    .option("dbtable", "[TABLENAME]")
    .option("user", "[USERNAME]")
    .option("password", "[PASSWORD]")
    .save()
)

In [None]:
# Reference
# https://docs.snowflake.com/ko/user-guide/spark-connector-use

sc = SparkContext("local", "chaper-5")
spark = SQLContext(sc)
spark_conf = SparkConf().setMaster("local").setAppName("chaper-5-snowflake-app")

sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "[AWS_KEY]")
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "[AWS_SECRET_KEY]")

# use password
sf_option = {
    "sfURL": "<account-identifier>.snowflakecomputing.com",
    "sfUser": "<user-name>",
    "sfPassword": "<password>",
    "sfDatabase": "<database>",
    "sfSchema": "<schema>",
    "sfWarehouse": "<warehouse>"
}

# use oauth token (recommand)
sf_option = {
    "sfURL": "<account-identifier>.snowflakecomputing.com",
    "sfUser": "<user-name>",
    "sfAuthenticator": "oauth",
    "sfToken": "<external-oauth-access-token>",
    "sfDatabase": "<database>",
    "sfSchema": "<schema>",
    "sfWarehouse": "<warehouse>"
}


SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

QUERY = ""

df = (
    spark
    .read
    .format(SNOWFLAKE_SOURCE_NAME)
    .option(**sf_option)
    .option("query", QUERY)
    .load()
)

df.show()

### Spark Build-In Functions

In [12]:
spark.sql(
    """
        SELECT array_distinct(array(1, 2, 3, null, 3))
    """
).show()

+---------------------------------------+
|array_distinct(array(1, 2, 3, NULL, 3))|
+---------------------------------------+
|                        [1, 2, 3, NULL]|
+---------------------------------------+



                                                                                

In [15]:
spark.sql(
    """
        SELECT array_intersect(array(1, 2, 3), array(1, 3, 5))
    """
).show()

+-----------------------------------------------+
|array_intersect(array(1, 2, 3), array(1, 3, 5))|
+-----------------------------------------------+
|                                         [1, 3]|
+-----------------------------------------------+



In [16]:
spark.sql(
    """
        SELECT array_union(array(1, 2, 3), array(1, 3, 5))
    """
).show()

+-------------------------------------------+
|array_union(array(1, 2, 3), array(1, 3, 5))|
+-------------------------------------------+
|                               [1, 2, 3, 5]|
+-------------------------------------------+



In [17]:
spark.sql(
    """
        SELECT array_except(array(1, 2, 3), array(1, 3, 5))
    """
).show()

+--------------------------------------------+
|array_except(array(1, 2, 3), array(1, 3, 5))|
+--------------------------------------------+
|                                         [2]|
+--------------------------------------------+



In [18]:
spark.sql(
    """
        SELECT array_join(array('hello', 'world'), ' ')
    """
).show()

+----------------------------------+
|array_join(array(hello, world),  )|
+----------------------------------+
|                       hello world|
+----------------------------------+



In [19]:
spark.sql(
    """
        SELECT array_max(array(1, 2, 3, null, 3))
    """
).show()

+----------------------------------+
|array_max(array(1, 2, 3, NULL, 3))|
+----------------------------------+
|                                 3|
+----------------------------------+



In [20]:
spark.sql(
    """
        SELECT array_min(array(1, 2, 3, null, 3))
    """
).show()

+----------------------------------+
|array_min(array(1, 2, 3, NULL, 3))|
+----------------------------------+
|                                 1|
+----------------------------------+



In [21]:
spark.sql(
    """
        SELECT array_position(array(3, 2, 1), 1)
    """
).show()

+---------------------------------+
|array_position(array(3, 2, 1), 1)|
+---------------------------------+
|                                3|
+---------------------------------+



In [23]:
spark.sql(
    """
        SELECT array_remove(array(1, 2, 3, null, 3), 3)
    """
).show()

+----------------------------------------+
|array_remove(array(1, 2, 3, NULL, 3), 3)|
+----------------------------------------+
|                            [1, 2, NULL]|
+----------------------------------------+



In [25]:
spark.sql(
    """
        SELECT arrays_overlap(array(1, 2, 3), array(3, 4, 5))
    """
).show()

+----------------------------------------------+
|arrays_overlap(array(1, 2, 3), array(3, 4, 5))|
+----------------------------------------------+
|                                          true|
+----------------------------------------------+



In [26]:
spark.sql(
    """
        SELECT array_sort(array('b', 'd', null, 'c', 'a'))
    """
).show()

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|array_sort(array(b, d, NULL, c, a), lambdafunction((IF(((namedlambdavariable() IS NULL) AND (namedlambdavariable() IS NULL)), 0, (IF((namedlambdavariable() IS NULL), 1, (IF((namedlambdavariable() IS NULL), -1, (IF((namedlambdavariable() < namedlambdavariable()), -1, (IF((namedlambdavariable() > namedlambdavariable()), 1, 0)))))))))), namedlambdavariable(), namedlambdavariable()))|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [27]:
spark.sql(
    """
        SELECT concat(array(1, 2, 3), array(3, 4, 5))
    """
).show()

+--------------------------------------+
|concat(array(1, 2, 3), array(3, 4, 5))|
+--------------------------------------+
|                    [1, 2, 3, 3, 4, 5]|
+--------------------------------------+



In [28]:
spark.sql(
    """
        SELECT flatten(array(array(1, 2, 3), array(3, 4, 5)))
    """
).show()

+----------------------------------------------+
|flatten(array(array(1, 2, 3), array(3, 4, 5)))|
+----------------------------------------------+
|                            [1, 2, 3, 3, 4, 5]|
+----------------------------------------------+



In [30]:
spark.sql(
    """
        SELECT array_repeat('123', 3)
    """
).show()

+--------------------+
|array_repeat(123, 3)|
+--------------------+
|     [123, 123, 123]|
+--------------------+



In [31]:
spark.sql(
    """
        SELECT reverse(array(2, 1, 4, 3))
    """
).show()

+--------------------------+
|reverse(array(2, 1, 4, 3))|
+--------------------------+
|              [3, 4, 1, 2]|
+--------------------------+



In [None]:
spark.sql(
    """
        SELECT sequence(1, 5);
    """
).show()


+---------------+
| sequence(1, 5)|
+---------------+
|[1, 2, 3, 4, 5]|
+---------------+



In [36]:
spark.sql(
    """
        SELECT sequence(5, 1);
    """
).show()

+---------------+
| sequence(5, 1)|
+---------------+
|[5, 4, 3, 2, 1]|
+---------------+



In [37]:
spark.sql(
    """
        SELECT sequence(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month);
    """
).show()

+----------------------------------------------------------------------+
|sequence(to_date(2018-01-01), to_date(2018-03-01), INTERVAL '1' MONTH)|
+----------------------------------------------------------------------+
|                                                  [2018-01-01, 2018...|
+----------------------------------------------------------------------+



In [40]:
spark.sql(
    """
        SELECT shuffle(array(1, 2, null, 3))
    """
).show()

+-----------------------------+
|shuffle(array(1, 2, NULL, 3))|
+-----------------------------+
|              [2, 1, 3, NULL]|
+-----------------------------+



In [41]:
spark.sql(
    """
        SELECT slice(array(1, 2, 3, 4), -2, 2)
    """
).show()

+-------------------------------+
|slice(array(1, 2, 3, 4), -2, 2)|
+-------------------------------+
|                         [3, 4]|
+-------------------------------+



In [43]:
spark.sql(
    """
        SELECT arrays_zip(array(1, 2, 3, 4), array(5, 6, 7, 8), array(9, 10, 11, 12))
    """
).show()

+----------------------------------------------------------------------+
|arrays_zip(array(1, 2, 3, 4), array(5, 6, 7, 8), array(9, 10, 11, 12))|
+----------------------------------------------------------------------+
|                                                  [{1, 5, 9}, {2, 6...|
+----------------------------------------------------------------------+



In [46]:
spark.sql(
    """
        SELECT element_at(array(3, 1, 2), 3)
    """
).show()

+-----------------------------+
|element_at(array(3, 1, 2), 3)|
+-----------------------------+
|                            2|
+-----------------------------+



In [47]:
spark.sql(
    """
        SELECT cardinality(array('b', 'd', 'c', 'a'))
    """
).show()

+------------------------------+
|cardinality(array(b, d, c, a))|
+------------------------------+
|                             4|
+------------------------------+



In [49]:
spark.sql(
    """
        SELECT map_from_arrays(array(1.0, 3.0), array('2', '4'))
    """
).show()

+---------------------------------------------+
|map_from_arrays(array(1.0, 3.0), array(2, 4))|
+---------------------------------------------+
|                         {1.0 -> 2, 3.0 -> 4}|
+---------------------------------------------+



In [None]:
spark.sql(
    """
        SELECT map_from_entries(array(struct(1, 'a'), struct(2, 'b')))
    """
).show()

+---------------------------------------------------+
|map_from_entries(array(struct(1, a), struct(2, b)))|
+---------------------------------------------------+
|                                   {1 -> a, 2 -> b}|
+---------------------------------------------------+



In [53]:
spark.sql(
    """
        SELECT map_concat(map(1, 'a', 2, 'b'), map(3, 'c', 4, 'd'))
    """
).show()

+--------------------------------------------+
|map_concat(map(1, a, 2, b), map(3, c, 4, d))|
+--------------------------------------------+
|                        {1 -> a, 2 -> b, ...|
+--------------------------------------------+



In [54]:
spark.sql(
    """
        SELECT element_at(map(1, 'a', 2, 'b'), 2)
    """
).show()

+------------------------------+
|element_at(map(1, a, 2, b), 2)|
+------------------------------+
|                             b|
+------------------------------+



In [55]:
spark.sql(
    """
        SELECT cardinality(map(1, 'a', 2, 'b'))
    """
).show()

+----------------------------+
|cardinality(map(1, a, 2, b))|
+----------------------------+
|                           2|
+----------------------------+



### Spark High-Order Functions