In [1]:
import pyspark

from pyspark.sql import SparkSession

# Obtain the notebook-wide SparkSession: getOrCreate() hands back the session
# that is already active, or starts a new one if there is none.
spark = (
    SparkSession.builder
    .getOrCreate()
)


In [2]:
# Smoke test: run a one-row SQL query through the session and print the result.
df = spark.sql(
    "select 'spark' as hello "
)
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [3]:
# Build a two-column DataFrame from nested Python lists. Only column names are
# supplied, so Spark infers the column types from the values.
x = [
    [1, 10],
    [2, 14],
    [3, 17],
]
df = spark.createDataFrame(x, ["id", "value"])
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- value: long (nullable = true)

+---+-----+
| id|value|
+---+-----+
|  1|   10|
|  2|   14|
|  3|   17|
+---+-----+



In [4]:
# Import the schema types by name rather than with `import *`, so it is clear
# where each name used below comes from.
from pyspark.sql import Row
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Distribute two rows as an RDD; each row is a string id plus a list of ints.
rdd = spark.sparkContext.parallelize(
    [Row("abc", [1, 2]), Row("cd", [3, 4])]
)

# Explicit schema: a nullable string column and a nullable array-of-int column.
schema = StructType([
    StructField("id", StringType(), True),
    StructField("numbers", ArrayType(IntegerType(), True), True),
])

# Apply the schema to the RDD instead of relying on type inference.
df = spark.createDataFrame(rdd, schema)
df.show()

+---+-------+
| id|numbers|
+---+-------+
|abc| [1, 2]|
| cd| [3, 4]|
+---+-------+



In [5]:
# Two numeric columns; each tuple is one row.
df = spark.createDataFrame(
    data=[(33, 44), (55, 66)],
    schema=["num1", "num2"],
)
df.show()

+----+----+
|num1|num2|
+----+----+
|  33|  44|
|  55|  66|
+----+----+



In [6]:
# Import only the functions the notebook uses. A wildcard import from
# pyspark.sql.functions would shadow Python builtins such as sum, min and max
# for every later cell. `col`, `collect_list` and `explode` are imported here
# because later cells call them without importing them themselves.
from pyspark.sql.functions import array, col, collect_list, explode

# Combine num1 and num2 into a single array column called "nums".
df.withColumn("nums", array(df.num1, df.num2)).show()

+----+----+--------+
|num1|num2|    nums|
+----+----+--------+
|  33|  44|[33, 44]|
|  55|  66|[55, 66]|
+----+----+--------+



In [7]:
# One row per (person, color) pair; "joe" appears twice on purpose so there is
# something to group.
df = spark.createDataFrame(
    data=[
        ("joe", "red"),
        ("joe", "blue"),
        ("lisa", "yellow"),
    ],
    schema=["first_name", "color"],
)
df.show()

+----------+------+
|first_name| color|
+----------+------+
|       joe|   red|
|       joe|  blue|
|      lisa|yellow|
+----------+------+



In [8]:
# Collapse to one row per first_name, gathering that person's colors into a
# single array column named "colors".
res = df.groupBy(df.first_name).agg(
    collect_list(col("color")).alias("colors")
)
res.show()

+----------+-----------+
|first_name|     colors|
+----------+-----------+
|      lisa|   [yellow]|
|       joe|[red, blue]|
+----------+-----------+



In [9]:
# Print the schema of the grouped result to see the type of the aggregated
# `colors` column.
res.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- colors: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [10]:
# Each row pairs a string id with a Python list, which becomes an array column.
df = spark.createDataFrame(
    data=[
        ("abc", [1, 2]),
        ("cd", [3, 4]),
    ],
    schema=["id", "numbers"],
)
df.show()

+---+-------+
| id|numbers|
+---+-------+
|abc| [1, 2]|
| cd| [3, 4]|
+---+-------+



In [11]:
# Flatten the array: one output row per element of `numbers`, with the id
# repeated alongside each element.
(
    df
    .select(
        col("id"),
        explode(col("numbers")).alias("number"),
    )
    .show()
)


+---+------+
| id|number|
+---+------+
|abc|     1|
|abc|     2|
| cd|     3|
| cd|     4|
+---+------+



In [12]:

from pyspark.sql.types import ArrayType, StringType, StructField, StructType

# Sample rows: comma-separated name, languages at school, languages at work,
# current state, previous state.
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV"),
]

# Every column is nullable; the two language columns are arrays of strings.
schema = StructType(
    [
        StructField("name", StringType(), nullable=True),
        StructField("languagesAtSchool", ArrayType(StringType()), nullable=True),
        StructField("languagesAtWork", ArrayType(StringType()), nullable=True),
        StructField("currentState", StringType(), nullable=True),
        StructField("previousState", StringType(), nullable=True),
    ]
)

df = spark.createDataFrame(data, schema)
df.printSchema()
df.show()


root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)

+----------------+------------------+---------------+------------+-------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|
+----------------+------------------+---------------+------------+-------------+



In [13]:

from pyspark.sql.functions import explode

# One output row per (name, school language) pair. The exploded column is left
# un-aliased, so it gets Spark's default column name.
df.select(
    df["name"],
    explode(df["languagesAtSchool"]),
).show()
 


+----------------+------+
|            name|   col|
+----------------+------+
|    James,,Smith|  Java|
|    James,,Smith| Scala|
|    James,,Smith|   C++|
|   Michael,Rose,| Spark|
|   Michael,Rose,|  Java|
|   Michael,Rose,|   C++|
|Robert,,Williams|CSharp|
|Robert,,Williams|    VB|
+----------------+------+



In [14]:

from pyspark.sql.functions import split

# Turn the comma-separated name string into an array column.
df.select(
    split(df["name"], ",").alias("nameAsArray"),
).show()

 

+--------------------+
|         nameAsArray|
+--------------------+
|    [James, , Smith]|
|   [Michael, Rose, ]|
|[Robert, , Williams]|
+--------------------+



In [15]:

from pyspark.sql.functions import array

# Pack the two state columns into a single array column named "States".
df.select(
    df["name"],
    array(df["currentState"], df["previousState"]).alias("States"),
).show()
 

+----------------+--------+
|            name|  States|
+----------------+--------+
|    James,,Smith|[OH, CA]|
|   Michael,Rose,|[NY, NJ]|
|Robert,,Williams|[UT, NV]|
+----------------+--------+



In [16]:
from pyspark.sql.functions import array_contains

# Boolean column: does each person's school-language array include "Java"?
df.select(
    df["name"],
    array_contains(df["languagesAtSchool"], "Java").alias("array_contains"),
).show()


+----------------+--------------+
|            name|array_contains|
+----------------+--------------+
|    James,,Smith|          true|
|   Michael,Rose,|          true|
|Robert,,Williams|         false|
+----------------+--------------+

