In [1]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the session idempotently: getOrCreate() returns the already-running
# session when this cell is executed again, whereas constructing
# pyspark.SparkContext() a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.getOrCreate()
# Keep `sc` available for the RDD examples further down.
sc = spark.sparkContext

In [4]:
from pyspark.sql.types import *
# Explicit schema assembled field by field; the third argument (False) marks
# each column as non-nullable.
schema = (
    StructType()
    .add('product_id', IntegerType(), False)
    .add('product_name', StringType(), False)
    .add('qty', IntegerType(), False)
)

In [5]:
# DDL-string form of the schema. product_id is declared INT so it agrees with
# the IntegerType used in `schema` and with the integer ids in the sample
# data; it was previously STRING, which silently turned the ids into strings.
schema2 = "product_id INT, product_name STRING, qty INT"

In [6]:
# Two sample rows, each laid out as [product_id, product_name, qty].
data = [
    [1, "product1", 10],
    [2, "product2", 20],
]

# Build a DataFrame from the same rows twice: once with the StructType
# schema and once with the DDL-string schema.
df1 = spark.createDataFrame(data=data, schema=schema)
df2 = spark.createDataFrame(data=data, schema=schema2)

df1.show()
df2.show()

+----------+------------+---+
|product_id|product_name|qty|
+----------+------------+---+
|         1|    product1| 10|
|         2|    product2| 20|
+----------+------------+---+

+----------+------------+---+
|product_id|product_name|qty|
+----------+------------+---+
|         1|    product1| 10|
|         2|    product2| 20|
+----------+------------+---+



In [7]:
# Print df1's column names, types and nullability as a tree.
df1.printSchema()

root
 |-- product_id: integer (nullable = false)
 |-- product_name: string (nullable = false)
 |-- qty: integer (nullable = false)



In [19]:
# Distribute the sample rows as an RDD, then convert that RDD into a
# DataFrame using the explicit StructType schema.
rdd = sc.parallelize(data)
df3 = rdd.toDF(schema=schema)
df3.show()

+----------+------------+---+
|product_id|product_name|qty|
+----------+------------+---+
|         1|    product1| 10|
|         2|    product2| 20|
+----------+------------+---+



In [20]:
# Schema inference: only the column names are supplied, so Spark derives
# each column's type and nullability from the data itself.
df4 = spark.createDataFrame(
    data=data,
    schema=['product_id', 'product_name', 'qty'],
)
df4.printSchema()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- qty: long (nullable = true)



In [9]:
# Column names of df1 as a plain Python list.
df1.columns

['product_id', 'product_name', 'qty']

In [11]:
# NOTE(review): this star import pulls in Spark functions whose names match
# Python builtins (e.g. sum, min, max, round), shadowing them for the rest of
# the notebook. Consider `from pyspark.sql import functions as F` instead.
from pyspark.sql.functions import *
# No action (such as show()) is called here, so the cell only displays the
# resulting DataFrame's repr: its column names and types.
df1.select(col('product_name'))

DataFrame[product_name: string]

In [13]:
# Project the product name next to a derived column: qty multiplied by 2.
(
    df1
    .select(col('product_name'), col('qty') * 2)
    .show()
)

+------------+---------+
|product_name|(qty * 2)|
+------------+---------+
|    product1|       20|
|    product2|       40|
+------------+---------+



In [14]:
# Append a computed column holding twice the current quantity; df1 itself
# is not reassigned, only the derived DataFrame is displayed.
(
    df1
    .withColumn('Forecasted Qty', expr('qty * 2'))
    .show()
)

+----------+------------+---+--------------+
|product_id|product_name|qty|Forecasted Qty|
+----------+------------+---+--------------+
|         1|    product1| 10|            20|
|         2|    product2| 20|            40|
+----------+------------+---+--------------+



In [17]:
# Aggregate through a SQL expression string: average of the qty column.
(
    df1
    .select(expr("avg(qty)"))
    .show()
)

+--------+
|avg(qty)|
+--------+
|    15.0|
+--------+



In [18]:
# Three ways to reference the same column in select(); all three display
# the same single-column result.
df1.select('product_name').show()  # plain column-name string
df1.select(col('product_name')).show()  # Column object built with col()
df1.select(expr('product_name')).show()  # SQL expression string passed through expr()

+------------+
|product_name|
+------------+
|    product1|
|    product2|
+------------+

+------------+
|product_name|
+------------+
|    product1|
|    product2|
+------------+

+------------+
|product_name|
+------------+
|    product1|
|    product2|
+------------+

