# Chapter 5 - Basic Structured Operations

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("chapter5").getOrCreate()

## Schemas

In [2]:
df = spark.read.format("json").load("../data/flight-data/json/2015-summary.json")
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [3]:
print(df.schema)

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))


A schema can also be manually specified

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

myManualSchema = StructType([StructField("DEST_COUNTRY_NAME", StringType(), True), 
                             StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
                             StructField("count", LongType(), False, metadata={"hello":"world"})])

df = spark.read.format("json").schema(myManualSchema).load("../data/flight-data/json/2015-summary.json")
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



## Columns and expressions

In [5]:
from pyspark.sql.functions import col, column

col("someColumnName")
column("someColumnName")

Column<b'someColumnName'>

In [12]:
print(df.select(column("count")))
print(df.select("count"))
print(df.select(col("count")))

DataFrame[count: bigint]
DataFrame[count: bigint]
DataFrame[count: bigint]


In [18]:
((col("someCol") + 5) * 200 - 6) < col("otherCol")

Column<b'((((someCol + 5) * 200) - 6) < otherCol)'>

In [19]:
from pyspark.sql.functions import expr

expr("((col(someCol) + 5) * 200 - 6) < col(otherCol)")

Column<b'((((col(someCol) + 5) * 200) - 6) < col(otherCol))'>

## Records and rows

In [20]:
df.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

In [26]:
from pyspark.sql import Row

myRow = Row("Hello", None, 1, False)

# access values in a Row using []
print(myRow[0])
print(myRow[1])

Hello
None
