### Creating DataFrames

In [2]:
# `spark` is the SparkSession provided by the notebook environment.
# Reader options: first line is a header, column types are inferred from the
# data, and the literal text "null" is read as a missing value.
df = (
    spark.read
    .options(inferSchema='true', header='true', nullValue='null')
    .csv('/FileStore/tables/people2.csv')
)

# Print the first rows of the DataFrame to stdout
df.show()

### Untyped Dataset Operations (aka DataFrame Operations)

In [4]:
# Relies on `spark` and `df` created in the earlier cell.
# Dump the DataFrame schema as an indented tree
df.printSchema()

# Project a single column: "name"
df.select(df['name']).show()

# Project every person's name together with their age plus one
df.select('name', df['age'] + 1).show()

In [5]:
# Keep only the rows whose age is strictly greater than 21
df.filter(df['age'] > 21).show()

# Number of people for each distinct age value
df.groupBy(df['age']).count().show()

### Running SQL Queries Programmatically

In [7]:
# Expose `df` to Spark SQL under the view name "people"
# (replaces any existing temporary view with that name).
df.createOrReplaceTempView("people")

# Query the view with plain SQL; the result is itself a DataFrame.
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

### Interoperating with RDDs

In [9]:
from pyspark.sql import Row

# Load a comma-separated text file as an RDD of lines and convert each line to a Row.
lines = sc.textFile("/FileStore/tables/people1.txt")

parts = lines.map(lambda l: l.split(","))
# Cast age to int so the inferred schema gets a numeric column and the range
# filter below compares numbers rather than raw strings. int() tolerates
# surrounding whitespace.
# NOTE(review): assumes the second field of every line is an integer — confirm
# against the input file.
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people_rdd")

# SQL can be run over DataFrames that have been registered as a table.
# Query the "people_rdd" view registered just above; the "people" view is
# backed by the CSV DataFrame, not by this RDD.
teenagers = spark.sql("SELECT name FROM people_rdd WHERE age >= 13 AND age <= 19")

# The results of SQL queries are DataFrame objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
for name in teenNames:
    print(name)

### Programmatically Specifying the Schema

In [11]:
# Import data types
from pyspark.sql.types import *

# Read the text file as an RDD of lines and split every line on commas.
lines = sc.textFile("/FileStore/tables/people2.txt")
parts = lines.map(lambda line: line.split(","))

# Turn each split line into a 4-tuple; only the second field is stripped
# of surrounding whitespace.
people = parts.map(lambda cols: (cols[0], cols[1].strip(), cols[2], cols[3]))

# Column names, space-separated; every column is declared as a nullable string.
schemaString = "name age city country"

fields = [
    StructField(column, StringType(), True)
    for column in schemaString.split()
]
schema = StructType(fields)


# Build the DataFrame from the tuple RDD using the explicit schema.
schemaPeople = spark.createDataFrame(people, schema)

# Register (or replace) the temporary view "people" backed by this DataFrame
schemaPeople.createOrReplaceTempView("people")

# Run SQL against the registered view.
results = spark.sql("SELECT name FROM people")

results.show()

In [12]:
# Write the query result out as CSV files under the given directory.
# The default save mode raises if the path already exists, which breaks every
# re-run of the notebook; overwrite mode keeps this cell re-runnable.
results.write.csv('/FileStore/tables/output_test', mode='overwrite')