# A simple example of working with Spark SQL and DataFrames

### Connecting

In [None]:
from pyspark.sql import SparkSession

# Configure the session step by step: application name "myApp", a local
# master, and Hive support switched on. getOrCreate() hands back an already
# running session when one exists instead of starting a second one.
spark_session = (
    SparkSession.builder
    .appName("myApp")
    .master("local")
    .enableHiveSupport()
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it
spark_session

### Read data from a text file

In [None]:
# Load the text file as an RDD of lines
ips_rdd = spark_session.sparkContext.textFile(
    '/data/subnets/ips/subnets_var1_len250.txt'
)
# Peek at the first five elements to see what the raw records look like
ips_rdd.take(5)

### Create a DataFrame

In [None]:
from pyspark.sql.types import StructType, StringType

# Describe the data schema: two string columns, "ip" and "mask"
data_schema = (
    StructType()
    .add("ip", StringType())
    .add("mask", StringType())
)

# Build the DataFrame from the RDD. Each line is split on the first tab
# only, so a line yields at most two parts to fill the two columns.
ips_df = spark_session.createDataFrame(
    ips_rdd.map(lambda line: line.split("\t", 1)),
    schema=data_schema,
)

ips_df

### Simple operations with DataFrame

In [None]:
# Print the first N rows of the DataFrame (here N = 3)
ips_df.show(n=3)

In [None]:
# Print the DataFrame schema (the "ip" and "mask" columns declared in
# data_schema) to check that the DataFrame was built as intended
ips_df.printSchema()

In [None]:
# Persist the DataFrame as the Hive table default.ips.
# Values accepted by the writer's mode setting:
#   * error     - throw an error if the table already exists
#   * overwrite - replace the existing table with this data
#   * append    - add this data to the existing table
ips_df.write.mode("overwrite").saveAsTable("default.ips")

# List the tables of the "default" database to confirm the table is there
spark_session.catalog.listTables("default")

In [None]:
# Store DataFrame as Parquet file.
# The format is requested explicitly: a plain save() without a format uses
# the session's default data source (spark.sql.sources.default), which is
# Parquet only as long as nobody has reconfigured it.
# mode="overwrite" replaces any previous output at the same path.
ips_df.write.parquet("ips.parquet", mode="overwrite")

### Work with the RDD inside a DataFrame

In [None]:
# Work with the DataFrame's underlying RDD: take its first three elements
ips_df.rdd.take(3)

In [None]:
%%time 
# Select data
# 
# Return types:
#   * select: Spark DataFrame [n columns] -> Spark DataFrame [m columns]
#   * where:  Spark DataFrame -> Spark DataFrame
#   * show:   NoneType
ips_df.select("ip").where("mask = '255.255.255.128'").show(5)

### Work with DataFrame via SQL

In [None]:
# Create a temporary view named "ips" over the DataFrame.
# The view is temporary because it exists only for the lifetime
# of the Spark session.
# createOrReplaceTempView is used instead of createTempView so that the cell
# can be re-run: createTempView raises an error when a view with the same
# name is already registered in the session.
ips_df.createOrReplaceTempView("ips")

In [None]:
# Run an SQL query through the session and keep the result, then print
# its first three rows
select_result = spark_session.sql("SELECT * FROM ips")
select_result.show(n=3)

### Work with Hive

In [None]:
# Show databases and tables.
# Every result is printed explicitly: by default a notebook cell displays
# only the value of its last expression, so bare expressions earlier in the
# cell would be evaluated and then silently dropped.
print(spark_session.sql("""SHOW DATABASES""").toPandas())
print(spark_session.sql("""SHOW TABLES IN default""").toPandas())

# or query the same information through the catalog API
print(spark_session.catalog.listDatabases())
print(spark_session.catalog.listTables("default"))

In [None]:
# Turn the "ips" view into a permanent table, main.ips.
# The database is created only if it is missing, and any earlier copy of
# the table is dropped first, so the CREATE TABLE below starts clean.
spark_session.sql("CREATE DATABASE IF NOT EXISTS main")
spark_session.sql("DROP TABLE IF EXISTS main.ips")

# CREATE TABLE ... AS SELECT copies the rows of the view into the new table
spark_session.sql("""
    CREATE TABLE main.ips AS
    SELECT * FROM ips
""")

# List the tables of the "main" database to confirm the result
spark_session.catalog.listTables("main")

In [None]:
# Load the permanent table main.ips back into a DataFrame
# and print its first three rows
ips_from_disk = spark_session.read.table("main.ips")
ips_from_disk.show(n=3)