### csv_ingestion_with_schema

In [1]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown('# <span style="color:red">'+string+'</span>'))

if ('sc' in locals() or 'sc' in globals()):
    printmd('<<<<<!!!!! It seems that you are running in a IBM Watson Studio Apache Spark Notebook. Please run it in an IBM Watson Studio Default Runtime (without Apache Spark) !!!!!>>>>>')

!pip install pyspark==2.4.5

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
except ImportError as e:
    printmd('<<<<<!!!!! Please restart your kernel after installing Apache Spark !!!!!>>>>>')
    
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

print("Apache Spark session created.")

Apache Spark session created.


In [6]:
import os
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField,
                               IntegerType,DateType,
                               StringType)

current_dir = os.path.dirname("")
relative_path = "books.csv"
absolute_file_path = os.path.join(current_dir, relative_path)

# Creates a session on a local master
spark = SparkSession.builder.appName("Complex CSV with a schema to Dataframe") \
    .master("local[*]").getOrCreate()

# Creates the schema
schema = StructType([StructField('id', IntegerType(), False),
                     StructField('authorId', IntegerType(), True),
                     StructField('bookTitle', IntegerType(), False),
                     StructField('releaseDate', DateType(), True),
                     StructField('url', StringType(), False)])

# Reads a CSV file with header, called books.csv, stores it in a
# dataframe
df = spark.read.format("csv") \
    .option("header", True) \
    .option("multiline", True) \
    .option("sep", ";") \
    .option("dateFormat", "MM/dd/yyyy") \
    .option("quote", "*") \
    .load(absolute_file_path)

# problem with schema
#.schema(schema) \

# Shows at most 20 rows from the dataframe
df.show(30, 25, False)
df.printSchema()

schemaAsJson = df.schema.json()
parsedSchemaAsJson = json.loads(schemaAsJson)

print("*** Schema as JSON: {}".format(json.dumps(parsedSchemaAsJson, indent=2)))

spark.stop()

+---+--------+-------------------------+-----------+-----------------------+
| id|authorId|                    title|releaseDate|                  link
+---+--------+-------------------------+-----------+-----------------------+
|  1|       1|Fantastic Beasts and W...| 11/18/2016|http://amzn.to/2kup94P
|  2|       1|Harry Potter and the S...| 10/06/2015|http://amzn.to/2l2lSwP
|  3|       1|The Tales of Beedle th...| 12/04/2008|http://amzn.to/2kYezqr
|  4|       1|Harry Potter and the C...| 10/04/2016|http://amzn.to/2kYhL5n
|  5|       2|Informix 12.10 on Mac ...| 04/23/2017|http://amzn.to/2i3mthT
|  6|       2|Development Tools in 2...| 12/28/2016|http://amzn.to/2vBxOe1
|  7|       3|Adventures of Hucklebe...| 05/26/1994|http://amzn.to/2wOeOav
|  8|       3|A Connecticut Yankee i...| 06/17/2017|http://amzn.to/2x1NuoD
| 10|       4|     Jacques le Fataliste| 03/01/2000|http://amzn.to/2uZj2KA
| 11|       4|Diderot Encyclopedia: ...|       null|http://amzn.to/2i2zo3I
| 12|    null|       