# Chapter 2

## Introduction to Spark and PySpark

In [22]:
from pyspark.sql import SparkSession


spark = SparkSession.builder.master("local[*]").appName("Intro").getOrCreate()

### Importing a csv

In [6]:
df = spark.read.csv("static/zoo.csv", header=True, inferSchema=True)

In [7]:
type(df)

pyspark.sql.classic.dataframe.DataFrame

In [8]:
df.printSchema()

root
 |-- animal_name: string (nullable = true)
 |-- hair: integer (nullable = true)
 |-- feathers: integer (nullable = true)
 |-- eggs: integer (nullable = true)
 |-- milk: integer (nullable = true)
 |-- airborne: integer (nullable = true)
 |-- aquatic: integer (nullable = true)
 |-- predator: integer (nullable = true)
 |-- toothed: integer (nullable = true)
 |-- backbone: integer (nullable = true)
 |-- breathes: integer (nullable = true)
 |-- venomous: integer (nullable = true)
 |-- fins: integer (nullable = true)
 |-- legs: integer (nullable = true)
 |-- tail: integer (nullable = true)
 |-- domestic: integer (nullable = true)
 |-- catsize: integer (nullable = true)
 |-- class_type: integer (nullable = true)



In [9]:
df.head(5)

[Row(animal_name='aardvark', hair=1, feathers=0, eggs=0, milk=1, airborne=0, aquatic=0, predator=1, toothed=1, backbone=1, breathes=1, venomous=0, fins=0, legs=4, tail=0, domestic=0, catsize=1, class_type=1),
 Row(animal_name='antelope', hair=1, feathers=0, eggs=0, milk=1, airborne=0, aquatic=0, predator=0, toothed=1, backbone=1, breathes=1, venomous=0, fins=0, legs=4, tail=1, domestic=0, catsize=1, class_type=1),
 Row(animal_name='bass', hair=0, feathers=0, eggs=1, milk=0, airborne=0, aquatic=1, predator=1, toothed=1, backbone=1, breathes=0, venomous=0, fins=1, legs=0, tail=1, domestic=0, catsize=0, class_type=4),
 Row(animal_name='bear', hair=1, feathers=0, eggs=0, milk=1, airborne=0, aquatic=0, predator=1, toothed=1, backbone=1, breathes=1, venomous=0, fins=0, legs=4, tail=0, domestic=0, catsize=1, class_type=1),
 Row(animal_name='boar', hair=1, feathers=0, eggs=0, milk=1, airborne=0, aquatic=0, predator=1, toothed=1, backbone=1, breathes=1, venomous=0, fins=0, legs=4, tail=1, domes

In [10]:
df.show(5)

+-----------+----+--------+----+----+--------+-------+--------+-------+--------+--------+--------+----+----+----+--------+-------+----------+
|animal_name|hair|feathers|eggs|milk|airborne|aquatic|predator|toothed|backbone|breathes|venomous|fins|legs|tail|domestic|catsize|class_type|
+-----------+----+--------+----+----+--------+-------+--------+-------+--------+--------+--------+----+----+----+--------+-------+----------+
|   aardvark|   1|       0|   0|   1|       0|      0|       1|      1|       1|       1|       0|   0|   4|   0|       0|      1|         1|
|   antelope|   1|       0|   0|   1|       0|      0|       0|      1|       1|       1|       0|   0|   4|   1|       0|      1|         1|
|       bass|   0|       0|   1|   0|       0|      1|       1|      1|       1|       0|       0|   1|   0|   1|       0|      0|         4|
|       bear|   1|       0|   0|   1|       0|      0|       1|      1|       1|       1|       0|   0|   4|   0|       0|      1|         1|
|     

In [11]:
df.toPandas().head(5)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


### Creating a custom schema

In [12]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType


zoo_schema = StructType(
    [
        StructField("animal_name", StringType(), True),
        StructField("hair", IntegerType(), True),
        StructField("feathers", IntegerType(), True),
        StructField("eggs", IntegerType(), True),
        StructField("milk", IntegerType(), True),
        StructField("airborne", IntegerType(), True),
        StructField("aquatic", IntegerType(), True),
        StructField("predator", IntegerType(), True),
        StructField("toothed", IntegerType(), True),
        StructField("backbone", IntegerType(), True),
        StructField("breathes", IntegerType(), True),
        StructField("venomous", IntegerType(), True),
        StructField("fins", IntegerType(), True),
        StructField("legs", IntegerType(), True),
        StructField("tail", IntegerType(), True),
        StructField("domestic", IntegerType(), True),
        StructField("catsize", IntegerType(), True),
        StructField("type", IntegerType(), True),
    ]
)


df_with_schema = spark.read.format("csv").schema(zoo_schema).load("static/zoo.csv")


df_with_schema.show()

+-----------+----+--------+----+----+--------+-------+--------+-------+--------+--------+--------+----+----+----+--------+-------+----+
|animal_name|hair|feathers|eggs|milk|airborne|aquatic|predator|toothed|backbone|breathes|venomous|fins|legs|tail|domestic|catsize|type|
+-----------+----+--------+----+----+--------+-------+--------+-------+--------+--------+--------+----+----+----+--------+-------+----+
|animal_name|NULL|    NULL|NULL|NULL|    NULL|   NULL|    NULL|   NULL|    NULL|    NULL|    NULL|NULL|NULL|NULL|    NULL|   NULL|NULL|
|   aardvark|   1|       0|   0|   1|       0|      0|       1|      1|       1|       1|       0|   0|   4|   0|       0|      1|   1|
|   antelope|   1|       0|   0|   1|       0|      0|       0|      1|       1|       1|       0|   0|   4|   1|       0|      1|   1|
|       bass|   0|       0|   1|   0|       0|      1|       1|      1|       1|       0|       0|   1|   0|   1|       0|      0|   4|
|       bear|   1|       0|   0|   1|       0|  

### Mind the schema

In [None]:
import json


print(json.dumps(json.loads(df_with_schema.schema.json()), indent=2))

{
  "fields": [
    {
      "metadata": {},
      "name": "animal_name",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "hair",
      "nullable": true,
      "type": "integer"
    },
    {
      "metadata": {},
      "name": "feathers",
      "nullable": true,
      "type": "integer"
    },
    {
      "metadata": {},
      "name": "eggs",
      "nullable": true,
      "type": "integer"
    },
    {
      "metadata": {},
      "name": "milk",
      "nullable": true,
      "type": "integer"
    },
    {
      "metadata": {},
      "name": "airborne",
      "nullable": true,
      "type": "integer"
    },
    {
      "metadata": {},
      "name": "aquatic",
      "nullable": true,
      "type": "integer"
    },
    {
      "metadata": {},
      "name": "predator",
      "nullable": true,
      "type": "integer"
    },
    {
      "metadata": {},
      "name": "toothed",
      "nullable": true,
      "type": "integer"
    },
    {
      "me

In [19]:
with open("static/schema.json", "w") as file:
    json.dump(json.loads(df_with_schema.schema.json()), file, indent=2)

In [27]:
with open("static/schema.json", "r") as file:
    schema = json.load(file)

schema_from_json = StructType.fromJson(schema)

schema_from_json

StructType([StructField('animal_name', StringType(), True), StructField('hair', IntegerType(), True), StructField('feathers', IntegerType(), True), StructField('eggs', IntegerType(), True), StructField('milk', IntegerType(), True), StructField('airborne', IntegerType(), True), StructField('aquatic', IntegerType(), True), StructField('predator', IntegerType(), True), StructField('toothed', IntegerType(), True), StructField('backbone', IntegerType(), True), StructField('breathes', IntegerType(), True), StructField('venomous', IntegerType(), True), StructField('fins', IntegerType(), True), StructField('legs', IntegerType(), True), StructField('tail', IntegerType(), True), StructField('domestic', IntegerType(), True), StructField('catsize', IntegerType(), True), StructField('type', IntegerType(), True)])