## Lab: Read Wine Data

For this lab, we will read data from the Wine Reviews data set on Kaggle: https://www.kaggle.com/zynicide/wine-reviews. It has already been downloaded and is available to your notebook in the `data` directory.


**Step 1:** Read the wine data from `../data/winemag.csv`, first without setting headers or inferring the schema

**Step 2:** See what you can glean from the data

**Step 3:** Print the schema

**Step 4:** Apply headers, infer the schema

**Step 5:** Print the schema again

**Step 6:** `show` some of the data using some of the varying forms

**Step 7:** Apply your own schema

In [19]:
// Step 1: plain CSV read with no options. Columns get the generated names
// _c0, _c1, ... and are all typed as string; the file's header line is
// treated as an ordinary data row.
val winesDF = spark.read.csv("../data/winemag.csv")

winesDF: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 12 more fields]


In [20]:
// Step 2: peek at the first 10 rows to see what the raw read produced.
winesDF.show(numRows = 10)

+----+--------+--------------------+--------------------+------+-----+-----------------+-------------------+-----------------+------------------+--------------------+--------------------+------------------+-------------------+
| _c0|     _c1|                 _c2|                 _c3|   _c4|  _c5|              _c6|                _c7|              _c8|               _c9|                _c10|                _c11|              _c12|               _c13|
+----+--------+--------------------+--------------------+------+-----+-----------------+-------------------+-----------------+------------------+--------------------+--------------------+------------------+-------------------+
|null| country|         description|         designation|points|price|         province|           region_1|         region_2|       taster_name|taster_twitter_ha...|               title|           variety|             winery|
|   0|   Italy|Aromas include tr...|        Vulkà Bianco|    87| null|Sicily & Sardinia|    

In [21]:
// Step 3: print the column names and types of the raw read.
// printSchema() is a side-effecting empty-paren method, so it is called with
// explicit parentheses (omitting them is deprecated in Scala 2.13 and an
// error in Scala 3).
winesDF.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)



In [22]:
// Step 4: re-read the file, this time taking column names from the first
// line ("header") and asking Spark to guess column types ("inferSchema").
// NOTE(review): as the printed schema below shows, every column still comes
// back as string even with inference on; presumably something in the file
// confuses the parser. Investigate before relying on inferred types.
val winesDF = spark.read
  .options(Map("inferSchema" -> "true", "header" -> "true"))
  .csv("../data/winemag.csv")

winesDF: org.apache.spark.sql.DataFrame = [_c0: string, country: string ... 12 more fields]


In [23]:
// Step 5: print the schema again to compare against the header-less read.
// printSchema() is a side-effecting empty-paren method, so it is called with
// explicit parentheses (omitting them is deprecated in Scala 2.13 and an
// error in Scala 3).
winesDF.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- country: string (nullable = true)
 |-- description: string (nullable = true)
 |-- designation: string (nullable = true)
 |-- points: string (nullable = true)
 |-- price: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region_1: string (nullable = true)
 |-- region_2: string (nullable = true)
 |-- taster_name: string (nullable = true)
 |-- taster_twitter_handle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- variety: string (nullable = true)
 |-- winery: string (nullable = true)



In [24]:
// Step 6a: first 10 rows; long values are shortened and end in "...".
winesDF.show(numRows = 10)

+---+--------+--------------------+--------------------+------+-----+-----------------+-------------------+-----------------+------------------+---------------------+--------------------+------------------+-------------------+
|_c0| country|         description|         designation|points|price|         province|           region_1|         region_2|       taster_name|taster_twitter_handle|               title|           variety|             winery|
+---+--------+--------------------+--------------------+------+-----+-----------------+-------------------+-----------------+------------------+---------------------+--------------------+------------------+-------------------+
|  0|   Italy|Aromas include tr...|        Vulkà Bianco|    87| null|Sicily & Sardinia|               Etna|             null|     Kerin O’Keefe|         @kerinokeefe|Nicosia 2013 Vulk...|       White Blend|            Nicosia|
|  1|Portugal|This is ripe and ...|            Avidagos|    87| 15.0|            Douro|     

In [25]:
// Step 6b: Boolean form of `truncate`; `false` prints every value in full,
// which makes the table very wide.
winesDF.show(truncate = false)

+---+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+------+-----+-----------------+-------------------+-----------------+------------------+---------------------+--------------------------------------------------------------------------------------------------+------------------+-------------------+
|_c0|country  |description                                                                                                                                                                                                                                                                                                                |designation             




In [26]:
// Step 6c: Int form of `truncate`; show 5 rows and cut any value longer than
// 10 characters.
winesDF.show(numRows = 5, truncate = 10)

+---+--------+-----------+-----------+------+-----+----------+----------+----------+-----------+---------------------+----------+----------+----------+
|_c0| country|description|designation|points|price|  province|  region_1|  region_2|taster_name|taster_twitter_handle|     title|   variety|    winery|
+---+--------+-----------+-----------+------+-----+----------+----------+----------+-----------+---------------------+----------+----------+----------+
|  0|   Italy| Aromas ...| Vulkà B...|    87| null|Sicily ...|      Etna|      null| Kerin O...|           @kerino...|Nicosia...|White B...|   Nicosia|
|  1|Portugal| This is...|   Avidagos|    87| 15.0|     Douro|      null|      null| Roger Voss|           @vossroger|Quinta ...|Portugu...|Quinta ...|
|  2|      US| Tart an...|       null|    87| 14.0|    Oregon|Willame...|Willame...| Paul Gr...|           @paulgw...|Rainsto...|Pinot Gris| Rainstorm|
|  3|      US| Pineapp...| Reserve...|    87| 13.0|  Michigan|Lake Mi...|      null| Ale

Something is wrong with the data, yay! For now, every column type in your own schema should be `String`.

In [34]:
import org.apache.spark.sql.types._

// Step 7: hand-written schema for the wine CSV. Every column is declared as
// a nullable StringType for now, in the same order as the file's header.
val wineSchema = StructType(
  Seq(
    "_c0",
    "country",
    "description",
    "designation",
    "points", // * presumably a candidate for a numeric type later; confirm
    "price",  // * presumably a candidate for a numeric type later; confirm
    "province",
    "region_1",
    "region_2",
    "taster_name",
    "taster_twitter_handle",
    "title",
    "variety",
    "winery"
  ).map(columnName => StructField(columnName, StringType, nullable = true))
)

// Read the CSV once more, supplying wineSchema explicitly instead of
// inferring types; "header" keeps the file's first line from being read as
// a data row.
val winesDF = spark.read
  .schema(wineSchema)
  .option("header", "true")
  .csv("../data/winemag.csv")

// Preview 10 rows, cutting any value longer than 30 characters.
winesDF.show(numRows = 10, truncate = 30)

+---+--------+------------------------------+------------------------------+------+-----+-----------------+-------------------+-----------------+------------------+---------------------+------------------------------+------------------+-------------------+
|_c0| country|                   description|                   designation|points|price|         province|           region_1|         region_2|       taster_name|taster_twitter_handle|                         title|           variety|             winery|
+---+--------+------------------------------+------------------------------+------+-----+-----------------+-------------------+-----------------+------------------+---------------------+------------------------------+------------------+-------------------+
|  0|   Italy|Aromas include tropical fru...|                  Vulkà Bianco|    87| null|Sicily & Sardinia|               Etna|             null|     Kerin O’Keefe|         @kerinokeefe|Nicosia 2013 Vulkà Bianco  ...|       White

import org.apache.spark.sql.types._
wineSchema: org.apache.spark.sql.types.StructType = StructType(StructField(_c0,StringType,true), StructField(country,StringType,true), StructField(description,StringType,true), StructField(designation,StringType,true), StructField(points,StringType,true), StructField(price,StringType,true), StructField(province,StringType,true), StructField(region_1,StringType,true), StructField(region_2,StringType,true), StructField(taster_name,StringType,true), StructField(taster_twitter_handle,StringType,true), StructField(title,StringType,true), StructField(variety,StringType,true), StructField(winery,StringType,true))
winesDF: org.apache.spark.sql.DataFrame = [_c0: string, country: string ... 12 more fields]
