# Import CSV to DataFrame

Source: data taken from *Spark: The Definitive Guide*.

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Build (or reuse) one local SparkSession. `getOrCreate()` makes this cell safe
# to re-run: constructing a SparkContext by hand raises ValueError when a
# context is already active in the kernel.
# The app name matches the one the hand-built context used to carry, so the
# running application is labelled the same as before.
spark = SparkSession.builder.master("local").appName("First App").getOrCreate()

# Expose the session's own SparkContext under the name `sc` for any code that
# still needs the low-level API, instead of creating a second context.
sc = spark.sparkContext

In [2]:
# Load the CSV into a DataFrame: the first row supplies the column names and
# Spark infers each column's type from the data.
static = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/home/jovyan/dayOne.csv")
)

In [3]:
# Register the DataFrame as the temporary view "retail_Data" so it can be
# queried by name through spark.sql(); an existing view of that name is replaced.
static.createOrReplaceTempView("retail_Data")

In [15]:
# Show the column names, inferred types and nullability of the loaded data.
static.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [16]:
# Preview the five rows with the highest UnitPrice, queried through the
# "retail_Data" temporary view.
spark.sql("""SELECT * FROM retail_Data ORDER BY UnitPrice DESC """).show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536592|      DOT|      DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
|   536544|      DOT|      DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536392|    22827|RUSTIC  SEVENTEEN...|       1|2010-12-01 10:29:00|    165.0|   13705.0|United Kingdom|
|   536592|    22503|CABIN BAG VINTAGE...|       1|2010-12-01 17:06:00|    59.53|      null|United Kingdom|
|   536544|    22769|CHALKBOARD KITCHE...|       1|2010-12-01 14:32:00|    51.02|      null|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [17]:
from pyspark.sql.functions import window, column, desc, col
# Total spend per customer: compute each row's cost (UnitPrice * Quantity),
# sum it by CustomerID, and display five of the resulting groups.
(
    static
    .selectExpr("CustomerID", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")
    .groupBy("CustomerID")
    .sum("total_cost")
    .show(5)
)

+----------+---------------+
|CustomerID|sum(total_cost)|
+----------+---------------+
|   15311.0|         445.33|
|   16539.0|         442.35|
|   15100.0|          350.4|
|   12583.0|         855.86|
|   15291.0|          328.8|
+----------+---------------+
only showing top 5 rows



In [19]:
# Keep the price-sorted result of the "retail_Data" query as a DataFrame so the
# following cells can filter it by country.
foo = spark.sql("""SELECT * FROM retail_Data ORDER BY UnitPrice DESC """)

In [30]:
# Rows whose Country is exactly 'United Kingdom'.
# NOTE(review): under SQL null semantics a row with a NULL Country would not
# match this predicate — confirm the data has none.
test = foo.filter("Country = 'United Kingdom'")

In [31]:
# Rows whose Country is anything other than 'United Kingdom'.
# NOTE(review): under SQL null semantics a row with a NULL Country would not
# match this predicate either — confirm the data has none.
train = foo.filter("Country != 'United Kingdom'")

In [32]:
# Number of rows in the 'United Kingdom' subset.
test.count()

2949

In [33]:
# Number of rows in the non-'United Kingdom' subset.
train.count()

159

In [34]:
# Print the physical plan Spark builds for the price-sorted query (no rows are
# displayed; only the plan is shown).
spark.sql("""SELECT * FROM retail_Data ORDER BY UnitPrice DESC """).explain()

== Physical Plan ==
*(2) Sort [UnitPrice#15 DESC NULLS LAST], true, 0
+- Exchange rangepartitioning(UnitPrice#15 DESC NULLS LAST, 200)
   +- *(1) FileScan csv [InvoiceNo#10,StockCode#11,Description#12,Quantity#13,InvoiceDate#14,UnitPrice#15,CustomerID#16,Country#17] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/dayOne.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<InvoiceNo:string,StockCode:string,Description:string,Quantity:int,InvoiceDate:timestamp,Un...
