In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
df = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("/home/fung/dev/project/spark/raw_data/data/retail-data/all/*.csv")\
  .coalesce(5)
df.cache()
df.createOrReplaceTempView("dfTable")

In [3]:
from pyspark.sql.functions import count
df.select(count("StockCode")).show() # 541909


+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



In [4]:
from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show() # 4070


+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [5]:
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364


+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [6]:
from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()


+-----------------------+----------------------+
|first(StockCode, false)|last(StockCode, false)|
+-----------------------+----------------------+
|                 85123A|                 22138|
+-----------------------+----------------------+



In [7]:

from pyspark.sql.functions import min, max
df.select(min("Quantity"), max("Quantity")).show()


+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [8]:
from pyspark.sql.functions import sum
df.select(sum("Quantity")).show() # 5176450

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [9]:
from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show() # 29310



+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [10]:
from pyspark.sql.functions import sum, count, avg, expr

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()


+--------------------------------------+----------------+----------------+
|(total_purchases / total_transactions)|   avg_purchases|  mean_purchases|
+--------------------------------------+----------------+----------------+
|                      9.55224954743324|9.55224954743324|9.55224954743324|
+--------------------------------------+----------------+----------------+



In [11]:
from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()


+-----------------+------------------+--------------------+---------------------+
|var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+-----------------+------------------+--------------------+---------------------+
| 47559.3036466091|  47559.3914092988|  218.08095663447807|   218.08115785023426|
+-----------------+------------------+--------------------+---------------------+



In [12]:
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


+--------------------+------------------+
|  skewness(Quantity)|kurtosis(Quantity)|
+--------------------+------------------+
|-0.26407557610529564|119768.05495534712|
+--------------------+------------------+



In [13]:
from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085642775E-4|             1052.7280543915654|            1052.7260778754612|
+-------------------------+-------------------------------+------------------------------+



In [14]:
from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+



In [15]:
from pyspark.sql.functions import count

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()


+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536596|   6|              6|
|   536938|  14|             14|
|   537252|   1|              1|
|   537691|  20|             20|
|   538041|   1|              1|
|   538184|  26|             26|
|   538517|  53|             53|
|   538879|  19|             19|
|   539275|   6|              6|
|   539630|  12|             12|
|   540499|  24|             24|
|   540540|  22|             22|
|  C540850|   1|              1|
|   540976|  48|             48|
|   541432|   4|              4|
|   541518| 101|            101|
|   541783|  35|             35|
|   542026|   9|              9|
|   542375|   6|              6|
|  C542604|   8|              8|
+---------+----+---------------+
only showing top 20 rows



In [16]:
df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"),expr("stddev_pop(Quantity)"))\
  .show()


+---------+------------------+--------------------+
|InvoiceNo|     avg(Quantity)|stddev_pop(Quantity)|
+---------+------------------+--------------------+
|   536596|               1.5|  1.1180339887498947|
|   536938|33.142857142857146|  20.698023172885524|
|   537252|              31.0|                 0.0|
|   537691|              8.15|   5.597097462078001|
|   538041|              30.0|                 0.0|
|   538184|12.076923076923077|   8.142590198943392|
|   538517|3.0377358490566038|  2.3946659604837897|
|   538879|21.157894736842106|  11.811070444356483|
|   539275|              26.0|  12.806248474865697|
|   539630|20.333333333333332|  10.225241100118645|
|   540499|              3.75|  2.6653642652865788|
|   540540|2.1363636363636362|  1.0572457590557278|
|  C540850|              -1.0|                 0.0|
|   540976|10.520833333333334|   6.496760677872902|
|   541432|             12.25|  10.825317547305483|
|   541518| 23.10891089108911|  20.550782784878713|
|   541783|1

In [17]:
from pyspark.sql.functions import col, to_date
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "MM/d/yyyy H:mm"))
dfWithDate.createOrReplaceTempView("dfWithDate")


In [18]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
windowSpec = Window\
  .partitionBy("CustomerId", "date")\
  .orderBy(desc("Quantity"))\
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)


In [19]:
from pyspark.sql.functions import max
maxPurchaseQuantity = max(col("Quantity")).over(windowSpec)

In [20]:
from pyspark.sql.functions import col

from pyspark.sql.functions import dense_rank, rank
purchaseDenseRank = dense_rank().over(windowSpec)
purchaseRank = rank().over(windowSpec)

dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank"),
    maxPurchaseQuantity.alias("maxPurchaseQuantity")).show()


+----------+----------+--------+------------+-----------------+-------------------+
|CustomerId|      date|Quantity|quantityRank|quantityDenseRank|maxPurchaseQuantity|
+----------+----------+--------+------------+-----------------+-------------------+
|     12346|2011-01-18|   74215|           1|                1|              74215|
|     12346|2011-01-18|  -74215|           2|                2|              74215|
|     12347|2010-12-07|      36|           1|                1|                 36|
|     12347|2010-12-07|      30|           2|                2|                 36|
|     12347|2010-12-07|      24|           3|                3|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|             

In [21]:
from pyspark.sql.functions import col

dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank"),
    maxPurchaseQuantity.alias("maxPurchaseQuantity")).show()


+----------+----------+--------+------------+-----------------+-------------------+
|CustomerId|      date|Quantity|quantityRank|quantityDenseRank|maxPurchaseQuantity|
+----------+----------+--------+------------+-----------------+-------------------+
|     12346|2011-01-18|   74215|           1|                1|              74215|
|     12346|2011-01-18|  -74215|           2|                2|              74215|
|     12347|2010-12-07|      36|           1|                1|                 36|
|     12347|2010-12-07|      30|           2|                2|                 36|
|     12347|2010-12-07|      24|           3|                3|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|             

In [22]:
dfNoNull = dfWithDate.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")


In [25]:
rolledUpDF = dfNoNull.rollup("Date", "Country").agg(sum("Quantity"))\
  .selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity")\
  .orderBy("Date")
rolledUpDF.show()

+----------+--------------+--------------+
|      Date|       Country|total_quantity|
+----------+--------------+--------------+
|      null|          null|       5176450|
|2010-12-01|   Netherlands|            97|
|2010-12-01|          EIRE|           243|
|2010-12-01|     Australia|           107|
|2010-12-01|        Norway|          1852|
|2010-12-01|United Kingdom|         23949|
|2010-12-01|        France|           449|
|2010-12-01|          null|         26814|
|2010-12-01|       Germany|           117|
|2010-12-02|          EIRE|             4|
|2010-12-02|       Germany|           146|
|2010-12-02|          null|         21023|
|2010-12-02|United Kingdom|         20873|
|2010-12-03|         Spain|           400|
|2010-12-03|         Italy|           164|
|2010-12-03|       Belgium|           528|
|2010-12-03|          null|         14830|
|2010-12-03|   Switzerland|           110|
|2010-12-03|        France|           239|
|2010-12-03|      Portugal|            65|
+----------

In [28]:
pivoted = dfWithDate.groupBy("date").pivot("Country").sum()

In [32]:
pivoted.where("date > '2011-12-05'").select("date", "`USA_sum(Qunatity)`").show()

AnalysisException: "cannot resolve '`USA_sum(Qunatity)`' given input columns: [Hong Kong_sum(CAST(Quantity AS BIGINT)), United Arab Emirates_sum(UnitPrice), Canada_sum(CAST(CustomerID AS BIGINT)), Iceland_sum(CAST(CustomerID AS BIGINT)), Hong Kong_sum(UnitPrice), RSA_sum(CAST(CustomerID AS BIGINT)), Portugal_sum(CAST(Quantity AS BIGINT)), Channel Islands_sum(CAST(Quantity AS BIGINT)), Singapore_sum(CAST(CustomerID AS BIGINT)), Israel_sum(CAST(Quantity AS BIGINT)), United Arab Emirates_sum(CAST(Quantity AS BIGINT)), Switzerland_sum(CAST(CustomerID AS BIGINT)), Italy_sum(CAST(Quantity AS BIGINT)), Switzerland_sum(CAST(Quantity AS BIGINT)), Cyprus_sum(CAST(Quantity AS BIGINT)), Lebanon_sum(UnitPrice), Finland_sum(UnitPrice), Cyprus_sum(CAST(CustomerID AS BIGINT)), USA_sum(CAST(CustomerID AS BIGINT)), Italy_sum(UnitPrice), Germany_sum(UnitPrice), Austria_sum(CAST(Quantity AS BIGINT)), Brazil_sum(CAST(CustomerID AS BIGINT)), Unspecified_sum(UnitPrice), Norway_sum(UnitPrice), Saudi Arabia_sum(CAST(Quantity AS BIGINT)), Israel_sum(CAST(CustomerID AS BIGINT)), Czech Republic_sum(UnitPrice), USA_sum(CAST(Quantity AS BIGINT)), USA_sum(UnitPrice), Saudi Arabia_sum(CAST(CustomerID AS BIGINT)), RSA_sum(CAST(Quantity AS BIGINT)), Italy_sum(CAST(CustomerID AS BIGINT)), Finland_sum(CAST(Quantity AS BIGINT)), Iceland_sum(UnitPrice), Greece_sum(CAST(CustomerID AS BIGINT)), European Community_sum(CAST(Quantity AS BIGINT)), France_sum(CAST(Quantity AS BIGINT)), Lithuania_sum(UnitPrice), Singapore_sum(CAST(Quantity AS BIGINT)), Saudi Arabia_sum(UnitPrice), EIRE_sum(UnitPrice), Unspecified_sum(CAST(CustomerID AS BIGINT)), Brazil_sum(CAST(Quantity AS BIGINT)), Lithuania_sum(CAST(Quantity AS BIGINT)), Bahrain_sum(UnitPrice), Denmark_sum(CAST(Quantity AS BIGINT)), United Kingdom_sum(CAST(Quantity AS BIGINT)), Singapore_sum(UnitPrice), Hong Kong_sum(CAST(CustomerID AS BIGINT)), Belgium_sum(UnitPrice), France_sum(CAST(CustomerID AS BIGINT)), Greece_sum(UnitPrice), Germany_sum(CAST(CustomerID AS BIGINT)), Channel Islands_sum(CAST(CustomerID AS BIGINT)), Czech Republic_sum(CAST(CustomerID AS BIGINT)), Cyprus_sum(UnitPrice), Israel_sum(UnitPrice), Finland_sum(CAST(CustomerID AS BIGINT)), Norway_sum(CAST(Quantity AS BIGINT)), France_sum(UnitPrice), Belgium_sum(CAST(CustomerID AS BIGINT)), Spain_sum(CAST(Quantity AS BIGINT)), Netherlands_sum(CAST(CustomerID AS BIGINT)), EIRE_sum(CAST(Quantity AS BIGINT)), Austria_sum(UnitPrice), Sweden_sum(CAST(CustomerID AS BIGINT)), Unspecified_sum(CAST(Quantity AS BIGINT)), Lithuania_sum(CAST(CustomerID AS BIGINT)), Malta_sum(CAST(Quantity AS BIGINT)), Bahrain_sum(CAST(CustomerID AS BIGINT)), Australia_sum(UnitPrice), Bahrain_sum(CAST(Quantity AS BIGINT)), Austria_sum(CAST(CustomerID AS BIGINT)), date, United Kingdom_sum(UnitPrice), Malta_sum(UnitPrice), Spain_sum(CAST(CustomerID AS BIGINT)), Australia_sum(CAST(Quantity AS BIGINT)), Iceland_sum(CAST(Quantity AS BIGINT)), European Community_sum(UnitPrice), Australia_sum(CAST(CustomerID AS BIGINT)), Japan_sum(UnitPrice), Portugal_sum(CAST(CustomerID AS BIGINT)), Denmark_sum(UnitPrice), RSA_sum(UnitPrice), Spain_sum(UnitPrice), Lebanon_sum(CAST(Quantity AS BIGINT)), Belgium_sum(CAST(Quantity AS BIGINT)), Poland_sum(UnitPrice), Netherlands_sum(UnitPrice), European Community_sum(CAST(CustomerID AS BIGINT)), Channel Islands_sum(UnitPrice), United Arab Emirates_sum(CAST(CustomerID AS BIGINT)), Czech Republic_sum(CAST(Quantity AS BIGINT)), Canada_sum(UnitPrice), Malta_sum(CAST(CustomerID AS BIGINT)), Sweden_sum(CAST(Quantity AS BIGINT)), Lebanon_sum(CAST(CustomerID AS BIGINT)), Norway_sum(CAST(CustomerID AS BIGINT)), Netherlands_sum(CAST(Quantity AS BIGINT)), United Kingdom_sum(CAST(CustomerID AS BIGINT)), Switzerland_sum(UnitPrice), Poland_sum(CAST(CustomerID AS BIGINT)), Sweden_sum(UnitPrice), Japan_sum(CAST(CustomerID AS BIGINT)), Canada_sum(CAST(Quantity AS BIGINT)), Denmark_sum(CAST(CustomerID AS BIGINT)), Japan_sum(CAST(Quantity AS BIGINT)), Greece_sum(CAST(Quantity AS BIGINT)), EIRE_sum(CAST(CustomerID AS BIGINT)), Portugal_sum(UnitPrice), Germany_sum(CAST(Quantity AS BIGINT)), Poland_sum(CAST(Quantity AS BIGINT)), Brazil_sum(UnitPrice)];;\n'Project [date#1978, 'USA_sum(Qunatity)]\n+- Filter (cast(date#1978 as string) > 2011-12-05)\n   +- Project [date#1978, __pivot_sum(CAST(`Quantity` AS BIGINT)) AS `sum(CAST(``Quantity`` AS BIGINT))`#100254[0] AS Australia_sum(CAST(Quantity AS BIGINT))#100411L, __pivot_sum(`UnitPrice`) AS `sum(``UnitPrice``)`#100332[0] AS Australia_sum(UnitPrice)#100412, __pivot_sum(CAST(`CustomerID` AS BIGINT)) AS `sum(CAST(``CustomerID`` AS BIGINT))`#100410[0] AS Australia_sum(CAST(CustomerID AS BIGINT))#100413L, __pivot_sum(CAST(`Quantity` AS BIGINT)) AS `sum(CAST(``Quantity`` AS BIGINT))`#100254[1] AS Austria_sum(CAST(Quantity AS BIGINT))#100414L, __pivot_sum(`UnitPrice`) AS `sum(``UnitPrice``)`#100332[1] AS Austria_sum(UnitPrice)#100415, __pivot_sum(CAST(`CustomerID` AS BIGINT)) AS `sum(CAST(``CustomerID`` AS BIGINT))`#100410[1] AS Austria_sum(CAST(CustomerID AS BIGINT))#100416L, __pivot_sum(CAST(`Quantity` AS BIGINT)) AS `sum(CAST(``Quantity`` AS BIGINT))`#100254[2] AS Bahrain_sum(CAST(Quantity AS BIGINT))#100417L, __pivot_sum(`UnitPrice`) AS `sum(``UnitPrice``)`#100332[2] AS Bahrain_sum(UnitPrice)#100418, __pivot_sum(CAST(`CustomerID` AS BIGINT)) AS `sum(CAST(``CustomerID`` AS BIGINT))`#100410[2] AS Bahrain_sum(CAST(CustomerID AS BIGINT))#100419L, __pivot_sum(CAST(`Quantity` AS BIGINT)) AS `sum(CAST(``Quantity`` AS BIGINT))`#100254[3] AS Belgium_sum(CAST(Quantity AS BIGINT))#100420L, __pivot_sum(`UnitPrice`) AS `sum(``UnitPrice``)`#100332[3] AS Belgium_sum(UnitPrice)#100421, __pivot_sum(CAST(`CustomerID` AS BIGINT)) AS `sum(CAST(``CustomerID`` AS BIGINT))`#100410[3] AS Belgium_sum(CAST(CustomerID AS BIGINT))#100422L, __pivot_sum(CAST(`Quantity` AS BIGINT)) AS `sum(CAST(``Quantity`` AS BIGINT))`#100254[4] AS Brazil_sum(CAST(Quantity AS BIGINT))#100423L, __pivot_sum(`UnitPrice`) AS `sum(``UnitPrice``)`#100332[4] AS Brazil_sum(UnitPrice)#100424, __pivot_sum(CAST(`CustomerID` AS BIGINT)) AS `sum(CAST(``CustomerID`` AS BIGINT))`#100410[4] AS Brazil_sum(CAST(CustomerID AS BIGINT))#100425L, __pivot_sum(CAST(`Quantity` AS BIGINT)) AS `sum(CAST(``Quantity`` AS BIGINT))`#100254[5] AS Canada_sum(CAST(Quantity AS BIGINT))#100426L, __pivot_sum(`UnitPrice`) AS `sum(``UnitPrice``)`#100332[5] AS Canada_sum(UnitPrice)#100427, __pivot_sum(CAST(`CustomerID` AS BIGINT)) AS `sum(CAST(``CustomerID`` AS BIGINT))`#100410[5] AS Canada_sum(CAST(CustomerID AS BIGINT))#100428L, __pivot_sum(CAST(`Quantity` AS BIGINT)) AS `sum(CAST(``Quantity`` AS BIGINT))`#100254[6] AS Channel Islands_sum(CAST(Quantity AS BIGINT))#100429L, __pivot_sum(`UnitPrice`) AS `sum(``UnitPrice``)`#100332[6] AS Channel Islands_sum(UnitPrice)#100430, __pivot_sum(CAST(`CustomerID` AS BIGINT)) AS `sum(CAST(``CustomerID`` AS BIGINT))`#100410[6] AS Channel Islands_sum(CAST(CustomerID AS BIGINT))#100431L, __pivot_sum(CAST(`Quantity` AS BIGINT)) AS `sum(CAST(``Quantity`` AS BIGINT))`#100254[7] AS Cyprus_sum(CAST(Quantity AS BIGINT))#100432L, __pivot_sum(`UnitPrice`) AS `sum(``UnitPrice``)`#100332[7] AS Cyprus_sum(UnitPrice)#100433, ... 91 more fields]\n      +- Aggregate [date#1978], [date#1978, pivotfirst(Country#17, sum(CAST(`Quantity` AS BIGINT))#100174L, Australia, Austria, Bahrain, Belgium, Brazil, Canada, Channel Islands, Cyprus, Czech Republic, Denmark, EIRE, European Community, Finland, France, Germany, Greece, Hong Kong, Iceland, Israel, Italy, Japan, Lebanon, Lithuania, Malta, Netherlands, Norway, Poland, Portugal, RSA, Saudi Arabia, Singapore, Spain, Sweden, Switzerland, USA, United Arab Emirates, United Kingdom, Unspecified, 0, 0) AS __pivot_sum(CAST(`Quantity` AS BIGINT)) AS `sum(CAST(``Quantity`` AS BIGINT))`#100254, pivotfirst(Country#17, sum(`UnitPrice`)#100175, Australia, Austria, Bahrain, Belgium, Brazil, Canada, Channel Islands, Cyprus, Czech Republic, Denmark, EIRE, European Community, Finland, France, Germany, Greece, Hong Kong, Iceland, Israel, Italy, Japan, Lebanon, Lithuania, Malta, Netherlands, Norway, Poland, Portugal, RSA, Saudi Arabia, Singapore, Spain, Sweden, Switzerland, USA, United Arab Emirates, United Kingdom, Unspecified, 0, 0) AS __pivot_sum(`UnitPrice`) AS `sum(``UnitPrice``)`#100332, pivotfirst(Country#17, sum(CAST(`CustomerID` AS BIGINT))#100176L, Australia, Austria, Bahrain, Belgium, Brazil, Canada, Channel Islands, Cyprus, Czech Republic, Denmark, EIRE, European Community, Finland, France, Germany, Greece, Hong Kong, Iceland, Israel, Italy, Japan, Lebanon, Lithuania, Malta, Netherlands, Norway, Poland, Portugal, RSA, Saudi Arabia, Singapore, Spain, Sweden, Switzerland, USA, United Arab Emirates, United Kingdom, Unspecified, 0, 0) AS __pivot_sum(CAST(`CustomerID` AS BIGINT)) AS `sum(CAST(``CustomerID`` AS BIGINT))`#100410]\n         +- Aggregate [date#1978, Country#17], [date#1978, Country#17, sum(cast(Quantity#13 as bigint)) AS sum(CAST(`Quantity` AS BIGINT))#100174L, sum(UnitPrice#15) AS sum(`UnitPrice`)#100175, sum(cast(CustomerID#16 as bigint)) AS sum(CAST(`CustomerID` AS BIGINT))#100176L]\n            +- Project [InvoiceNo#10, StockCode#11, Description#12, Quantity#13, InvoiceDate#14, UnitPrice#15, CustomerID#16, Country#17, to_date('InvoiceDate, Some(MM/d/yyyy H:mm)) AS date#1978]\n               +- Repartition 5, false\n                  +- Relation[InvoiceNo#10,StockCode#11,Description#12,Quantity#13,InvoiceDate#14,UnitPrice#15,CustomerID#16,Country#17] csv\n"