# DataFrames

In [1]:
# python imports are first in a file, after the shebang
# https://en.wikipedia.org/wiki/Shebang_(Unix)
import os
import pyspark

In [2]:
# lets start the spark session
# the entry point for an spark app is the SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("FirstApp").getOrCreate()

In [3]:
spark

## READ
Let's dive deeper into the spark.read API

copy the files locally

In [None]:
help(spark.read)

In [None]:
print(dir(spark.read))

In [None]:
# lets check the wd
os.path.abspath(os.getcwd())

### CSV
aka delimited text files

In [4]:
help(spark.read.csv)

Help on method csv in module pyspark.sql.readwriter:

csv(path, schema=None, sep=None, encoding=None, quote=None, escape=None, comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, samplingRatio=None, enforceSchema=None, emptyValue=None, locale=None, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None) method of pyspark.sql.readwriter.DataFrameReader instance
    Loads a CSV file and returns the result as a  :class:`DataFrame`.
    
    This function will go through the input once to determine the input schema if
    ``inferSchema`` is enabled. To avoid going through the entire data once, disable
    ``inferSchema`` option or specify the schema explicitly usin

In [5]:
print(dir(spark.read.csv))

['__call__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__func__', '__ge__', '__get__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__self__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__']


In [4]:
nasa_path = '../data/nasa_19950701.tsv'
df = spark.read.csv(nasa_path)

In [5]:
df.schema

StructType(List(StructField(_c0,StringType,true)))

In [6]:
df.head(20)

[Row(_c0='host\tlogname\ttime\tmethod\turl\tresponse\tbytes'),
 Row(_c0='199.72.81.55\t-\t804571201\tGET\t/history/apollo/\t200\t6245\t\t'),
 Row(_c0='unicomp6.unicomp.net\t-\t804571206\tGET\t/shuttle/countdown/\t200\t3985\t\t'),
 Row(_c0='199.120.110.21\t-\t804571209\tGET\t/shuttle/missions/sts-73/mission-sts-73.html\t200\t4085\t\t'),
 Row(_c0='burger.letters.com\t-\t804571211\tGET\t/shuttle/countdown/liftoff.html\t304\t0\t\t'),
 Row(_c0='199.120.110.21\t-\t804571211\tGET\t/shuttle/missions/sts-73/sts-73-patch-small.gif\t200\t4179\t\t'),
 Row(_c0='burger.letters.com\t-\t804571212\tGET\t/images/NASA-logosmall.gif\t304\t0\t\t'),
 Row(_c0='burger.letters.com\t-\t804571212\tGET\t/shuttle/countdown/video/livevideo.gif\t200\t0\t\t'),
 Row(_c0='205.212.115.106\t-\t804571212\tGET\t/shuttle/countdown/countdown.html\t200\t3985\t\t'),
 Row(_c0='d104.aa.net\t-\t804571213\tGET\t/shuttle/countdown/\t200\t3985\t\t'),
 Row(_c0='129.94.144.152\t-\t804571213\tGET\t/\t200\t7074\t\t'),
 Row(_c0='unicomp6

In [7]:
df = spark.read.csv(nasa_path, sep='\t', header=True, inferSchema=False)

In [8]:
df.head(20)

[Row(host='199.72.81.55', logname='-', time='804571201', method='GET', url='/history/apollo/', response='200', bytes='6245'),
 Row(host='unicomp6.unicomp.net', logname='-', time='804571206', method='GET', url='/shuttle/countdown/', response='200', bytes='3985'),
 Row(host='199.120.110.21', logname='-', time='804571209', method='GET', url='/shuttle/missions/sts-73/mission-sts-73.html', response='200', bytes='4085'),
 Row(host='burger.letters.com', logname='-', time='804571211', method='GET', url='/shuttle/countdown/liftoff.html', response='304', bytes='0'),
 Row(host='199.120.110.21', logname='-', time='804571211', method='GET', url='/shuttle/missions/sts-73/sts-73-patch-small.gif', response='200', bytes='4179'),
 Row(host='burger.letters.com', logname='-', time='804571212', method='GET', url='/images/NASA-logosmall.gif', response='304', bytes='0'),
 Row(host='burger.letters.com', logname='-', time='804571212', method='GET', url='/shuttle/countdown/video/livevideo.gif', response='200', 

In [9]:
df.schema

StructType(List(StructField(host,StringType,true),StructField(logname,StringType,true),StructField(time,StringType,true),StructField(method,StringType,true),StructField(url,StringType,true),StructField(response,StringType,true),StructField(bytes,StringType,true)))

In [10]:
df = spark.read.csv(nasa_path, sep='\t', header=True, inferSchema=True)

In [11]:
df.schema

StructType(List(StructField(host,StringType,true),StructField(logname,StringType,true),StructField(time,IntegerType,true),StructField(method,StringType,true),StructField(url,StringType,true),StructField(response,IntegerType,true),StructField(bytes,IntegerType,true)))

In [12]:
df.host

Column<b'host'>

In [13]:
# this is considered the safe way of referincing a column
df['host']

Column<b'host'>

In [14]:
df.select(df.host).take(5)

[Row(host='199.72.81.55'),
 Row(host='unicomp6.unicomp.net'),
 Row(host='199.120.110.21'),
 Row(host='burger.letters.com'),
 Row(host='199.120.110.21')]

In [15]:
df.select('host').take(5)

[Row(host='199.72.81.55'),
 Row(host='unicomp6.unicomp.net'),
 Row(host='199.120.110.21'),
 Row(host='burger.letters.com'),
 Row(host='199.120.110.21')]

In [16]:
df.show()

+--------------------+-------+---------+------+--------------------+--------+-----+
|                host|logname|     time|method|                 url|response|bytes|
+--------------------+-------+---------+------+--------------------+--------+-----+
|        199.72.81.55|      -|804571201|   GET|    /history/apollo/|     200| 6245|
|unicomp6.unicomp.net|      -|804571206|   GET| /shuttle/countdown/|     200| 3985|
|      199.120.110.21|      -|804571209|   GET|/shuttle/missions...|     200| 4085|
|  burger.letters.com|      -|804571211|   GET|/shuttle/countdow...|     304|    0|
|      199.120.110.21|      -|804571211|   GET|/shuttle/missions...|     200| 4179|
|  burger.letters.com|      -|804571212|   GET|/images/NASA-logo...|     304|    0|
|  burger.letters.com|      -|804571212|   GET|/shuttle/countdow...|     200|    0|
|     205.212.115.106|      -|804571212|   GET|/shuttle/countdow...|     200| 3985|
|         d104.aa.net|      -|804571213|   GET| /shuttle/countdown/|     200

In [17]:
df.show(50)

+--------------------+-------+---------+------+--------------------+--------+------+
|                host|logname|     time|method|                 url|response| bytes|
+--------------------+-------+---------+------+--------------------+--------+------+
|        199.72.81.55|      -|804571201|   GET|    /history/apollo/|     200|  6245|
|unicomp6.unicomp.net|      -|804571206|   GET| /shuttle/countdown/|     200|  3985|
|      199.120.110.21|      -|804571209|   GET|/shuttle/missions...|     200|  4085|
|  burger.letters.com|      -|804571211|   GET|/shuttle/countdow...|     304|     0|
|      199.120.110.21|      -|804571211|   GET|/shuttle/missions...|     200|  4179|
|  burger.letters.com|      -|804571212|   GET|/images/NASA-logo...|     304|     0|
|  burger.letters.com|      -|804571212|   GET|/shuttle/countdow...|     200|     0|
|     205.212.115.106|      -|804571212|   GET|/shuttle/countdow...|     200|  3985|
|         d104.aa.net|      -|804571213|   GET| /shuttle/countdow

In [18]:
df.drop('time').show()

+--------------------+-------+------+--------------------+--------+-----+
|                host|logname|method|                 url|response|bytes|
+--------------------+-------+------+--------------------+--------+-----+
|        199.72.81.55|      -|   GET|    /history/apollo/|     200| 6245|
|unicomp6.unicomp.net|      -|   GET| /shuttle/countdown/|     200| 3985|
|      199.120.110.21|      -|   GET|/shuttle/missions...|     200| 4085|
|  burger.letters.com|      -|   GET|/shuttle/countdow...|     304|    0|
|      199.120.110.21|      -|   GET|/shuttle/missions...|     200| 4179|
|  burger.letters.com|      -|   GET|/images/NASA-logo...|     304|    0|
|  burger.letters.com|      -|   GET|/shuttle/countdow...|     200|    0|
|     205.212.115.106|      -|   GET|/shuttle/countdow...|     200| 3985|
|         d104.aa.net|      -|   GET| /shuttle/countdown/|     200| 3985|
|      129.94.144.152|      -|   GET|                   /|     200| 7074|
|unicomp6.unicomp.net|      -|   GET|/

In [19]:
df.schema

StructType(List(StructField(host,StringType,true),StructField(logname,StringType,true),StructField(time,IntegerType,true),StructField(method,StringType,true),StructField(url,StringType,true),StructField(response,IntegerType,true),StructField(bytes,IntegerType,true)))

In [20]:
df2 = df.drop('time')
df2.schema

StructType(List(StructField(host,StringType,true),StructField(logname,StringType,true),StructField(method,StringType,true),StructField(url,StringType,true),StructField(response,IntegerType,true),StructField(bytes,IntegerType,true)))

In [21]:
type(df2.schema)

pyspark.sql.types.StructType

In [22]:
df.printSchema()

root
 |-- host: string (nullable = true)
 |-- logname: string (nullable = true)
 |-- time: integer (nullable = true)
 |-- method: string (nullable = true)
 |-- url: string (nullable = true)
 |-- response: integer (nullable = true)
 |-- bytes: integer (nullable = true)



In [23]:
print(dir(pyspark.sql.types))



In [24]:
df.schema.fieldNames()

['host', 'logname', 'time', 'method', 'url', 'response', 'bytes']

In [25]:
df.schema.names

['host', 'logname', 'time', 'method', 'url', 'response', 'bytes']

In [28]:
print(type(df.schema.names), type(df.schema.fieldNames()))

<class 'list'> <class 'list'>


In [26]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, DateConverter, DateType, IntegerType

# alternatively, you can use
# from pyspark.sql.types import *

In [27]:
our_schema = StructType([
    # StructField("column_name", columnType(), Nullable),
    StructField("Host", StringType(), False),
    StructField("Logname", StringType(), True),
    StructField("Time", IntegerType(), False),
    StructField("Method", StringType(), True),
    StructField("URL", StringType(), True),
    StructField("Response", StringType(), True),
    # StructField("Bytes", StringType(), True),
])

In [28]:
df.schema == our_schema

False

In [29]:
df = spark.read.csv(nasa_path, sep='\t', header=True, schema=our_schema)

In [30]:
df.printSchema()

root
 |-- Host: string (nullable = true)
 |-- Logname: string (nullable = true)
 |-- Time: integer (nullable = true)
 |-- Method: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- Response: string (nullable = true)



In [31]:
df.head(5)

[Row(Host='199.72.81.55', Logname='-', Time=804571201, Method='GET', URL='/history/apollo/', Response='200'),
 Row(Host='unicomp6.unicomp.net', Logname='-', Time=804571206, Method='GET', URL='/shuttle/countdown/', Response='200'),
 Row(Host='199.120.110.21', Logname='-', Time=804571209, Method='GET', URL='/shuttle/missions/sts-73/mission-sts-73.html', Response='200'),
 Row(Host='burger.letters.com', Logname='-', Time=804571211, Method='GET', URL='/shuttle/countdown/liftoff.html', Response='304'),
 Row(Host='199.120.110.21', Logname='-', Time=804571211, Method='GET', URL='/shuttle/missions/sts-73/sts-73-patch-small.gif', Response='200')]

In [32]:
our_schema_with_extra_column = StructType([
    # StructField("column_name", columnType(), Nullable),
    StructField("Host", StringType(), False),
    StructField("Logname", StringType(), True),
    StructField("Time", IntegerType(), False),
    StructField("Method", StringType(), True),
    StructField("URL", StringType(), True),
    StructField("Response", StringType(), True),
    StructField("Bytes", StringType(), True),
    StructField("Missing", StringType(), True),
])

In [33]:
df = spark.read.csv(nasa_path, sep='\t', header=True, schema=our_schema_with_extra_column)

In [34]:
df.printSchema()

root
 |-- Host: string (nullable = true)
 |-- Logname: string (nullable = true)
 |-- Time: integer (nullable = true)
 |-- Method: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- Response: string (nullable = true)
 |-- Bytes: string (nullable = true)
 |-- Missing: string (nullable = true)



In [35]:
df.show()

+--------------------+-------+---------+------+--------------------+--------+-----+-------+
|                Host|Logname|     Time|Method|                 URL|Response|Bytes|Missing|
+--------------------+-------+---------+------+--------------------+--------+-----+-------+
|        199.72.81.55|      -|804571201|   GET|    /history/apollo/|     200| 6245|   null|
|unicomp6.unicomp.net|      -|804571206|   GET| /shuttle/countdown/|     200| 3985|   null|
|      199.120.110.21|      -|804571209|   GET|/shuttle/missions...|     200| 4085|   null|
|  burger.letters.com|      -|804571211|   GET|/shuttle/countdow...|     304|    0|   null|
|      199.120.110.21|      -|804571211|   GET|/shuttle/missions...|     200| 4179|   null|
|  burger.letters.com|      -|804571212|   GET|/images/NASA-logo...|     304|    0|   null|
|  burger.letters.com|      -|804571212|   GET|/shuttle/countdow...|     200|    0|   null|
|     205.212.115.106|      -|804571212|   GET|/shuttle/countdow...|     200| 39

In [36]:
df.describe().show()

+-------+--------------------+-------+-------------------+------+---------------+------------------+------------------+-------+
|summary|                Host|Logname|               Time|Method|            URL|          Response|             Bytes|Missing|
+-------+--------------------+-------+-------------------+------+---------------+------------------+------------------+-------+
|  count|                9999|   9999|               9999|  9999|           9999|              9999|              9999|      0|
|   mean|                null|   null|8.045768773637364E8|  null|           null|210.66426642664266|24649.759175917592|   null|
| stddev|                null|   null|  3636.201300821906|  null|           null|  32.9404221059951| 83860.87366060312|   null|
|    min|     128.187.140.171|      -|          804571201|   GET|              /|               200|                 0|   null|
|    max|zzsbtafe.slip.cc....|      -|          804584384|  POST|/whats-new.html|               404|    

In [37]:
df.describe('host').show()

+-------+--------------------+
|summary|                host|
+-------+--------------------+
|  count|                9999|
|   mean|                null|
| stddev|                null|
|    min|     128.187.140.171|
|    max|zzsbtafe.slip.cc....|
+-------+--------------------+



lets read another file

In [38]:
nasa_path = '../data/prime_nums.text'
df = spark.read.csv(nasa_path) # , sep='\t', header=True, inferSchema=True)

In [39]:
df.show(20)

+--------------------+
|                 _c0|
+--------------------+
|  2	  3	  5	  7	 ...|
| 31	 37	 41	 43	 ...|
| 73	 79	 83	 89	 ...|
|127	131	137	139	1...|
|179	181	191	193	1...|
|233	239	241	251	2...|
|283	293	307	311	3...|
|353	359	367	373	3...|
|419	421	431	433	4...|
|467	479	487	491	4...|
+--------------------+



## JSON

In [41]:
jp = '../data/resource_hvrh-b6nb.json'

In [42]:
df = spark.read.json(jp)

In [43]:
df.printSchema()

root
 |-- dropoff_latitude: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- ratecodeid: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- trip_type: string (nullable = true)
 |-- vendorid: string (nullable = true)



In [44]:
df.count()

1000

In [45]:
df.dropDuplicates().count()

1000

In [46]:
df.select('tolls_amount').show(20)

+------------+
|tolls_amount|
+------------+
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
|           0|
+------------+
only showing top 20 rows



In [47]:
ag =df.groupby('vendorid','ratecodeid').agg({'fare_amount': 'sum'}).alias('fare_sum')

In [48]:
ag.show()

+--------+----------+----------------+
|vendorid|ratecodeid|sum(fare_amount)|
+--------+----------+----------------+
|       2|         2|            52.0|
|       2|         4|            16.0|
|       2|         1|         12123.5|
|       2|         5|           262.0|
+--------+----------+----------------+



In [49]:
ag.withColumnRenamed('sum(fare_amount)', 'fare_sum').printSchema()

root
 |-- vendorid: string (nullable = true)
 |-- ratecodeid: string (nullable = true)
 |-- fare_sum: double (nullable = true)



In [50]:
import pyspark.sql.functions as psql

In [51]:
print(dir(psql))



In [53]:
from pyspark.sql import Window
window = Window.orderBy("ratecodeid").partitionBy("vendorid")
ag.withColumn('rn', psql.row_number().over(window)).show()

+--------+----------+----------------+---+
|vendorid|ratecodeid|sum(fare_amount)| rn|
+--------+----------+----------------+---+
|       2|         1|         12123.5|  1|
|       2|         2|            52.0|  2|
|       2|         4|            16.0|  3|
|       2|         5|           262.0|  4|
+--------+----------+----------------+---+



In [54]:
df.registerTempTable('data')

In [55]:
spark.sql('select * from data').show()

+------------------+-------------------+-----+-----------+---------------------+---------------------+--------------------+-------+---------------+------------+------------------+-------------------+----------+------------------+----------+------------+------------+-------------+---------+--------+
|  dropoff_latitude|  dropoff_longitude|extra|fare_amount|improvement_surcharge|lpep_dropoff_datetime|lpep_pickup_datetime|mta_tax|passenger_count|payment_type|   pickup_latitude|   pickup_longitude|ratecodeid|store_and_fwd_flag|tip_amount|tolls_amount|total_amount|trip_distance|trip_type|vendorid|
+------------------+-------------------+-----+-----------+---------------------+---------------------+--------------------+-------+---------------+------------+------------------+-------------------+----------+------------------+----------+------------+------------+-------------+---------+--------+
|40.698043823242188|-73.924278259277344|  0.5|          8|                  0.3| 2016-01-01T00:39:..

In [56]:
spark.sql('select vendorid, sum(fare_amount) as fare_sum from data group by vendorid').show()

+--------+--------+
|vendorid|fare_sum|
+--------+--------+
|       2| 12453.5|
+--------+--------+



In [79]:
spark.sql('select vendorid, sum(fare_amount) as fare_sum, row_number() as rn from data group by vendorid').show()

Py4JJavaError: An error occurred while calling o240.showString.
: java.lang.UnsupportedOperationException: Cannot generate code for expression: row_number()
	at org.apache.spark.sql.catalyst.expressions.Unevaluable.doGenCode(Expression.scala:304)
	at org.apache.spark.sql.catalyst.expressions.Unevaluable.doGenCode$(Expression.scala:303)
	at org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate.doGenCode(interfaces.scala:369)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:146)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:141)
	at org.apache.spark.sql.catalyst.expressions.CastBase.doGenCode(Cast.scala:828)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:146)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:141)
	at org.apache.spark.sql.catalyst.expressions.CastBase.genCode(Cast.scala:823)
	at org.apache.spark.sql.catalyst.expressions.Alias.genCode(namedExpressions.scala:159)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.$anonfun$generateResultFunction$5(HashAggregateExec.scala:575)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at scala.collection.TraversableLike.map(TraversableLike.scala:238)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
	at scala.collection.immutable.List.map(List.scala:298)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.generateResultFunction(HashAggregateExec.scala:575)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduceWithKeys(HashAggregateExec.scala:762)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduce(HashAggregateExec.scala:169)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.produce(HashAggregateExec.scala:48)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:632)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:692)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:316)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:434)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2697)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2697)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2904)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
	at jdk.internal.reflect.GeneratedMethodAccessor70.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)


In [57]:
df.write.parquet('out/df.parquet', mode='overwrite')

In [59]:
df.write.csv('out/df.csv', mode='overwrite')

In [61]:
df.explain()

== Physical Plan ==
FileScan json [dropoff_latitude#902,dropoff_longitude#903,extra#904,fare_amount#905,improvement_surcharge#906,lpep_dropoff_datetime#907,lpep_pickup_datetime#908,mta_tax#909,passenger_count#910,payment_type#911,pickup_latitude#912,pickup_longitude#913,ratecodeid#914,store_and_fwd_flag#915,tip_amount#916,tolls_amount#917,total_amount#918,trip_distance#919,trip_type#920,vendorid#921] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex[file:/D:/das_2021/data/resource_hvrh-b6nb.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<dropoff_latitude:string,dropoff_longitude:string,extra:string,fare_amount:string,improveme...




In [62]:
df.rdd.getNumPartitions()

1

In [63]:
df = df.repartition(4)