In [None]:
%run ./00_notebook_with_setup.ipynb

In [None]:
# Approximate quantiles for one relative error (a relative error of 0.0 gives you the exact quantiles)

probabilities = [0.0, 0.25, 0.5, 0.75, 1.0]  # [min, Q1, median, Q3, max]
relativeError = 0.1

approxQuantiles = data.stat.approxQuantile("Salary", probabilities, relativeError)

# Show every Salary value (iterated locally), then a blank line, then the quantiles for comparison
print(
    [record["Salary"] for record in data.toLocalIterator()],
    "",
    approxQuantiles,
    sep="\n",
)

In [None]:
# Repeat the quantile computation for several relative errors and tabulate the results side by side

probabilities = [0.0, 0.25, 0.5, 0.75, 1.0]

results = []
for relativeError in (1.0, 0.5, 0.1, 0.0):
    approxQuantiles = data.stat.approxQuantile("Salary", probabilities, relativeError)
    # One table row per relative error: the error itself followed by its quantiles
    results.append([relativeError, *approxQuantiles])

display(pandas.DataFrame(results, columns=["relativeError", *probabilities]))

In [None]:
# A dangerous alternative to .head(): .collect() materialises every row locally

rows = data.collect()

# Container type and number of rows, each followed by a blank line
print(type(rows), "", len(rows), "", sep="\n")

# Then each collected row on its own line
for row in rows:
    print(row)

In [None]:
# A more dangerous alternative to .collect() as the conversion to a pandas dataframe involves copying

data_local = data.toPandas()

# Plain-text view first: the object's type, a blank line, the printed frame, a blank line
print(type(data_local), "", data_local, "", sep="\n")

# Rich notebook rendering of the same frame
display(data_local)

In [None]:
# A safe and convenient use of .toPandas(): apply .limit() first so only 5 rows are converted

display(
    data
    .limit(5)
    .toPandas()
)

In [None]:
# A safe way to iterate over rows one by one locally
# (.toLocalIterator() feeds this loop one row at a time, in contrast to the list that .collect() returns)

for row in data.toLocalIterator():
    # Print each row as it is received
    print(row)

In [None]:
# Apply a function to each row or partition without having to run .collect()

def rowSink(row):
    """Handle a single row by printing it.

    The print is a placeholder: a real sink would write the row to somewhere
    external, e.g. a database or a message queue.
    """
    print(row)

def partitionSink(rows):
    """Handle a batch of rows by printing each one in iteration order.

    `rows` is consumed once. The print is a placeholder: a real sink would write
    each row to somewhere external, e.g. a database or a message queue.
    """
    for record in rows:
        print(record)

# Apply rowSink to each row of the DataFrame
data.foreach(rowSink)
# Apply partitionSink to each partition, handing it an iterable of that partition's rows
# NOTE(review): on a cluster these functions presumably run on the executors, so the printed
# output may end up in executor logs rather than in the notebook — confirm
data.foreachPartition(partitionSink)

In [None]:
# DataFrameWriter, shorthand form (note that there are multiple ways to write this but the result is the same)
# mode="overwrite" replaces any earlier output at data_path so this cell can be re-run;
# without it the default save mode raises an error when the path already exists.

data_path = "hdfs:///tmp/data/"

data.write.csv(data_path, mode="overwrite")

In [None]:
# List the contents of /tmp/data/ using the hdfs command-line client
!hdfs dfs -ls /tmp/data/

In [None]:
# DataFrameWriter, generic form (note that there are multiple ways to write this but the result is the same)
# Here the format, save mode and compression option are all passed to a single .save() call.

data_path = f"hdfs:///user/{username}/data/"

data.write.save(
    data_path,
    format="csv",
    mode="overwrite",
    compression="gzip",
)

In [None]:
# List the contents of the user's data directory using the hdfs command-line client;
# {username} is filled in from the Python variable of that name before the command runs
!hdfs dfs -ls /user/{username}/data/