Date,Open,High,Low,Close,Volume,Currency
2000-01-03,122.25,124.0,116.1,116.5,6640,USD

In [23]:
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf      # sf = spark functions
import pyspark.sql.types as st 
from pyspark.sql.functions import udf

In [11]:
spark = SparkSession.builder.getOrCreate()

In [12]:
data_file = "./data/coffee.csv"

In [13]:
# coffee_df = spark.read.csv(data_file, header=True)

In [14]:
# coffee_df.printSchema()

In [15]:
schema = "Date string, Open float, High float,Low float, Close float, Volume float, Currency string"

In [16]:
coffee_df = spark.read.csv(data_file, schema=schema, header=True, enforceSchema=True)

In [17]:
coffee_df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Volume: float (nullable = true)
 |-- Currency: string (nullable = true)



In [18]:
coffee_df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Currency']

In [19]:
# Add a column to the DataFrame where the values are the difference between 'Open' and 'Close'.
coffee_df = coffee_df.withColumn('open_to_close', (coffee_df.Open - coffee_df.Close))

In [20]:
coffee_df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Currency', 'open_to_close']

In [21]:
coffee_df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Volume: float (nullable = true)
 |-- Currency: string (nullable = true)
 |-- open_to_close: float (nullable = true)



In [28]:
def to_abs(diff: float):
    return float(abs(diff))

to_abs_udf = udf(to_abs)

In [29]:
# Once you have a column for the difference between 'Open' and 'Close', add another column that contains the absolute values of the numbers in that column.
coffee_df = coffee_df.withColumn('abs_open_to_close', (to_abs_udf(coffee_df.open_to_close)))

In [31]:
coffee_df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Volume: float (nullable = true)
 |-- Currency: string (nullable = true)
 |-- open_to_close: float (nullable = true)
 |-- abs_open_to_close: string (nullable = true)



In [34]:
coffee_df.describe().show()

                                                                                

+-------+----------+------------------+------------------+------------------+------------------+-----------------+--------+-------------------+------------------+
|summary|      Date|              Open|              High|               Low|             Close|           Volume|Currency|      open_to_close| abs_open_to_close|
+-------+----------+------------------+------------------+------------------+------------------+-----------------+--------+-------------------+------------------+
|  count|      5674|              5674|              5674|              5674|              5674|             5674|    5674|               5674|              5674|
|   mean|      null| 126.0496775257701|127.60635527515646|124.58774245688728|125.99223296105295|8749.680472329926|    null|0.05744456471714454|1.7606027822995378|
| stddev|      null|49.699819927093294| 50.26890009357301| 48.99939724363535|49.624084304058016|9593.583236499931|    null| 2.5976646412465607| 1.910731140670376|
|    min|2000-01-03|  

In [32]:
coffee_df.head(20)

                                                                                

[Row(Date='2000-01-03', Open=122.25, High=124.0, Low=116.0999984741211, Close=116.5, Volume=6640.0, Currency='USD', open_to_close=5.75, abs_open_to_close='5.75'),
 Row(Date='2000-01-04', Open=116.25, High=120.5, Low=115.75, Close=116.25, Volume=5492.0, Currency='USD', open_to_close=0.0, abs_open_to_close='0.0'),
 Row(Date='2000-01-05', Open=115.0, High=121.0, Low=115.0, Close=118.5999984741211, Volume=6165.0, Currency='USD', open_to_close=-3.5999984741210938, abs_open_to_close='3.5999984741210938'),
 Row(Date='2000-01-06', Open=119.0, High=121.4000015258789, Low=116.5, Close=116.8499984741211, Volume=5094.0, Currency='USD', open_to_close=2.1500015258789062, abs_open_to_close='2.1500015258789062'),
 Row(Date='2000-01-07', Open=117.25, High=117.75, Low=113.80000305175781, Close=114.1500015258789, Volume=6855.0, Currency='USD', open_to_close=3.0999984741210938, abs_open_to_close='3.0999984741210938'),
 Row(Date='2000-01-10', Open=123.5, High=126.0, Low=116.69999694824219, Close=117.550003