# Bitcoin Data with PySpark 101

In [21]:
import pandas as pd
import json
from datetime import datetime,date, time
import requests

In [86]:
print(pd.__version__)

1.2.4


In [4]:
import warnings
warnings.filterwarnings('ignore')

## PySpark Version

In [19]:
import pyspark
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [10]:
spark = SparkSession.builder.appName("bitcoin_pyspark_kraken").getOrCreate()
spark

### 1. Kraken Historical OHLCVT 

### Attemp to Fix Missing Last Row

#### Option 1

In [27]:
test1 = spark.read.csv("data/XBT_OHLCVT/XBTUSD_1440.csv", header=False)

In [28]:
test1.show(3)

+----------+------+------+------+------+------+---+
|       _c0|   _c1|   _c2|   _c3|   _c4|   _c5|_c6|
+----------+------+------+------+------+------+---+
|1381017600| 122.0| 122.0| 122.0| 122.0|   0.1|  1|
|1381104000|123.61|123.61|123.61|123.61|   0.1|  1|
|1381190400|123.91|124.19| 123.9|124.18|3.9916|  4|
+----------+------+------+------+------+------+---+
only showing top 3 rows



In [11]:
test1.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)



In [12]:
test1.count()

3178

In [13]:
test1.head(1)

[Row(_c0='1381017600', _c1='122.0', _c2='122.0', _c3='122.0', _c4='122.0', _c5='0.1', _c6='1')]

In [29]:
test1.tail(1)

[Row(_c0='1656547200', _c1='20094.0', _c2='20139.8', _c3='18617.5', _c4='19949.9', _c5='5391.5274635603', _c6='32136')]

In [30]:
test1 = test1.withColumnRenamed('_c0', "date")\
                 .withColumnRenamed('_c1', "open")\
                 .withColumnRenamed('_c2', "high")\
                 .withColumnRenamed('_c3', "low")\
                 .withColumnRenamed('_c4', "close")\
                 .withColumnRenamed('_c5', "volume")\
                 .withColumnRenamed('_c6', "trades")

In [31]:
test1.show(2)

+----------+------+------+------+------+------+------+
|      date|  open|  high|   low| close|volume|trades|
+----------+------+------+------+------+------+------+
|1381017600| 122.0| 122.0| 122.0| 122.0|   0.1|     1|
|1381104000|123.61|123.61|123.61|123.61|   0.1|     1|
+----------+------+------+------+------+------+------+
only showing top 2 rows



In [32]:
test1 = test1.withColumn("date", F.from_unixtime(F.col("date"), 'yyyy-MM-dd').cast("date"))

In [33]:
test1.tail(2)

[Row(date=datetime.date(2022, 6, 28), open='20252.5', high='20397.0', low='19828.7', close='20086.2', volume='4367.3190830902', trades='26340'),
 Row(date=datetime.date(2022, 6, 29), open='20094.0', high='20139.8', low='18617.5', close='19949.9', volume='5391.5274635603', trades='32136')]

In [34]:
test1.filter(test1.date == "2022-06-29").show()

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
+----------+-------+-------+-------+-------+---------------+------+



In [35]:
test1.filter(test1.date == "2022-06-30").show()

+----+----+----+---+-----+------+------+
|date|open|high|low|close|volume|trades|
+----+----+----+---+-----+------+------+
+----+----+----+---+-----+------+------+



#### Option 2

In [62]:
test2 = spark.read.csv("data/XBT_OHLCVT/XBTUSD_1440.csv", inferSchema=True)

In [63]:
test2.show(3)

+----------+------+------+------+------+------+---+
|       _c0|   _c1|   _c2|   _c3|   _c4|   _c5|_c6|
+----------+------+------+------+------+------+---+
|1381017600| 122.0| 122.0| 122.0| 122.0|   0.1|  1|
|1381104000|123.61|123.61|123.61|123.61|   0.1|  1|
|1381190400|123.91|124.19| 123.9|124.18|3.9916|  4|
+----------+------+------+------+------+------+---+
only showing top 3 rows



In [64]:
test2 = test2.withColumnRenamed('_c0', "date")\
                 .withColumnRenamed('_c1', "open")\
                 .withColumnRenamed('_c2', "high")\
                 .withColumnRenamed('_c3', "low")\
                 .withColumnRenamed('_c4', "close")\
                 .withColumnRenamed('_c5', "volume")\
                 .withColumnRenamed('_c6', "trades")

In [65]:
test2 = test2.withColumn("date", F.from_unixtime(F.col("date"), 'yyyy-MM-dd').cast("date"))
test2.show(3)

+----------+------+------+------+------+------+------+
|      date|  open|  high|   low| close|volume|trades|
+----------+------+------+------+------+------+------+
|2013-10-05| 122.0| 122.0| 122.0| 122.0|   0.1|     1|
|2013-10-06|123.61|123.61|123.61|123.61|   0.1|     1|
|2013-10-07|123.91|124.19| 123.9|124.18|3.9916|     4|
+----------+------+------+------+------+------+------+
only showing top 3 rows



In [66]:
test2.orderBy(test2.date.desc()).show(3)
                                     

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
|2022-06-28|20252.5|20397.0|19828.7|20086.2|4367.3190830902| 26340|
|2022-06-27|20716.3|21188.5|20165.3|20251.6|  3250.53358914| 23648|
+----------+-------+-------+-------+-------+---------------+------+
only showing top 3 rows



#### Option 3

In [79]:
data_schema = StructType(
              [StructField("date", StringType(), True),
               StructField("open", DoubleType(), True),
               StructField("high", DoubleType(), True),
               StructField("low", DoubleType(), True),
               StructField("close", DoubleType(), True),
               StructField("volume", DoubleType(), True),
               StructField("trades", IntegerType(), True)]
)

In [80]:
test3 = spark.read.csv("data/XBT_OHLCVT/XBTUSD_1440.csv", schema = data_schema, header=False)

In [81]:
test3.show(3)

+----------+------+------+------+------+------+------+
|      date|  open|  high|   low| close|volume|trades|
+----------+------+------+------+------+------+------+
|1381017600| 122.0| 122.0| 122.0| 122.0|   0.1|     1|
|1381104000|123.61|123.61|123.61|123.61|   0.1|     1|
|1381190400|123.91|124.19| 123.9|124.18|3.9916|     4|
+----------+------+------+------+------+------+------+
only showing top 3 rows



In [82]:
test3 = test3.withColumn("date", F.from_unixtime(F.col("date"), 'yyyy-MM-dd').cast("date"))
test3.show(3)

+----------+------+------+------+------+------+------+
|      date|  open|  high|   low| close|volume|trades|
+----------+------+------+------+------+------+------+
|2013-10-05| 122.0| 122.0| 122.0| 122.0|   0.1|     1|
|2013-10-06|123.61|123.61|123.61|123.61|   0.1|     1|
|2013-10-07|123.91|124.19| 123.9|124.18|3.9916|     4|
+----------+------+------+------+------+------+------+
only showing top 3 rows



In [83]:
test3.orderBy(test3.date.desc()).show(3)

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
|2022-06-28|20252.5|20397.0|19828.7|20086.2|4367.3190830902| 26340|
|2022-06-27|20716.3|21188.5|20165.3|20251.6|  3250.53358914| 23648|
+----------+-------+-------+-------+-------+---------------+------+
only showing top 3 rows



#### Option 4

In [3]:
xbt_cols = ["date","open", "high", "low", "close", "volume", "trades"]

In [9]:
xbt_csv = spark.read.options("inferSchema", "True").csv("btc_data/XBT_OHLCVT/XBTUSD_1440.csv")

In [10]:
xbt_csv.show(5)

+----------+---------+---------+------+------+------+---+
|       _c0|      _c1|      _c2|   _c3|   _c4|   _c5|_c6|
+----------+---------+---------+------+------+------+---+
|1381017600|    122.0|    122.0| 122.0| 122.0|   0.1|  1|
|1381104000|   123.61|   123.61|123.61|123.61|   0.1|  1|
|1381190400|   123.91|   124.19| 123.9|124.18|3.9916|  4|
|1381276800|124.01687|124.01687|123.84|123.84| 2.823|  3|
|1381363200|   125.85|   125.86|125.85|125.86|   2.0|  2|
+----------+---------+---------+------+------+------+---+
only showing top 5 rows



In [11]:
xbt_csv.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)
 |-- _c6: integer (nullable = true)



In [12]:
type(xbt_csv)

pyspark.sql.dataframe.DataFrame

In [13]:
xbt_csv.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6']

In [14]:
xbt_cols = ["date","open", "high", "low", "close", "volume", "trades"]

In [15]:
xbt_csv = xbt_csv.withColumnRenamed('_c0', "date")\
                 .withColumnRenamed('_c1', "open")\
                 .withColumnRenamed('_c2', "high")\
                 .withColumnRenamed('_c3', "low")\
                 .withColumnRenamed('_c4', "close")\
                 .withColumnRenamed('_c5', "volume")\
                 .withColumnRenamed('_c6', "trades")

In [16]:
xbt_csv.show(5)

+----------+---------+---------+------+------+------+------+
|      date|     open|     high|   low| close|volume|trades|
+----------+---------+---------+------+------+------+------+
|1381017600|    122.0|    122.0| 122.0| 122.0|   0.1|     1|
|1381104000|   123.61|   123.61|123.61|123.61|   0.1|     1|
|1381190400|   123.91|   124.19| 123.9|124.18|3.9916|     4|
|1381276800|124.01687|124.01687|123.84|123.84| 2.823|     3|
|1381363200|   125.85|   125.86|125.85|125.86|   2.0|     2|
+----------+---------+---------+------+------+------+------+
only showing top 5 rows



In [17]:
xbt_csv.printSchema()

root
 |-- date: integer (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- trades: integer (nullable = true)



### Change date format

In [18]:
import pyspark.sql.functions as F

In [19]:
# df = df.withColumn("end_time", F.from_unixtime(F.col("end_time"), 'yyyy-MM-dd HH:mm:ss.SS').cast("timestamp"))

xbt_csv = xbt_csv.withColumn("date", F.from_unixtime(F.col("date"), 'yyyy-MM-dd').cast("date"))

In [20]:
xbt_csv.count()

3178

In [21]:
xbt_csv.show(3)

+----------+------+------+------+------+------+------+
|      date|  open|  high|   low| close|volume|trades|
+----------+------+------+------+------+------+------+
|2013-10-05| 122.0| 122.0| 122.0| 122.0|   0.1|     1|
|2013-10-06|123.61|123.61|123.61|123.61|   0.1|     1|
|2013-10-07|123.91|124.19| 123.9|124.18|3.9916|     4|
+----------+------+------+------+------+------+------+
only showing top 3 rows



In [22]:
xbt_csv.printSchema()

root
 |-- date: date (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- trades: integer (nullable = true)



In [23]:
xbt_csv.tail(3)

[Row(date=datetime.date(2022, 6, 27), open=20716.3, high=21188.5, low=20165.3, close=20251.6, volume=3250.53358914, trades=23648),
 Row(date=datetime.date(2022, 6, 28), open=20252.5, high=20397.0, low=19828.7, close=20086.2, volume=4367.3190830902, trades=26340),
 Row(date=datetime.date(2022, 6, 29), open=20094.0, high=20139.8, low=18617.5, close=19949.9, volume=5391.5274635603, trades=32136)]

In [24]:
xbt_csv.select("date").tail(1)

[Row(date=datetime.date(2022, 6, 29))]

In [27]:
xbt_csv.orderBy(xbt_csv.date.desc()).show(5)

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
|2022-06-28|20252.5|20397.0|19828.7|20086.2|4367.3190830902| 26340|
|2022-06-27|20716.3|21188.5|20165.3|20251.6|  3250.53358914| 23648|
|2022-06-26|21023.4|21528.7|20505.5|20716.3|3973.8363769301| 22966|
|2022-06-25|21476.8|21867.0|20957.0|21023.3|   1913.1909771| 20649|
+----------+-------+-------+-------+-------+---------------+------+
only showing top 5 rows



**It should have included June 30**

### Another Option

In [31]:
from pyspark.sql.types import *

In [51]:
import datetime

In [52]:
data_schema = [StructField("date", DateType(), True),
               StructField("open", DoubleType(), True),
               StructField("high", DoubleType(), True),
               StructField("low", DoubleType(), True),
               StructField("close", DoubleType(), True),
               StructField("volume", DoubleType(), True),
               StructField("trades", IntegerType(), True)]

final_struc = StructType(fields=data_schema)

In [53]:
test = spark.read.csv("btc_data/XBT_OHLCVT/XBTUSD_1440.csv", schema=final_struc)

In [54]:
test.show(5)

+----+---------+---------+------+------+------+------+
|date|     open|     high|   low| close|volume|trades|
+----+---------+---------+------+------+------+------+
|null|    122.0|    122.0| 122.0| 122.0|   0.1|     1|
|null|   123.61|   123.61|123.61|123.61|   0.1|     1|
|null|   123.91|   124.19| 123.9|124.18|3.9916|     4|
|null|124.01687|124.01687|123.84|123.84| 2.823|     3|
|null|   125.85|   125.86|125.85|125.86|   2.0|     2|
+----+---------+---------+------+------+------+------+
only showing top 5 rows



In [56]:
test = spark.read.option("inferSchema","True").option("header","false").csv("btc_data/XBT_OHLCVT/XBTUSD_1440.csv")

In [57]:
test.show(3)

+----------+------+------+------+------+------+---+
|       _c0|   _c1|   _c2|   _c3|   _c4|   _c5|_c6|
+----------+------+------+------+------+------+---+
|1381017600| 122.0| 122.0| 122.0| 122.0|   0.1|  1|
|1381104000|123.61|123.61|123.61|123.61|   0.1|  1|
|1381190400|123.91|124.19| 123.9|124.18|3.9916|  4|
+----------+------+------+------+------+------+---+
only showing top 3 rows



In [58]:
test = test.withColumnRenamed('_c0', "date")\
                 .withColumnRenamed('_c1', "open")\
                 .withColumnRenamed('_c2', "high")\
                 .withColumnRenamed('_c3', "low")\
                 .withColumnRenamed('_c4', "close")\
                 .withColumnRenamed('_c5', "volume")\
                 .withColumnRenamed('_c6', "trades")

In [59]:
test.show(3)

+----------+------+------+------+------+------+------+
|      date|  open|  high|   low| close|volume|trades|
+----------+------+------+------+------+------+------+
|1381017600| 122.0| 122.0| 122.0| 122.0|   0.1|     1|
|1381104000|123.61|123.61|123.61|123.61|   0.1|     1|
|1381190400|123.91|124.19| 123.9|124.18|3.9916|     4|
+----------+------+------+------+------+------+------+
only showing top 3 rows



In [60]:
test = test.withColumn("date", F.from_unixtime(F.col("date"), 'yyyy-MM-dd').cast("date"))

In [61]:
test.printSchema()

root
 |-- date: date (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- trades: integer (nullable = true)



In [62]:
test.orderBy(test.date.desc()).show(5)

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
|2022-06-28|20252.5|20397.0|19828.7|20086.2|4367.3190830902| 26340|
|2022-06-27|20716.3|21188.5|20165.3|20251.6|  3250.53358914| 23648|
|2022-06-26|21023.4|21528.7|20505.5|20716.3|3973.8363769301| 22966|
|2022-06-25|21476.8|21867.0|20957.0|21023.3|   1913.1909771| 20649|
+----------+-------+-------+-------+-------+---------------+------+
only showing top 5 rows



In [67]:
test2 = spark.read.option("inferSchema","True").option("header","true").csv("btc_data/XBT_OHLCVT/XBTUSD_1440.csv")

In [68]:
test2 = test.withColumnRenamed('_c0', "date")\
                 .withColumnRenamed('_c1', "open")\
                 .withColumnRenamed('_c2', "high")\
                 .withColumnRenamed('_c3', "low")\
                 .withColumnRenamed('_c4', "close")\
                 .withColumnRenamed('_c5', "volume")\
                 .withColumnRenamed('_c6', "trades")

In [69]:
test2.show(3)

+----------+------+------+------+------+------+------+
|      date|  open|  high|   low| close|volume|trades|
+----------+------+------+------+------+------+------+
|2013-10-05| 122.0| 122.0| 122.0| 122.0|   0.1|     1|
|2013-10-06|123.61|123.61|123.61|123.61|   0.1|     1|
|2013-10-07|123.91|124.19| 123.9|124.18|3.9916|     4|
+----------+------+------+------+------+------+------+
only showing top 3 rows



In [71]:
test2.printSchema()

root
 |-- date: date (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- trades: integer (nullable = true)



In [72]:
#test2 = test2.withColumn("date", F.from_unixtime(F.col("date"), 'yyyy-MM-dd').cast("date"))

In [73]:
test2.orderBy(test2.date.desc()).show(3)

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
|2022-06-28|20252.5|20397.0|19828.7|20086.2|4367.3190830902| 26340|
|2022-06-27|20716.3|21188.5|20165.3|20251.6|  3250.53358914| 23648|
+----------+-------+-------+-------+-------+---------------+------+
only showing top 3 rows



In [None]:
test2 = spark.read.option("inferSchema","True").option("header","true").csv("btc_data/XBT_OHLCVT/XBTUSD_1440.csv")

### 2. Kraken API Market data OHLC endpoint
* https://docs.kraken.com/rest/#operation/getTickerInformation

In [28]:
import requests
import pandas as pd
import json
from datetime import datetime

In [29]:
def xbt_current_df():
    kraken_ohlc_cols = ["date","open", "high", "low", "close", "vwap","volume", "trades"]
    resp = requests.get('https://api.kraken.com/0/public/OHLC?pair=XBTUSD&interval=1440&since=unix_now').json()
    xbt_till_today_df = pd.DataFrame(resp["result"]['XXBTZUSD'])
    xbt_till_today_df.columns = kraken_ohlc_cols
    xbt_till_today_df["date"] = pd.to_datetime(xbt_till_today_df["date"],unit='s')
    xbt_till_today_df["date"] = xbt_till_today_df["date"].dt.date
    xbt_till_today_df = xbt_till_today_df[xbt_till_today_df["date"] > (datetime.strptime("2022-06-30", '%Y-%m-%d').date())]
    xbt_till_today_df.drop("vwap", axis=1, inplace=True)
    xbt_till_today_df.to_csv("btc_data/xbt_till_today_df.csv")
    return xbt_till_today_df


In [30]:
new_xbt_pd = xbt_current_df()
new_xbt_pd

Unnamed: 0,date,open,high,low,close,volume,trades
619,2022-07-01,19949.8,20878.3,18950.0,19245.3,5953.60277338,32011
620,2022-07-02,19250.8,19430.4,18969.6,19226.9,1293.38922751,15723
621,2022-07-03,19226.9,19684.3,18738.6,19293.5,2226.88548164,17685
622,2022-07-04,19292.8,20325.0,19033.6,20207.5,3479.37657412,20579
623,2022-07-05,20210.8,20732.1,19300.0,20169.1,5161.93595617,30157
...,...,...,...,...,...,...,...
715,2022-10-05,20340.9,20360.9,19754.3,20162.7,3851.61226863,19609
716,2022-10-06,20163.1,20449.8,19872.7,19955.8,3542.06193830,20390
717,2022-10-07,19955.8,20054.7,19338.4,19532.8,3678.17182192,19341
718,2022-10-08,19532.8,19615.0,19249.7,19417.5,1337.85077668,10018


In [42]:
len(new_xbt_pd)

99

### Create PySpark DataFrame from Pandas DataFrame

In [43]:
new_xbt_sdf = spark.createDataFrame(new_xbt_pd)

In [44]:
new_xbt_sdf.show(5)

+----------+-------+-------+-------+-------+-------------+------+
|      date|   open|   high|    low|  close|       volume|trades|
+----------+-------+-------+-------+-------+-------------+------+
|2022-07-01|19949.8|20878.3|18950.0|19245.3|5953.60277338| 32011|
|2022-07-02|19250.8|19430.4|18969.6|19226.9|1293.38922751| 15723|
|2022-07-03|19226.9|19684.3|18738.6|19293.5|2226.88548164| 17685|
|2022-07-04|19292.8|20325.0|19033.6|20207.5|3479.37657412| 20579|
|2022-07-05|20210.8|20732.1|19300.0|20169.1|5161.93595617| 30157|
+----------+-------+-------+-------+-------+-------------+------+
only showing top 5 rows



In [45]:
new_xbt_sdf.count()

99

In [46]:
xbt_csv.count()

3178

In [47]:
new_xbt_sdf.count() + xbt_csv.count()

3277

In [48]:
xbt_csv.union(new_xbt_sdf).count()

3277

### 3. Historical + Current Price

In [49]:
merged_xbt = xbt_csv.unionAll(new_xbt_sdf).distinct()
merged_xbt.count()

3277

In [50]:
merged_xbt.show(5)

+----------+---------+---------+---------+---------+----------------+------+
|      date|     open|     high|      low|    close|          volume|trades|
+----------+---------+---------+---------+---------+----------------+------+
|2015-01-19|    211.0|219.84384|  208.395|219.81882|      5.21586423|     9|
|2015-08-22|232.37909|232.37909|   226.07|228.13041|     52.24048713|    42|
|2015-12-10|417.83305|    445.0|417.83305|  443.119|    461.35822146|   419|
|2017-05-26| 2199.787|   2290.0|  1812.99|   2008.0|12534.3991475302| 34964|
|2017-07-28| 2808.971| 2808.999| 2678.588| 2715.999| 7068.3909887602| 17833|
+----------+---------+---------+---------+---------+----------------+------+
only showing top 5 rows



In [58]:
test_df = merged_xbt.filter(merged_xbt["date"] > "2022-06-25")
test_df.show()

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
|2022-06-27|20716.3|21188.5|20165.3|20251.6|  3250.53358914| 23648|
|2022-06-26|21023.4|21528.7|20505.5|20716.3|3973.8363769301| 22966|
|2022-06-28|20252.5|20397.0|19828.7|20086.2|4367.3190830902| 26340|
|2022-07-09|21580.9|21955.0|21316.7|21579.1|  1661.83466380| 18159|
|2022-07-04|19292.8|20325.0|19033.6|20207.5|  3479.37657412| 20579|
|2022-07-11|20830.3|20847.4|19870.5|19947.0|  3526.05246794| 26945|
|2022-07-12|19943.3|20036.3|19214.1|19307.9|  3986.29656040| 25331|
|2022-07-08|21617.6|22498.0|21183.9|21582.6|  5110.19342253| 32125|
|2022-07-01|19949.8|20878.3|18950.0|19245.3|  5953.60277338| 32011|
|2022-07-06|20169.0|20583.0|19750.1|20542.7|  3213.69715576| 23087|
|2022-07-10|21581.7|21590.6|20620.3|20830.3|  27

In [59]:
test_df.count()

103

In [93]:
test_df.filter(test_df.date == "2022-06-29").show()

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
+----------+-------+-------+-------+-------+---------------+------+



**Confirm that pyspark missed June 30 row**

In [94]:
test_df.filter(test_df.date.isin(["2022-06-29"])).show()

+----+----+----+---+-----+------+------+
|date|open|high|low|close|volume|trades|
+----+----+----+---+-----+------+------+
+----+----+----+---+-----+------+------+



In [99]:
li = ["2022-06-29"]

In [98]:
test_df.filter(test_df.date.isin(["2022-06-29"])).show()

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
+----------+-------+-------+-------+-------+---------------+------+



In [100]:
test_df.filter(test_df.date.isin(li)).show()

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
+----------+-------+-------+-------+-------+---------------+------+



In [101]:
test_df.filter(test_df.date.isin(["2022-06-30"])).show()

+----+----+----+---+-----+------+------+
|date|open|high|low|close|volume|trades|
+----+----+----+---+-----+------+------+
+----+----+----+---+-----+------+------+



**Reference for fixing the missing value(s)**
- https://kokes.github.io/blog/2019/07/09/losing-data-apache-spark.html

## Pandas Version

In [72]:
import numpy as np
import pandas as pd
import json
import datetime
import time
import requests

In [73]:
def xbt_historical_make_df():
    xbt_cols = ["date","open", "high", "low", "close", "volume", "trades"]
    xbt_historical_df = pd.read_csv("btc_data/XBT_OHLCVT/XBTUSD_1440.csv", names=xbt_cols)
    xbt_historical_df["date"] = pd.to_datetime(xbt_historical_df["date"],unit='s')
    xbt_historical_df.to_csv("btc_data/xbt_historical_df.csv")
    return(xbt_historical_df)
xbt_historical_make_df()

Unnamed: 0,date,open,high,low,close,volume,trades
0,2013-10-06,122.00000,122.00000,122.00,122.00,0.100000,1
1,2013-10-07,123.61000,123.61000,123.61,123.61,0.100000,1
2,2013-10-08,123.91000,124.19000,123.90,124.18,3.991600,4
3,2013-10-09,124.01687,124.01687,123.84,123.84,2.823000,3
4,2013-10-10,125.85000,125.86000,125.85,125.86,2.000000,2
...,...,...,...,...,...,...,...
3173,2022-06-26,21476.80000,21867.00000,20957.00,21023.30,1913.190977,20649
3174,2022-06-27,21023.40000,21528.70000,20505.50,20716.30,3973.836377,22966
3175,2022-06-28,20716.30000,21188.50000,20165.30,20251.60,3250.533589,23648
3176,2022-06-29,20252.50000,20397.00000,19828.70,20086.20,4367.319083,26340


In [74]:
len(xbt_historical_make_df())

3178

In [75]:
def xbt_current_df():
    kraken_ohlc_cols = ["date","open", "high", "low", "close", "vwap","volume", "trades"]
    resp = requests.get('https://api.kraken.com/0/public/OHLC?pair=XBTUSD&interval=1440&since=unix_now').json()
    xbt_till_today_df = pd.DataFrame(resp["result"]['XXBTZUSD'])
    xbt_till_today_df.columns = kraken_ohlc_cols
    xbt_till_today_df["date"] = pd.to_datetime(xbt_till_today_df["date"],unit='s')
    xbt_till_today_df.to_csv("btc_data/xbt_till_today_df.csv")
    return xbt_till_today_df
xbt_current_df()

Unnamed: 0,date,open,high,low,close,vwap,volume,trades
0,2020-10-18,11364.0,11515.1,11355.7,11508.9,11443.4,1062.87912435,5240
1,2020-10-19,11508.8,11838.3,11416.6,11757.0,11653.6,4364.70701426,14334
2,2020-10-20,11757.0,12070.0,11678.8,11925.4,11889.5,5301.91244632,17619
3,2020-10-21,11926.7,13241.4,11905.7,12813.0,12603.2,11497.59347437,38325
4,2020-10-22,12813.0,13199.0,12690.5,12984.1,12968.6,5337.43662325,22865
...,...,...,...,...,...,...,...,...
715,2022-10-03,19060.4,19700.0,18985.7,19639.6,19367.5,3507.05155930,20153
716,2022-10-04,19631.4,20475.0,19508.2,20347.9,20034.5,5668.96124874,24242
717,2022-10-05,20340.9,20360.9,19754.3,20162.7,20063.9,3851.61226863,19609
718,2022-10-06,20163.1,20449.8,19872.7,19955.8,20139.3,3542.06193830,20390


In [76]:
len(xbt_current_df())

720

In [77]:
def btc_price_df():
    xbt_historical_df = pd.read_csv("btc_data/xbt_historical_df.csv", index_col = 0)
    xbt_till_today_df = pd.read_csv("btc_data/xbt_till_today_df.csv", index_col = 0)
    xbt_till_today_df.drop("vwap", axis=1, inplace=True)
    
    xbt_till_today = xbt_till_today_df[xbt_till_today_df["date"] > "2022-06-30"]
    
    btc_df_raw = pd.concat([xbt_historical_df, xbt_till_today ],ignore_index=True)
    btc_df_raw = btc_df_raw.astype({"open":"float64", 
                                    "high":"float64", 
                                    "low":"float64", 
                                    "close":"float64", 
                                    "volume":"float64"})
    
    btc_df_raw.to_csv("btc_data/btc_df_raw.csv")
    
    return btc_df_raw

btc_price_df()

Unnamed: 0,date,open,high,low,close,volume,trades
0,2013-10-06,122.00000,122.00000,122.00,122.00,0.100000,1
1,2013-10-07,123.61000,123.61000,123.61,123.61,0.100000,1
2,2013-10-08,123.91000,124.19000,123.90,124.18,3.991600,4
3,2013-10-09,124.01687,124.01687,123.84,123.84,2.823000,3
4,2013-10-10,125.85000,125.86000,125.85,125.86,2.000000,2
...,...,...,...,...,...,...,...
3272,2022-10-03,19060.40000,19700.00000,18985.70,19639.60,3507.051559,20153
3273,2022-10-04,19631.40000,20475.00000,19508.20,20347.90,5668.961249,24242
3274,2022-10-05,20340.90000,20360.90000,19754.30,20162.70,3851.612269,19609
3275,2022-10-06,20163.10000,20449.80000,19872.70,19955.80,3542.061938,20390


In [78]:
len(btc_price_df())

3277

In [79]:
len(xbt_historical_make_df()) + len(xbt_current_df())

3898

In [84]:
btc_df_raw = pd.read_csv("btc_data/btc_df_raw.csv", index_col=0)
len(btc_df_raw)

3277

In [87]:
test_df2 = btc_df_raw[btc_df_raw["date"] > "2022-06-25" ]

In [88]:
len(test_df2)

104

In [108]:
btc_from_pd = pd.read_csv("btc_data/btc_df_raw.csv", index_col = 0)
btc_from_pd.head(3)

Unnamed: 0,date,open,high,low,close,volume,trades
0,2013-10-06,122.0,122.0,122.0,122.0,0.1,1
1,2013-10-07,123.61,123.61,123.61,123.61,0.1,1
2,2013-10-08,123.91,124.19,123.9,124.18,3.9916,4


In [109]:
btc_from_pd.tail(3)

Unnamed: 0,date,open,high,low,close,volume,trades
3274,2022-10-05,20340.9,20360.9,19754.3,20162.7,3851.612269,19609
3275,2022-10-06,20163.1,20449.8,19872.7,19955.8,3542.061938,20390
3276,2022-10-07,19955.8,20054.7,19947.0,19994.3,181.099084,2449


In [110]:
len(btc_from_pd)

3277

In [111]:
btc_spark_df = spark.createDataFrame(btc_from_pd)

In [112]:
btc_spark_df.count()

3277

In [114]:
btc_spark_df.filter(btc_spark_df.date == "2022-06-29").show()

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-29|20252.5|20397.0|19828.7|20086.2|4367.3190830902| 26340|
+----------+-------+-------+-------+-------+---------------+------+



In [115]:
btc_spark_df.filter(btc_spark_df.date == "2022-06-30").show()

+----------+-------+-------+-------+-------+---------------+------+
|      date|   open|   high|    low|  close|         volume|trades|
+----------+-------+-------+-------+-------+---------------+------+
|2022-06-30|20094.0|20139.8|18617.5|19949.9|5391.5274635603| 32136|
+----------+-------+-------+-------+-------+---------------+------+



In [116]:
spark.stop()