# Random forests and collaborative filtering on bitcoin 
## Bitcoin data revisited 

The goal of this notebook is to use Latent Dirichlet Allocation as a collaborative filtering algorithm on bitcoin data. 

In [7]:
# Load the BTC/USDT candlesticks for September 2023 (chosen by the author
# because the price was rather stable during this period).
import pandas as pd

# Each row is one 15-minute candlestick. header=None makes pandas treat the
# first line of the file as data, so the column names are supplied explicitly.
df = pd.read_csv(
    "btc-data/BTCUSDT_15_2023-09-01_2023-09-30.csv",
    header=None,
    names=["time", "open", "high", "low", "close", "vol"],
)

# if on kaggle:
# df = pd.read_csv("/kaggle/input/btcusdt-15-2023-09-01-2023-09-30/BTCUSDT_15_2023-09-01_2023-09-30.csv", names=["time","open","high","low","close","vol"], header=None)

print(df.shape)
df.head()

(2881, 6)


Unnamed: 0,time,open,high,low,close,vol
0,2023.09.01 00:00,26009.3,26016.5,25627.2,25856.8,20063.967
1,2023.09.01 00:15,25856.8,26021.2,25840.1,25910.7,5158.731
2,2023.09.01 00:30,25910.7,25955.9,25888.7,25945.7,1300.917
3,2023.09.01 00:45,25945.7,26029.7,25945.0,26013.4,1616.798
4,2023.09.01 01:00,26013.4,26013.4,25962.2,25991.8,844.671


**Now we want to recreate this blog post:** <br/>
https://towardsdatascience.com/create-a-recommendation-system-based-on-time-series-data-using-latent-dirichlet-allocation-2aa141b99e19

To do this, we need to think about our data a bit differently: 
In the blog post, LDA was used to find groups of people who watch particular shows with a certain probability during a 24-hour period. 

Therefore, we need to ask a different question of the data: which price variation is most likely during which hour of the day? 

The next step is therefore to calculate, for each day, the normalized differences between the prices and that day's opening price. 

In [10]:
# Parse the raw "time" strings into pandas timestamps; the parsed series is
# kept in a local name so the .dt accessor can be reused below.
candle_time = pd.to_datetime(df["time"])
df["time"] = candle_time

# Derive the day of the month and the hour of the day for every candlestick.
df["day"] = candle_time.dt.day
df["hour"] = candle_time.dt.hour
df.head()

Unnamed: 0,time,open,high,low,close,vol,day,hour
0,2023-09-01 00:00:00,26009.3,26016.5,25627.2,25856.8,20063.967,1,0
1,2023-09-01 00:15:00,25856.8,26021.2,25840.1,25910.7,5158.731,1,0
2,2023-09-01 00:30:00,25910.7,25955.9,25888.7,25945.7,1300.917,1,0
3,2023-09-01 00:45:00,25945.7,26029.7,25945.0,26013.4,1616.798,1,0
4,2023-09-01 01:00:00,26013.4,26013.4,25962.2,25991.8,844.671,1,1


In [20]:
# Attach to every candlestick the opening price of its calendar day.
#
# The grouping key is the full calendar date (timestamp truncated to midnight)
# rather than the day-of-month number in df["day"]: day numbers repeat from one
# month to the next, so a row dated 1 October would otherwise be merged with
# 1 September and inherit the wrong opening price.
# pd.to_datetime is a no-op when "time" has already been parsed and keeps this
# cell working if it has not.
calendar_day = pd.to_datetime(df["time"]).dt.normalize()

# "first" picks the first "open" value of each group, which is the day's
# opening price as long as the rows are in chronological order.
df["daily_open"] = df.groupby(calendar_day)["open"].transform("first")
print(df.shape)
df.head()

(2881, 9)


Unnamed: 0,time,open,high,low,close,vol,day,hour,daily_open
0,2023-09-01 00:00:00,26009.3,26016.5,25627.2,25856.8,20063.967,1,0,26009.3
1,2023-09-01 00:15:00,25856.8,26021.2,25840.1,25910.7,5158.731,1,0,26009.3
2,2023-09-01 00:30:00,25910.7,25955.9,25888.7,25945.7,1300.917,1,0,26009.3
3,2023-09-01 00:45:00,25945.7,26029.7,25945.0,26013.4,1616.798,1,0,26009.3
4,2023-09-01 01:00:00,26013.4,26013.4,25962.2,25991.8,844.671,1,1,26009.3


In [29]:
# Percentage deviation of each close price from the day's opening price,
# expressed relative to that opening price.
# Sign convention: the value is positive when the close lies BELOW the daily
# open and negative when it lies above it.
open_minus_close = df["daily_open"].sub(df["close"])
df["perc_dev"] = open_minus_close.div(df["daily_open"]).mul(100)
print(df.shape)
df.head()

(2881, 10)


Unnamed: 0,time,open,high,low,close,vol,day,hour,daily_open,perc_dev
0,2023-09-01 00:00:00,26009.3,26016.5,25627.2,25856.8,20063.967,1,0,26009.3,0.586329
1,2023-09-01 00:15:00,25856.8,26021.2,25840.1,25910.7,5158.731,1,0,26009.3,0.379095
2,2023-09-01 00:30:00,25910.7,25955.9,25888.7,25945.7,1300.917,1,0,26009.3,0.244528
3,2023-09-01 00:45:00,25945.7,26029.7,25945.0,26013.4,1616.798,1,0,26009.3,-0.015764
4,2023-09-01 01:00:00,26013.4,26013.4,25962.2,25991.8,844.671,1,1,26009.3,0.067284


In [30]:
# now select day and perc_dev for clustering

In [21]:
# scikit-learn's Latent Dirichlet Allocation estimator, imported under the
# short alias LDA.
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Not run yet: `df_avg` is not defined in the cells shown above — TODO build it first.
# NOTE(review): scikit-learn's LDA expects a non-negative (count-like) feature
# matrix, while perc_dev contains negative values, so the deviations presumably
# need to be binned into counts before fitting — confirm.
#lda = LDA(n_components=3, learning_method="batch").fit(df_avg)