# **Pip install**

# **Importing Libraries**

In [1]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import yfinance as yf
from scipy.interpolate import interp1d, splev, splrep
from scipy.interpolate import UnivariateSpline
from sklearn.impute import SimpleImputer
from statsmodels.tsa.seasonal import STL
import numpy as np
from datetime import datetime, timedelta

# **Creating Dataset**

We are getting a dataset from kaggle where we can reach tweets, dates, and author's followers, adding this dataset 1 week later column for later, also making index as date because we are combine this dataset and bitcoin price dataset after.

In [2]:
chunk = pd.read_csv('Bitcoin_tweets.csv',chunksize=100000,lineterminator='\n',low_memory=False)
df = pd.concat(chunk)
df = df[["user_name","user_followers","text","date"]]
df = df.convert_dtypes()
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df.set_index('date', inplace=True)
df = df.sort_index()
df["1_week_later"] = df.index + timedelta(days=7)
df.shape

(4689288, 4)

# **Filtering**

We use filtering because BRENT cost LOTSSS OF TIME 

In [3]:
df['user_followers'] = pd.to_numeric(df['user_followers'], errors='coerce')

df = df[~df["user_name"].str.contains("binance")]
df = df[~df["user_name"].str.contains("coinbase")]
df = df[~df["user_name"].str.contains("huobi")]
df = df[~df["user_name"].str.contains("kraken")]
df = df[~df["user_name"].str.contains("bitfinex")]
df = df[~df["user_name"].str.contains("okex")]
df = df[~df["user_name"].str.contains("gemini")]
df = df[~df["user_name"].str.contains("kucoin")]
df = df[df['text'].str.contains('long|short|buy|sell|over|up|down|support|resistance|reversal|breakdown|break|bull|bear|ban|whale|forecast', case=False, regex=True)]
df = df[~df["text"].str.contains("t.co")]

df = df[df['user_followers'] >= 10000]
df.shape

(54460, 4)

# **Decleare Start and End Time**

Before getting BTC price dataset, we need to know time interval

In [4]:
start_date = df.index[0]
end_date = df.iloc[-1]["1_week_later"]+ timedelta(days=1)
start_date,end_date

(Timestamp('2021-02-05 00:00:00'), Timestamp('2023-01-17 00:00:00'))

# **Creating Bitcoin Price Dataset**

Also adds another 2 column which are "change" and "result"
Change = 1 week later price - index price
Result = If price goes up, 1 else 0

In [5]:
btc_data = yf.download("BTC-USD",start=start_date, end=end_date)["Adj Close"]
btc_data = pd.DataFrame(btc_data)
btc_data["1_week_later_price"] = btc_data["Adj Close"].shift(-7)
btc_data["change"] = btc_data["1_week_later_price"] - btc_data["Adj Close"]
btc_data["result"] = btc_data["change"].apply(lambda x: 1 if x > 0 else 0)
btc_data

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close,1_week_later_price,change,result
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-05,38144.308594,47504.851562,9360.542969,1
2021-02-06,39266.011719,47105.515625,7839.503906,1
2021-02-07,38903.441406,48717.289062,9813.847656,1
2021-02-08,46196.464844,47945.058594,1748.593750,1
2021-02-09,46481.105469,49199.871094,2718.765625,1
...,...,...,...,...
2023-01-12,18869.587891,,,0
2023-01-13,19909.574219,,,0
2023-01-14,20976.298828,,,0
2023-01-15,20880.798828,,,0


# **Merge These 2 Datasets**

In [6]:
merged_data = pd.merge(df, btc_data, how='inner', left_index=True, right_index=True)
merged_data

Unnamed: 0,user_name,user_followers,text,1_week_later,Adj Close,1_week_later_price,change,result
2021-02-05,Sqwii,13070.0,#Bitcoin close to major breakout over @elonmus...,2021-02-12,38144.308594,47504.851562,9360.542969,1
2021-02-06,Crypto Insider X,24410.0,#Bitcoin new ATH before the #KansasCityChiefs ...,2021-02-13,39266.011719,47105.515625,7839.503906,1
2021-02-06,TOP AIM STOCKS,16544.0,#BTC Never forget. The only thing #Bitcoin ha...,2021-02-13,39266.011719,47105.515625,7839.503906,1
2021-02-06,TOP AIM STOCKS,16544.0,"#Bitcoin is back above $40,000, recouping near...",2021-02-13,39266.011719,47105.515625,7839.503906,1
2021-02-06,Rekt Capital,53610.0,Buying on the retrace has been a profitable in...,2021-02-13,39266.011719,47105.515625,7839.503906,1
...,...,...,...,...,...,...,...,...
2023-01-09,Trader Fred,14701.0,Strongest Movers in #USDT  1 #Zilliqa $zil 🚀  ...,2023-01-16,17196.554688,21169.632812,3973.078125,1
2023-01-09,Baron Chymaker.𝛑,12705.0,For all of you that say 100B supply of #Pi is ...,2023-01-16,17196.554688,21169.632812,3973.078125,1
2023-01-09,AJ Crypto,18573.0,What you bullish on? . #altcoins #altcoin #m...,2023-01-16,17196.554688,21169.632812,3973.078125,1
2023-01-09,Coingraph | News ,251511.0,JUST IN: Metropolitan Commercial Bank has anno...,2023-01-16,17196.554688,21169.632812,3973.078125,1


# **Cleaning Merged Data**

In [7]:
merged_data = merged_data[["text","result"]]
merged_data

Unnamed: 0,text,result
2021-02-05,#Bitcoin close to major breakout over @elonmus...,1
2021-02-06,#Bitcoin new ATH before the #KansasCityChiefs ...,1
2021-02-06,#BTC Never forget. The only thing #Bitcoin ha...,1
2021-02-06,"#Bitcoin is back above $40,000, recouping near...",1
2021-02-06,Buying on the retrace has been a profitable in...,1
...,...,...
2023-01-09,Strongest Movers in #USDT  1 #Zilliqa $zil 🚀  ...,1
2023-01-09,For all of you that say 100B supply of #Pi is ...,1
2023-01-09,What you bullish on? . #altcoins #altcoin #m...,1
2023-01-09,JUST IN: Metropolitan Commercial Bank has anno...,1


# **Check the Distribution**

Distribution should look almost like SMOTE I think

In [8]:
merged_data.groupby('result').size()

result
0    22927
1    31533
dtype: int64

In [9]:
g_df = merged_data.groupby(merged_data.index)['text'].apply(' '.join).reset_index()
g_df['result'] = merged_data.groupby(merged_data.index)['result'].mean().astype(int).values

g_df.set_index('index', inplace=True)
g_df

Unnamed: 0_level_0,text,result
index,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-02-05,#Bitcoin close to major breakout over @elonmus...,1
2021-02-06,#Bitcoin new ATH before the #KansasCityChiefs ...,1
2021-02-07,$58m of Bitcoin was just moved from Coinbase t...,1
2021-02-08,Not only did Elon Musk end the #Bitcoin correc...,1
2021-02-09,Your views on #Bitcoin &amp; the current #BTC-...,1
...,...,...
2022-12-27,I had a very similar discussion with #BTC maxi...,0
2023-01-06,NOT selling #Dash $DASH\n🚀\n\nLT bags:\n#BTC #...,1
2023-01-07,This is the worst time to sell your #Bitcoin. ...,1
2023-01-08,Strongest Movers in #USDT\n 1 #GALA $gala 🚀\n ...,1


# Apply BERT Transform

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score

X = g_df["text"]
y = g_df['result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

def get_bert_embedding(text, max_length=512):
    tokens = tokenizer.encode(text, add_special_tokens=True)[:max_length]
    tokens += [0] * (max_length - len(tokens))
    tokens_tensor = torch.tensor(tokens).unsqueeze(0)
    with torch.no_grad():
        outputs = model(tokens_tensor)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
train_embeddings = [get_bert_embedding(text, max_length=512) for text in tqdm(X_train)]
test_embeddings = [get_bert_embedding(text, max_length=512) for text in tqdm(X_test)]

  0%|          | 0/175 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (34008 > 512). Running this sequence through the model will result in indexing errors
  7%|▋         | 12/175 [00:06<01:16,  2.12it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 175/175 [01:35<00:00,  1.82it/s]
100%|██████████| 44/44 [00:20<00:00,  2.10it/s]


# MLP

In [13]:
results_df = pd.DataFrame(columns=['HiddenLayer1', 'HiddenLayer2', 'MaxIter', 'Accuracy'])

for i in range(1,8):
    for j in range(1,8):
        for k in range(1000,50000,1000):
            mlp = MLPClassifier(hidden_layer_sizes=(i,j), max_iter=k, tol= 0.001)
            mlp.fit(train_embeddings, y_train)
            predictions = mlp.predict(test_embeddings)
            accuracy = accuracy_score(y_test, predictions)

            results_df.loc[len(results_df)] = [i, j, k, accuracy]
results_df

Unnamed: 0,HiddenLayer1,HiddenLayer2,MaxIter,Accuracy
0,1.0,1.0,1000.0,0.568182
1,1.0,1.0,2000.0,0.431818
2,1.0,1.0,3000.0,0.568182
3,1.0,1.0,4000.0,0.568182
4,1.0,1.0,5000.0,0.431818
...,...,...,...,...
2396,7.0,7.0,45000.0,0.590909
2397,7.0,7.0,46000.0,0.613636
2398,7.0,7.0,47000.0,0.477273
2399,7.0,7.0,48000.0,0.522727


In [14]:
sorted_results_df = results_df.sort_values(by='Accuracy', ascending=False)
sorted_results_df

Unnamed: 0,HiddenLayer1,HiddenLayer2,MaxIter,Accuracy
1617,5.0,6.0,1000.0,0.727273
2242,7.0,4.0,38000.0,0.681818
2392,7.0,7.0,41000.0,0.681818
1574,5.0,5.0,7000.0,0.681818
2033,6.0,7.0,25000.0,0.681818
...,...,...,...,...
1951,6.0,5.0,41000.0,0.409091
453,2.0,3.0,13000.0,0.409091
1990,6.0,6.0,31000.0,0.409091
1431,5.0,2.0,11000.0,0.409091
