# Data Preparation
---

## Import Libraries

In [None]:
#pip install imblearn

In [None]:
# Import Libraries
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import panel as pn
import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import tensorflow as tf
pn.extension('plotly')
import plotly.express as px
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## Load API Keys from Environment Variables and Pull BTC Data

In [None]:
# Load environment variables from a local .env file into the process environment
load_dotenv()

In [None]:
# Read the Glassnode API key from the environment.
api_key = os.getenv("glassnode_api")

# Fail fast with an actionable message: os.getenv returns None for a missing
# variable, and the price request would otherwise fail far less clearly.
if not api_key:
    raise EnvironmentError(
        "Environment variable 'glassnode_api' is not set; add it to your .env file."
    )

# Display only the type so the secret itself never lands in the cell output.
type(api_key)

In [None]:
# Define the crypto currencies to pull (ticker symbols)
crypto_list = ["BTC"]

In [None]:
# Glassnode endpoint for the USD market price metric
price_url = 'https://api.glassnode.com/v1/metrics/market/price_usd'

In [None]:
# Local import: StringIO lets read_json consume the response body as a file-like
# object instead of a literal JSON string (deprecated in recent pandas).
from io import StringIO

# Price API request: BTC price in USD at a '24h' interval.
btc_price_res = requests.get(
    price_url,
    params={'a': 'BTC', 'i': '24h', 'api_key': api_key},
    timeout=30,  # never hang the kernel on an unresponsive endpoint
)
# Surface HTTP errors (bad key, rate limit, ...) here rather than as a
# confusing parse/column error further down.
btc_price_res.raise_for_status()

# Convert the payload to a DataFrame, parsing the 't' field as dates.
btc_price_df = pd.read_json(StringIO(btc_price_res.text), convert_dates=['t'])
# Positional rename: the payload is expected to carry exactly two fields.
btc_price_df.columns = ['Date', 'BTC Price']
# Index by date (re-assignment instead of inplace=True keeps the cell chain-friendly).
btc_price_df = btc_price_df.set_index('Date')

### BTC Data Table Debugging Cells

In [None]:
# Debugging aid: inspect the BTC price table
btc_price_df

### BTC Daily Changes

In [None]:
# Row-over-row fractional change of the BTC price; rows containing NaN
# (such as the very first one) are dropped.
btc_daily_price_pct_change = (
    btc_price_df
    .pct_change(periods=1)
    .dropna()
)
btc_daily_price_pct_change.columns = ['BTC Daily Price Change %']
# Preview the most recent rows
btc_daily_price_pct_change.tail()

### BTC Data Aggregating & Cleaning

In [None]:
# Frames to combine: raw price and its daily fractional change
btc_frames = [btc_price_df, btc_daily_price_pct_change]

# Place the frames side by side on their shared index; the outer join keeps
# every date that appears in either frame.
btc_data = pd.concat(
    btc_frames,
    axis=1,
    join="outer",
    ignore_index=False,
)
# Express the fractional change as a percentage
btc_data['BTC Daily Price Change %'] = btc_data['BTC Daily Price Change %'].mul(100)
btc_data.tail()

## Get News

In [None]:
# Load the news articles from the CSV next to the notebook.
bitcoin_df = pd.read_csv(Path('DataNews_Bitcoin.csv'))

# Parse the publication timestamps. infer_datetime_format was dropped: it is
# ignored whenever an explicit format is supplied and is deprecated in pandas 2.
# NOTE(review): the format '%Y/%m%d' has no separator between month and day —
# confirm it matches the file (exact=False lets the format match anywhere in
# the string, which may be hiding a mismatch).
published = pd.to_datetime(bitcoin_df['Published Date'], exact=False, format='%Y/%m%d')

# Keep only the calendar date. The vectorised .dt.date replaces the original
# per-row chained assignment (bitcoin_df[col][i] = ...), which is slow and is
# not guaranteed to write through to the frame.
bitcoin_df['Published Date'] = published.dt.date

# Positional rename: assumes the CSV has exactly these three columns in this order.
bitcoin_df.columns = ['Date', 'Title', 'Content']

In [None]:
# Inspect the loaded news articles
bitcoin_df

# Initialize VADER Sentiment Analyzer
___

In [None]:
# Download/Update the VADER Lexicon (required by SentimentIntensityAnalyzer)
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer used for all scoring in this notebook
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Score every article body with VADER and collect one record per article.
cols = ["Date", "text", "compound", "positive", "negative", "neutral"]
btc_sentiments = []
skipped = 0

for date, text in zip(bitcoin_df["Date"], bitcoin_df["Content"]):
    # Articles without string content (e.g. NaN from the CSV) cannot be scored.
    # Skip them explicitly instead of swallowing AttributeError, which could
    # also hide unrelated bugs.
    if not isinstance(text, str):
        skipped += 1
        continue

    sentiment = analyzer.polarity_scores(text)
    btc_sentiments.append({
        "text": text,
        "Date": date,
        "compound": sentiment["compound"],
        "positive": sentiment["pos"],
        "negative": sentiment["neg"],
        "neutral": sentiment["neu"],
    })

# Make the data loss visible rather than silent.
if skipped:
    print(f"Skipped {skipped} article(s) with no text content")

# Build the DataFrame with an explicit column order; passing `columns` also
# keeps the expected schema when no article could be scored (the original
# btc_df[cols] selection raised KeyError on an empty result).
btc_df = pd.DataFrame(btc_sentiments, columns=cols)

btc_df.head()

In [None]:
#btc_sentiments

In [None]:
# Get descriptive stats from the DataFrame
#btc_df.describe()

## Append daily BTC prices to Sentiment DF

In [None]:
# Convert the Date column to datetime and promote it to the index
btc_df = (
    btc_df
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

In [None]:
# Inspect the combined price / daily-change table before merging
btc_data

In [None]:
# Attach the BTC price data for the matching date to every scored article
# (inner join: articles on dates without price data are dropped).
btc_merged = pd.merge(btc_df, btc_data, how='inner', on='Date')

# dropna() returns a new frame; the original call discarded its result, so
# rows with missing values were silently kept. Assign the result back.
btc_merged = btc_merged.dropna()
btc_merged.head(20)

# Crude Visualizations
___

In [None]:
import seaborn as sns

In [None]:
# Collapse multiple same-day observations into a single daily mean.
# numeric_only=True: the frame also holds the free-text 'text' column, which
# cannot be averaged (recent pandas raises instead of silently dropping it).
btc_viz = btc_merged.groupby(pd.Grouper(freq='d')).mean(numeric_only=True).dropna(how='all')
# All article texts of a day joined into one string
btc_viz_text = pd.DataFrame(btc_merged['text'].groupby('Date').apply(lambda texts: ' '.join(texts)))

# Same aggregation at weekly / monthly / yearly resolution
btc_viz_weekly = btc_merged.groupby(pd.Grouper(freq='w')).mean(numeric_only=True).dropna(how='all')
btc_viz_monthly = btc_merged.groupby(pd.Grouper(freq='m')).mean(numeric_only=True).dropna(how='all')
btc_viz_yearly = btc_merged.groupby(pd.Grouper(freq='y')).mean(numeric_only=True).dropna(how='all')

# Correlation input: everything except the individual pos/neg/neu scores
btc_corr = btc_viz.drop(columns=['positive','negative','neutral'])

# NOTE(review): shift(0) is a no-op, so 'lagprice' duplicates 'BTC Price';
# only 'lagprice2' is actually lagged (by one row). Confirm the intended lags.
btc_corr['lagprice'] = btc_corr['BTC Price'].shift(0)
btc_corr['lagprice2'] = btc_corr['BTC Price'].shift(1)

# Annotated correlation heatmap
sns.heatmap(btc_corr.corr(), annot = True)


In [None]:
# One row per date holding all of that date's article texts joined by spaces
btc_viz_text = (
    btc_merged['text']
    .groupby('Date')
    .apply(' '.join)
    .to_frame()
)
# Combine the per-date text with the per-date numeric means (outer join on Date)
btc_viz_grouped = btc_viz_text.merge(btc_viz, how='outer', on='Date')
btc_viz_grouped.head()

In [None]:
# Overlay daily mean compound sentiment and daily price change on a shared x-axis.
fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Date')
ax1.set_ylabel('Sentiment Score', color=color)
ax1.plot(btc_viz['compound'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
# Rotate the existing date labels. The original
# set_xticklabels(ax1.get_xticks(), ...) replaced the formatted dates with the
# raw numeric tick positions.
ax1.tick_params(axis='x', labelrotation=90)

# Second y-axis that shares the x-axis with ax1
ax2 = ax1.twinx()

color = 'tab:blue'
ax2.set_ylabel('Price Change %', color=color)
ax2.plot(btc_viz['BTC Daily Price Change %'], color=color)
ax2.tick_params(axis='y', labelcolor=color)

# Keep the right-hand axis label from being clipped
fig.tight_layout()
plt.show()

# Build a Predictive Model with VADER
___

In [None]:
# Binary target per article: 1 when the daily price change is strictly
# positive, otherwise 0 (zero, negative and missing changes all map to 0).
btc_merged['target'] = btc_merged['BTC Daily Price Change %'].gt(0).astype(int).to_numpy()
# Same label on the per-day grouped frame
btc_viz_grouped['target'] = btc_viz_grouped['BTC Daily Price Change %'].gt(0).astype(int).to_numpy()
btc_merged

In [None]:
# Features are the raw article texts; labels are the binary price-direction target.
X = btc_merged["text"]
y = btc_merged["target"]

# Split data into train & test
from sklearn.model_selection import train_test_split

# A fixed seed makes the splits — and every metric derived from them —
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Carve a validation set out of the training portion
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=1)

In [None]:
# Visualize the data: VADER positive vs. negative score per article, coloured by target (y)
plt.scatter(btc_merged['positive'],btc_merged['negative'], c=y)

In [None]:
# Re-attach each label series to its texts, giving one frame per split
# with the columns 'text' and 'target'
train, test = (
    pd.concat([texts, labels], axis=1)
    for texts, labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Report the size and class balance of the train and test frames
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape

print(f"Train: {train_rows} rows and {train_cols} columns")
print(f"{train['target'].value_counts()}\n")
print(f"Test: {test_rows} rows and {test_cols} columns")
print(test['target'].value_counts())

In [None]:
# Score each training text with VADER and store the four scores as columns
score_names = ['neg', 'neu', 'pos', 'compound']
vader_scores = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in train['text']],
    index=train.index,
)
# Assign raw values so no index alignment is involved
train[score_names] = vader_scores[score_names].to_numpy()
train.head()

In [None]:
# One figure per VADER score with overlaid histograms for the two target classes
class_styles = (
    ("target==1", 'blue', 'Positive'),
    ("target==0", 'gray', 'Negative'),
)
for var in ['pos', 'neg', 'neu', 'compound']:
    plt.figure(figsize=(12,4))
    for condition, colour, legend_label in class_styles:
        sns.distplot(train.query(condition)[var], bins=30, kde=False,
                     color=colour, label=legend_label)
    plt.legend()
    plt.title(f'Histogram of {var} by true sentiment');

# Build a Predictive Model with RNN LSTM

In [None]:
# Import the Tokenizer method from Keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Create a lower-casing Tokenizer and build its vocabulary from the X text data.
# NOTE(review): X is the full corpus (taken before any train/test split), so the
# vocabulary also covers text that later lands in the test set.
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(X)

In [None]:
# Print the first ten entries of the encoded vocabulary
for token, token_id in list(tokenizer.word_index.items())[:10]:
    print(f"word: '{token}', token: {token_id}")

In [None]:
# Transform the text data to numerical sequences (one list of token ids per text)
X_seq = tokenizer.texts_to_sequences(X)

In [None]:
# Contrast a sample numerical sequence with its text version
print("**Text comment**")
# Positional access via .iloc: X is indexed by Date, so X[0] relied on the
# deprecated integer fallback. The stray braces in the original also wrapped
# the text in a one-element set before printing.
print(X.iloc[0])

In [None]:
# Token-id sequence of the first text, for comparison with the raw text above
print("**Numerical sequence representation**")
print(X_seq[0])

In [None]:
# Import the pad_sequences method from Keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the network (the Embedding layer is declared
# with input_length=max_words).
max_words = 100

# Pad or truncate every sequence to exactly max_words tokens. The original call
# omitted maxlen, so sequences were padded to the longest article instead and
# max_words was never applied.
X_pad = pad_sequences(X_seq, maxlen=max_words, padding='post', truncating='post')
X_pad

In [None]:
# Creating training, validation, and testing sets using the encoded data.
# A fixed seed keeps the splits (and the reported metrics) reproducible.
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(X_pad, y, random_state=1)

X_train_rnn, X_val_rnn, y_train_rnn, y_val_rnn = train_test_split(
    X_train_rnn, y_train_rnn, random_state=1
)

In [None]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler

# RandomOverSampler randomly duplicates minority-class samples of the
# training split; the validation and test splits are not touched here
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_rnn, y_train_rnn)


In [None]:
# Number of samples in the RNN test split
len(y_test_rnn)

In [None]:
# Import Keras modules for model creation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Model set-up
# Vocabulary size = number of distinct words seen by the tokenizer, plus one
vocabulary_size = len(tokenizer.word_counts) + 1
# Dimensionality of the learned word embeddings
embedding_size = 64

In [None]:
# Define the LSTM RNN model as an ordered stack of three layers
model = Sequential([
    # Layer 1: token ids -> dense embedding vectors
    Embedding(vocabulary_size, embedding_size, input_length=max_words),
    # Layer 2: recurrent layer over the embedded sequence
    LSTM(units=280),
    # Output layer: single sigmoid unit for the binary target
    Dense(units=1, activation="sigmoid"),
])

In [None]:
# Metrics tracked during training and evaluation: accuracy, the four
# confusion-matrix counts, precision, recall and ROC AUC
eval_metrics = ["accuracy"]
eval_metrics += [
    tf.keras.metrics.TruePositives(name="tp"),
    tf.keras.metrics.TrueNegatives(name="tn"),
    tf.keras.metrics.FalsePositives(name="fp"),
    tf.keras.metrics.FalseNegatives(name="fn"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
    tf.keras.metrics.AUC(name="auc"),
]

# Compile the model for binary classification
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=eval_metrics)

In [None]:
# Debugging aid: show the installed NumPy version
np.__version__

In [None]:
# Show model summary (layers, output shapes and parameter counts)
model.summary()

In [None]:
# Training the model
batch_size = 1000
epochs = 10
# Train on the oversampled training set produced by RandomOverSampler. The
# original call passed the un-resampled X_train_rnn / y_train_rnn, leaving
# X_resampled / y_resampled computed but unused. The validation data stays
# un-resampled so validation metrics reflect the real class distribution.
model.fit(
    X_resampled,
    y_resampled,
    validation_data=(X_val_rnn, y_val_rnn),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

In [None]:
# Predict classes using the testing data.
# Sequential.predict_classes was removed from recent TensorFlow releases; for a
# single sigmoid output it is equivalent to thresholding the predicted
# probability at 0.5.
y_rnn_pred = (model.predict(X_test_rnn, batch_size=1000) > 0.5).astype("int32")


# Model Comparison
___

In [None]:
# Import relevant libraries from sklearn
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc

### Classification Reports

In [None]:
#VADER - polarity model: predict 1 when the positive score exceeds the negative score
target_names=['negative', 'positive']
train['vader_polarity'] = (train['pos'] > train['neg']).astype(int).to_numpy()

print("Classification Report for the VADER Sentiment Model using polarity scores")
polarity_report = classification_report(
    train['target'],
    train['vader_polarity'],
    target_names=target_names,
)
print(polarity_report)

In [None]:
#VADER - compound model: predict 1 when the compound score is strictly positive
train['vader_compound'] = (train['compound'] > 0).astype(int).to_numpy()

print("Classification Report for the VADER Sentiment Model using compound scores")
compound_report = classification_report(
    train['target'],
    train['vader_compound'],
    target_names=target_names,
)
print(compound_report)

In [None]:
# Display classification report for the RNN LSTM Model
print("Classification Report for the RNN LSTM Model")
# classification_report expects (y_true, y_pred). The original call passed the
# arguments in the opposite order, which swaps precision and recall in the
# printed report.
print(classification_report(y_test_rnn, y_rnn_pred))

### Confusion Matrices

In [None]:
# Create function we can call for all models
def plot_cm(y_test, y_pred, target_names=['negative', 'positive'],
            figsize=(5,3)):
    """Draw an annotated confusion-matrix heatmap.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    target_names : tick labels applied to both axes of the matrix.
    figsize : figure size handed to ``plt.subplots``.
    """
    matrix = confusion_matrix(y_test, y_pred)
    _, ax = plt.subplots(figsize=figsize)
    # Integer-formatted counts, no colour bar
    sns.heatmap(matrix, ax=ax, annot=True, fmt='g', cmap='BuGn', cbar=False)
    ax.set(title='Confusion matrix', xlabel='Predicted', ylabel='Actual')
    ax.set_xticklabels(target_names)
    ax.set_yticklabels(target_names, fontdict={'verticalalignment': 'center'})

In [None]:
# Plot confusion matrix for VADER using polarity scores (computed on the `train` frame)
print("Confusion Matrix from the VADER model using polarity scores")
plot_cm(train['target'], train['vader_polarity'])

In [None]:
# Plot confusion matrix for VADER using compound scores (computed on the `train` frame)
print("Confusion Matrix from the VADER model using compound scores")
plot_cm(train['target'], train['vader_compound'])

In [None]:
# Plot confusion matrix for the RNN LSTM model on its test split
print("Confusion Matrix from the RNN LSTM Model")
plot_cm(y_test_rnn, y_rnn_pred)

### Accuracy Scores

In [None]:
# Accuracy of each model, rounded to two decimals
polarity_acc = accuracy_score(train['target'], train['vader_polarity'])
compound_acc = accuracy_score(train['target'], train['vader_compound'])
rnn_acc = accuracy_score(y_test_rnn, y_rnn_pred)

print(f"Vader Accuracy - Polarity: {polarity_acc:.2f}")
print(f"Vader Accuracy - Compound: {compound_acc:.2f}")
print(f"RNN LSTM Accuracy {rnn_acc:.2f}")

### AUC ROC

In [None]:
# Data for ROC Curve - VADER Compound
# roc_curve needs a continuous score to sweep thresholds over. The original
# passed the already-thresholded 0/1 column 'vader_compound', which collapses
# the curve to a single operating point; use the raw compound score instead.
fpr_test_vader, tpr_test_vader, thresholds_test_vader = roc_curve(train['target'], train['compound'])

In [None]:
# Area under the VADER ROC curve, rounded to four decimals
auc_test_vader = round(auc(fpr_test_vader, tpr_test_vader), 4)

In [None]:
# Dataframe to plot ROC Curve for VADER: one row per threshold with its FPR/TPR pair
roc_df_test_vader = pd.DataFrame({"FPR Test": fpr_test_vader, "TPR Test": tpr_test_vader,})

In [None]:
# Plot the VADER (compound) ROC curve; the AUC is shown in the title
roc_df_test_vader.plot(
    x="FPR Test",
    y="TPR Test",
    color="red",
    style="--",
    xlim=([-0.05, 1.05]),
    title=f"Test ROC Curve - VADER(Compound) (AUC={auc_test_vader})",
)

In [None]:
# Data for ROC Curve - VADER Polarity
# roc_curve needs a continuous score to sweep thresholds over. The polarity
# classifier predicts 1 when pos > neg, i.e. when (pos - neg) > 0, so that
# difference is used as the score instead of the thresholded 0/1 column.
fpr_test_vader, tpr_test_vader, thresholds_test_vader = roc_curve(train['target'], train['pos'] - train['neg'])

In [None]:
# Area under the VADER ROC curve, rounded to four decimals
auc_test_vader = round(auc(fpr_test_vader, tpr_test_vader), 4)

In [None]:
# Dataframe to plot ROC Curve for VADER: one row per threshold with its FPR/TPR pair
roc_df_test_vader = pd.DataFrame({"FPR Test": fpr_test_vader, "TPR Test": tpr_test_vader,})

In [None]:
# Plot the VADER (polarity) ROC curve; the AUC is shown in the title
roc_df_test_vader.plot(
    x="FPR Test",
    y="TPR Test",
    color="red",
    style="--",
    xlim=([-0.05, 1.05]),
    title=f"Test ROC Curve - VADER(Polarity) (AUC={auc_test_vader})",
)

In [None]:
# Making predictions to feed the roc_curve module: these are the raw sigmoid
# outputs of the model (not thresholded classes)
test_predictions_rnn = model.predict(X_test_rnn, batch_size=1000)

In [None]:
# Data for ROC Curve - RNN LSTM Model (true test labels vs. predicted scores)
fpr_test_rnn, tpr_test_rnn, thresholds_test_rnn = roc_curve(y_test_rnn, test_predictions_rnn)

In [None]:
# Area under the RNN LSTM ROC curve, rounded to four decimals
auc_test_rnn = round(auc(fpr_test_rnn, tpr_test_rnn), 4)

In [None]:
# Dataframe to plot ROC Curve for the RNN LSTM model: one row per threshold with its FPR/TPR pair
roc_df_test_rnn = pd.DataFrame({"FPR Test": fpr_test_rnn, "TPR Test": tpr_test_rnn,})

In [None]:
# Plot the RNN LSTM ROC curve on the test split; the AUC is shown in the title
roc_df_test_rnn.plot(
    x="FPR Test",
    y="TPR Test",
    color="blue",
    style="--",
    xlim=([-0.05, 1.05]),
    title=f"Test ROC Curve (AUC={auc_test_rnn})",
)