In [52]:
import praw
import pandas as pd
import re
from datetime import datetime

# Set up Reddit API
reddit = praw.Reddit(
    client_id='yZgp5fHdkhZQwGSCQ6Of4Q',
    client_secret='yZgmiMH34SQlf2efwsf1zIeqWXEvoQ',
    user_agent='Bitcoin Sentiment Analysis'
)

# List of subreddits
subreddits = ['Bitcoin', 'CryptoCurrency', 'BitcoinMarkets', 'btc']
posts = []

# Clean text utility
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()

# Loop through each subreddit
for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    
    # Fetch 50 posts per subreddit with keyword "Bitcoin"
    for submission in subreddit.search('Bitcoin', limit=50):
        try:
            submission.comments.replace_more(limit=0)
            # Extract up to 20 top-level comments
            top_comments = [comment.body for comment in submission.comments.list()[:20]]
            comment_text = " ".join([clean_text(comment) for comment in top_comments])
            
            title_clean = clean_text(submission.title)
            selftext_clean = clean_text(submission.selftext)
            combined_text = f"{title_clean} {selftext_clean} {comment_text}"
            
            posts.append([
                title_clean,
                selftext_clean,
                comment_text,
                combined_text,
                datetime.utcfromtimestamp(submission.created_utc),
                subreddit_name
            ])
        except Exception as e:
            print(f"⚠️ Skipping a post in r/{subreddit_name} due to error: {e}")
            continue

# Convert to DataFrame
df_reddit = pd.DataFrame(posts, columns=['title', 'selftext', 'comments', 'content', 'created_utc', 'subreddit'])
print(f"✅ Collected and cleaned {len(df_reddit)} posts (with comments) from {len(subreddits)} subreddits.")



✅ Collected and cleaned 200 posts (with comments) from 4 subreddits.


In [53]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch

tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

def get_finbert_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    scores = softmax(logits.numpy()[0])
    sentiment = scores[2] - scores[0]  # Positive - Negative
    return sentiment

df_reddit['content'] = df_reddit['title'] + ' ' + df_reddit['selftext'] + ' ' + df_reddit['comments']

df_reddit['sentiment'] = df_reddit['content'].apply(get_finbert_sentiment)


In [54]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

# Define time range
end_date = datetime.now()
start_date = end_date - timedelta(days=10)

# Download BTC data
btc_data = yf.download('BTC-USD', start=start_date, end=end_date, interval='1h')

# Fix multilevel columns
if isinstance(btc_data.columns, pd.MultiIndex):
    btc_data.columns = btc_data.columns.get_level_values(0)  # Keep just the first level

# Reset index
btc_data.reset_index(inplace=True)

# Rename 'index' or confirm 'Datetime' exists
btc_data.rename(columns={'index': 'Datetime'}, inplace=True)

# Confirm column structure
print("btc_data.columns:", btc_data.columns)


[*********************100%***********************]  1 of 1 completed

btc_data.columns: Index(['Datetime', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')





In [55]:
# Remove timezone from Reddit timestamps
df_reddit['created_utc'] = pd.to_datetime(df_reddit['created_utc']).dt.tz_localize(None)
btc_data['Datetime'] = pd.to_datetime(btc_data['Datetime']).dt.tz_localize(None)

# Create hourly timestamps
df_reddit['hour'] = df_reddit['created_utc'].dt.floor('H')
btc_data['hour'] = btc_data['Datetime'].dt.floor('H')

# Group Reddit sentiment by hour
sentiment_hourly = df_reddit.groupby('hour', as_index=False)['sentiment'].mean()

# ✅ Ensure btc_data has no MultiIndex
btc_data.columns = [col if isinstance(col, str) else col[0] for col in btc_data.columns]

# ✅ Merge on the hour column
data = pd.merge(btc_data, sentiment_hourly, on='hour', how='left')

# Fill missing sentiment with neutral
data['sentiment'].fillna(0, inplace=True)

# Select required columns
data = data[['Datetime', 'Close', 'sentiment']]

# Normalize
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data[['Close', 'sentiment']] = scaler.fit_transform(data[['Close', 'sentiment']])


  df_reddit['hour'] = df_reddit['created_utc'].dt.floor('H')
  btc_data['hour'] = btc_data['Datetime'].dt.floor('H')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sentiment'].fillna(0, inplace=True)


In [56]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Dropout

def create_sequences(data, time_steps=60):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:i + time_steps])
        y.append(data[i + time_steps, 0])  # Close price
    return np.array(X), np.array(y)

dataset = data[['Close', 'sentiment']].values
time_steps = 60
X, y = create_sequences(dataset, time_steps)

split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=80, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x1b300638dc0>

In [57]:
# Get forecast horizon from the user
forecast_hours = int(input("Enter how many hours ahead you'd like to forecast (e.g., 24, 48, 168): "))

# Forecast loop
last_sequence = dataset[-time_steps:]
forecast_input = last_sequence.copy()
forecast_prices = []

for _ in range(forecast_hours):
    input_seq = np.expand_dims(forecast_input[-time_steps:], axis=0)
    pred = model.predict(input_seq, verbose=0)[0][0]
    forecast_prices.append(pred)
    forecast_input = np.vstack([forecast_input, [pred, forecast_input[-1][1]]])  # Use last sentiment

# Decode forecasted prices
decoded_prices = [scaler.inverse_transform([[p, 0]])[0][0] for p in forecast_prices]
last_actual_price = scaler.inverse_transform([[dataset[-1][0], 0]])[0][0]

# Calculate percentage fluctuations hour by hour
print("\n📈 Hourly Forecasted BTC Price Fluctuations:\n")
prev_price = last_actual_price
for i, price in enumerate(decoded_prices):
    change = ((price - prev_price) / prev_price) * 100
    direction = "↑" if change > 0 else "↓" if change < 0 else "→"
    print(f"Hour {i+1:>2}: Predicted Price = ${price:,.2f} | Change = {change:+.2f}% {direction}")
    prev_price = price

# Final summary
final_predicted_price = decoded_prices[-1]
percentage_change = ((final_predicted_price - last_actual_price) / last_actual_price) * 100

print("\n🔮 Final Forecast Summary:")
if percentage_change > 0.1:
    print(f"✅ Yes, the price is increasing by {percentage_change:.2f}% in the next {forecast_hours} hours.")
elif percentage_change < -0.1:
    print(f"❌ No, the price is decreasing by {abs(percentage_change):.2f}% in the next {forecast_hours} hours.")
else:
    print(f"⚖️ The predicted change is negligible ({percentage_change:.2f}%) in the next {forecast_hours} hours.")



📈 Hourly Forecasted BTC Price Fluctuations:

Hour  1: Predicted Price = $103,248.11 | Change = +0.09% ↑
Hour  2: Predicted Price = $103,246.84 | Change = -0.00% ↓
Hour  3: Predicted Price = $103,254.48 | Change = +0.01% ↑
Hour  4: Predicted Price = $103,268.70 | Change = +0.01% ↑
Hour  5: Predicted Price = $103,287.53 | Change = +0.02% ↑
Hour  6: Predicted Price = $103,309.36 | Change = +0.02% ↑
Hour  7: Predicted Price = $103,332.95 | Change = +0.02% ↑
Hour  8: Predicted Price = $103,357.27 | Change = +0.02% ↑
Hour  9: Predicted Price = $103,381.58 | Change = +0.02% ↑
Hour 10: Predicted Price = $103,405.34 | Change = +0.02% ↑
Hour 11: Predicted Price = $103,428.12 | Change = +0.02% ↑
Hour 12: Predicted Price = $103,449.58 | Change = +0.02% ↑
Hour 13: Predicted Price = $103,469.52 | Change = +0.02% ↑
Hour 14: Predicted Price = $103,487.83 | Change = +0.02% ↑
Hour 15: Predicted Price = $103,504.40 | Change = +0.02% ↑
Hour 16: Predicted Price = $103,519.15 | Change = +0.01% ↑
Hour 17: P