In [None]:
!pip install -q tweepy==4.10 yfinance scikit-learn pandas matplotlib seaborn

In [None]:
import os

# Twitter API v2 bearer token. Prefer the TWITTER_BEARER_TOKEN environment variable so
# the secret never has to be saved inside the notebook; the placeholder below is only a
# fallback and must be replaced if the environment variable is not set.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN") or "INSERT_YOUR_BEARER_TOKEN_HERE"  # 🔐 Replace with yours!

In [None]:

import re
from datetime import datetime, timedelta, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy
import yfinance as yf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [None]:

def clean_text(text):
    """Normalise raw text to lowercase, letters-only, single-spaced form.

    The input is converted with ``str()`` and lowercased, then three regex
    substitutions run in order: http/https URLs are removed, every character
    that is neither an ASCII letter nor whitespace is removed, and runs of
    whitespace collapse to one space. @mentions are not dropped as tokens;
    only their non-letter characters (such as ``@``) are stripped.

    Returns:
        The cleaned string, or the placeholder ``"empty"`` when nothing is
        left after cleaning.
    """
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # (pattern, replacement) pairs; the order matters because URLs must be
    # removed while their punctuation is still present.
    substitutions = (
        (url_pattern, ''),
        (r'[^a-zA-Z\s]', ''),
        (r'\s+', ' '),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    if not cleaned:
        return "empty"
    return cleaned

In [None]:

print("🧠 Training sentiment model...")

# Download a small but clean dataset (Fake vs. Real News - classifiable text)
!wget -q https://raw.githubusercontent.com/clairett/the-fake-news-challenge/master/data/train.csv -O train.csv

# Leggi 2000 notizie "fake" (negative) e 2000 "real" (positive)
df_fake = pd.read_csv('train.csv', usecols=[3], names=['text'], skiprows=1, nrows=2000)
df_real = pd.read_csv('train.csv', usecols=[2], names=['text'], skiprows=1, nrows=2000)

df_fake['label'] = 0  # fake → negative
df_real['label'] = 1  # real → positive

# Combine and mix
df_full = pd.concat([df_fake, df_real]).sample(frac=1).reset_index(drop=True)

# Clean text
df_full['clean_text'] = df_full['text'].apply(clean_text)

# Remove empty line
df_full = df_full[df_full['clean_text'] != "empty"]
df_full = df_full[df_full['clean_text'].str.strip() != ""]

print(f"✅ Training data ready: {len(df_full)} samples")

vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2), min_df=2)
X = vectorizer.fit_transform(df_full['clean_text'])
y = df_full['label']

model = LogisticRegression(max_iter=1000)
model.fit(X, y)

print("✅ Sentiment model trained!")

In [None]:
# Collect Tweets About Bitcoin (Last 7 Days)

client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)

# Naive local timestamp kept under its original name because the price-download cell
# reuses it as the start of the price window.
seven_days_ago = datetime.now() - timedelta(days=7)

# The recent-search endpoint rejects a start_time older than 7 days, so the request uses
# a timezone-aware UTC timestamp with a small safety margin inside that window.
search_start = datetime.now(timezone.utc) - (timedelta(days=7) - timedelta(minutes=5))

# Parentheses are required: a space (AND) binds tighter than OR, so without them
# `lang:en` would apply to `$BTC` only.
query = "(bitcoin OR #bitcoin OR BTC OR $BTC) lang:en"

# search_recent_tweets returns a single page and has no `max_pages` argument;
# Paginator follows the next_token for us. flatten() yields the individual Tweet
# objects and treats pages with no data as empty.
tweets = list(
    tweepy.Paginator(
        client.search_recent_tweets,
        query=query,
        start_time=search_start,
        max_results=100,
        tweet_fields=['created_at'],
        limit=3,  # up to 3 pages → up to 300 tweets
    ).flatten()
)

# Saving data
data = [
    {'date': tweet.created_at, 'text': tweet.text}
    for tweet in tweets
    if tweet.text
]

df_tweets = pd.DataFrame(data)
if len(df_tweets) == 0:
    raise ValueError("❌ No tweets collected. Check your query or Bearer Token.")

df_tweets['clean_text'] = df_tweets['text'].apply(clean_text)
print(f"✅ Collected {len(df_tweets)} tweets")

In [None]:
#  Score each tweet with the trained classifier, then average the labels per day
X_new = vectorizer.transform(df_tweets['clean_text'])
sentiment, confidence = model.predict(X_new), model.predict_proba(X_new).max(axis=1)

# Per-tweet columns: predicted class (0=negative, 1=positive), the probability of
# that predicted class, and the calendar date of the tweet.
tweet_columns = {
    'sentiment': sentiment,
    'confidence': confidence,
    'date_only': pd.to_datetime(df_tweets['date']).dt.date,
}
for column_name, column_values in tweet_columns.items():
    df_tweets[column_name] = column_values

# Daily sentiment = mean of the 0/1 labels, i.e. the share of tweets predicted positive
daily_sentiment = (
    df_tweets
    .groupby('date_only', as_index=False)['sentiment']
    .mean()
)
daily_sentiment['date_only'] = pd.to_datetime(daily_sentiment['date_only'])

print("📊 Daily sentiment:")
print(daily_sentiment)

In [None]:
#  Bitcoin price download
print("💰 Downloading Bitcoin price...")

btc_raw = yf.download("BTC-USD", start=seven_days_ago, end=datetime.now())

# Recent yfinance releases return two-level (field, ticker) columns even for a single
# ticker; keep only the field level so 'Close' can be selected and renamed as a plain
# column. Older releases already return flat columns and skip this step.
if isinstance(btc_raw.columns, pd.MultiIndex):
    btc_raw.columns = btc_raw.columns.get_level_values(0)

btc = btc_raw[['Close']].reset_index()

# Build the join key the same way the daily sentiment key is built (calendar date,
# then back to datetime64) so both sides of the later merge share one dtype.
btc['date_only'] = pd.to_datetime(pd.to_datetime(btc['Date']).dt.date)
btc = btc.rename(columns={'Close': 'price'})

print("📊 Bitcoin price data:")
print(btc[['date_only', 'price']])

In [None]:
# Merge data and visualize

# Coerce both join keys to datetime64 first: pandas raises a ValueError when asked to
# merge a datetime64 key against an object (datetime.date) key.
sentiment_by_day = daily_sentiment.assign(
    date_only=pd.to_datetime(daily_sentiment['date_only'])
)
price_by_day = btc[['date_only', 'price']].assign(
    date_only=pd.to_datetime(btc['date_only'])
)

merged = (
    pd.merge(sentiment_by_day, price_by_day, on='date_only', how='inner')
    .rename(columns={'sentiment': 'sentiment_score'})
    .sort_values('date_only')
)

if len(merged) == 0:
    print("❌ No overlapping dates between tweets and price data.")
else:
    print("📈 Final data for plotting:")
    print(merged)

    sns.set_style("whitegrid")

    # Explicit axes handles: after twinx() the pyplot "current axes" is the twin, so
    # pyplot-level calls (title, labels, legend lookup) would all hit the wrong axis.
    fig, ax_price = plt.subplots(figsize=(12, 6))
    ax_sentiment = ax_price.twinx()
    ax_sentiment.grid(False)  # avoid a second grid drawn over the price grid

    ax_price.plot(merged['date_only'], merged['price'], color='blue', label='Bitcoin Price (USD)')
    ax_sentiment.plot(merged['date_only'], merged['sentiment_score'], color='green', linestyle='--', label='Sentiment Score')

    ax_price.set_title('Bitcoin: Twitter Sentiment vs Price (Last 7 Days)')
    ax_price.set_xlabel('Date')
    ax_price.set_ylabel('Bitcoin Price (USD)')
    ax_sentiment.set_ylabel('Sentiment Score')
    ax_price.tick_params(axis='x', rotation=45)

    # One combined legend with one entry per axis (price line + sentiment line)
    price_handles, price_labels = ax_price.get_legend_handles_labels()
    sentiment_handles, sentiment_labels = ax_sentiment.get_legend_handles_labels()
    ax_sentiment.legend(
        price_handles + sentiment_handles,
        price_labels + sentiment_labels,
        loc='upper left',
    )

    fig.tight_layout()
    plt.show()