In [None]:
# Bitcoin Project - PQT
import os
import csv
import requests
import pandas as pd
import tweepy  
import praw
import kaggle as kg
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from pytrends.request import TrendReq

In [None]:
# Set up environment variables
# Each credential is read from the process environment when it is set there.
# Otherwise fill in the fallback string (the second argument). Avoid
# committing real secrets together with the notebook.
# Kaggle
KAGGLE_USERNAME = os.environ.get("KAGGLE_USERNAME", "")
KAGGLE_KEY = os.environ.get("KAGGLE_KEY", "")
# Reddit
YOUR_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "")
YOUR_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "")
YOUR_USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "")
# Twitter
TWITTER_BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "")


# Kaggle Credentials
# Export the variable *values*; a quoted '{KAGGLE_USERNAME}' would hand the
# Kaggle client that literal text as the user name. Blank values are not
# exported so the client can still use its own configuration.
# NOTE(review): the kaggle package may already try to authenticate when it is
# imported -- if the import cell fails, export these variables before it runs.
if KAGGLE_USERNAME:
    os.environ['KAGGLE_USERNAME'] = KAGGLE_USERNAME
if KAGGLE_KEY:
    os.environ['KAGGLE_KEY'] = KAGGLE_KEY
kg.api.authenticate()

# Download VADER Sentiment Analyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Reddit Credentials
# Pass the variables themselves, not strings that merely spell their names.
reddit = praw.Reddit(
    client_id=YOUR_CLIENT_ID,
    client_secret=YOUR_CLIENT_SECRET,
    user_agent=YOUR_USER_AGENT
)

# Twitter Credentials
client = tweepy.Client(bearer_token=TWITTER_BEARER_TOKEN)

In [None]:
## Dataset 1: Bitcoin Historical Data
# Gets min/min bitcoin data since 2012.

data_1 = "mczielinski/bitcoin-historical-data"
# A single name is used for both the download target and the read path so the
# two cannot drift apart. Despite the suffix this is a directory: the archive
# is extracted into it because unzip=True.
dir_1 = "on.zip"
kg.api.dataset_download_files(dataset=data_1, path=dir_1, unzip=True)
df_1 = pd.read_csv(os.path.join(dir_1, "btcusd_1-min_data.csv"))

## Alternatively you can download just the csv file:
# file_name = "btcusd_1-min_data.csv"
# kg.api.dataset_download_file(dataset=data_1, file_name=file_name, path="./")
# df_1 = pd.read_csv(file_name)
# NOTE(review): the single-file download may arrive compressed depending on
# the kaggle client version -- confirm before switching to this variant.


In [None]:
## Dataset 2: Bitcoin/Eth Prices
# Gets day/day bitcoin data from start-Present.
# *** Note there are some other useful datasets in this download. Explore!
data_2 = "kapturovalexander/bitcoin-and-ethereum-prices-from-start-to-2023"
download_dir_2 = "on_1.zip"  # extraction directory passed as `path`
kg.api.dataset_download_files(
    dataset=data_2,
    path=download_dir_2,
    unzip=True,
)
# The download contains several csv files (the original note counts 10).
# df_2 is loaded from the ETH-USD file named below; to use a different one,
# replace the file name after "on_1.zip/".
df_2 = pd.read_csv("on_1.zip/ETH-USD (01-05.2024).csv")


In [None]:
## Dataset 3: Bitcoin F&G index
# Uses live data to get the fear and greed index of bitcoin.
## get the data (json)
def get_fng_data(timeout=30):
    """Fetch the Fear & Greed index records from the alternative.me API.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The value stored under the "data" key of the JSON response, or None
        when the request fails or the status code is not 200.
    """
    url = "https://api.alternative.me/fng/"
    # NOTE(review): limit=0 presumably requests the full history -- confirm
    # against the API documentation.
    params = {"limit": 0, "format": "json", "date_format": "world"}
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the "report and return None" failure contract for transport
        # errors (timeouts, DNS failures, refused connections) as well.
        print(f"Request failed: {exc}")
        return None
    if response.status_code != 200:
        print(response.status_code)
        return None
    return response.json()["data"]
## json->csv
def convert_csv(data, filename):
    """Write a list of dict records to a CSV file.

    The header is the ordered union of the keys of all records, so a record
    that carries a key the first one lacks no longer makes the writer raise
    ValueError. Keys missing from a record are written as empty cells.

    Args:
        data: Sequence of dicts, one per row. Falsy input (None or an empty
            sequence) prints "No data" and writes nothing.
        filename: Path of the CSV file to create or overwrite.
    """
    if not data:
        print("No data")
        return
    # dict.fromkeys keeps first-seen order and drops duplicates.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

## Call functions
# df_3 holds whatever get_fng_data() returned: the decoded "data" payload, or
# None on a failed request (convert_csv then prints "No data").
df_3 = get_fng_data()
convert_csv(data=df_3, filename="fear_greed_index.csv")

In [None]:
## Dataset 4: Twitter Scraping
# Scrapes twitter and uses sentiment analysis to generate a score.
# ** Note you can customize query, fields, and even do some filtering on the
# author based on what you think is best for the model.
def fetch_tweets(query, max_tweets):
    """Search recent tweets, score each one, and save the result as CSV.

    Args:
        query: Search query forwarded to ``client.search_recent_tweets``.
        max_tweets: Upper bound on the number of tweets pulled from the
            paginator.

    Writes "bitcoin_sentiment.csv" to the working directory with one row per
    tweet. The "Username" column holds the tweet's ``author_id``. Nothing is
    returned.
    """
    def _as_row(status):
        # Polarity scores come from the module-level `sia` analyzer.
        scores = sia.polarity_scores(status.text)
        return [
            status.created_at,
            status.author_id,
            status.text,
            scores['compound'],
            scores['pos'],
            scores['neu'],
            scores['neg'],
        ]

    pages = tweepy.Paginator(
        client.search_recent_tweets,
        query=query,
        tweet_fields=['created_at', 'text', 'author_id'],
        max_results=100,
    )
    rows = [_as_row(status) for status in pages.flatten(limit=max_tweets)]

    header = ['Date', 'Username', 'Tweet', 'Compound', 'Positive', 'Neutral', 'Negative']
    frame = pd.DataFrame(rows, columns=header)
    frame.to_csv("bitcoin_sentiment.csv", index=False)

# search_recent_tweets is a Twitter API v2 endpoint: retweets are excluded
# with `-is:retweet` (`-filter:retweets` is the older v1.1 search syntax).
# The OR group is parenthesised so the exclusion applies to both keywords
# rather than to "BTC" alone.
fetch_tweets("(Bitcoin OR BTC) -is:retweet", 1000)

In [None]:
## Dataset 5: Reddit Scraping
# Scrapes a subreddit's "hot" listing and saves the posts to a csv.
def scrape_reddit(subreddit_name, limit):
    """Save title, score, creation time and URL of a subreddit's hot posts.

    Args:
        subreddit_name: Name handed to ``reddit.subreddit``.
        limit: Maximum number of posts requested from the hot listing.

    Writes "reddit_bitcoin.csv" to the working directory. Nothing is returned.
    """
    hot_posts = reddit.subreddit(subreddit_name).hot(limit=limit)
    rows = [
        [submission.title, submission.score, submission.created_utc, submission.url]
        for submission in hot_posts
    ]

    frame = pd.DataFrame(rows, columns=["Title", "Score", "Timestamp", "URL"])
    frame.to_csv("reddit_bitcoin.csv", index=False)

# Request up to 1000 posts from the "Bitcoin" subreddit's hot listing.
scrape_reddit(subreddit_name="Bitcoin", limit=1000)

In [None]:
## Dataset 6: Google Search Trends
# Maps google search trends with keyword 'Bitcoin'.
def fetch_google(keyword, timeframe):
    """Download interest-over-time for one keyword and save it as CSV.

    Args:
        keyword: Search term; sent as a one-element keyword list.
        timeframe: Timeframe string forwarded unchanged to
            ``build_payload``.

    Writes "google_trends_bitcoin.csv" (index included) to the working
    directory, without the "isPartial" column when the response has one.
    Nothing is returned.
    """
    trend_client = TrendReq(hl="en-US", tz=360)
    trend_client.build_payload(
        [keyword],
        cat=0,
        timeframe=timeframe,
        geo="",
        gprop="",
    )

    # errors="ignore" makes the drop a no-op when the column is absent.
    interest = trend_client.interest_over_time().drop(
        columns=["isPartial"], errors="ignore"
    )
    interest.to_csv("google_trends_bitcoin.csv")

# pytrends documents the relative 'today #-m' timeframe only for 1, 3 and 12
# months, so the 30-month window is given as an explicit
# 'YYYY-MM-DD YYYY-MM-DD' range instead of "today 30-m".
# NOTE(review): assumes "30-m" meant the last 30 months -- if 30 days was
# intended, use "today 1-m".
_trend_end = pd.Timestamp.now(tz="UTC").normalize()
_trend_start = _trend_end - pd.DateOffset(months=30)
fetch_google(
    "Bitcoin",
    f"{_trend_start.strftime('%Y-%m-%d')} {_trend_end.strftime('%Y-%m-%d')}",
)
