<a href="https://colab.research.google.com/github/francescodisalvo05/66DaysOfData/blob/main/notebooks-scripts/day_36.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# !pip install flair
# !pip install vaderSentiment
# !pip install yfinance

In [35]:
import flair
import json

import pandas as pd
import re

from datetime import datetime, timedelta
import requests
import pandas as pd

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

## Get Tweets

In [26]:
# https://towardsdatascience.com/sentiment-analysis-for-stock-price-prediction-in-python-bed40c65d178

# read bearer token for authentication
f = open("../dataset/barrer-token.txt", "r")
BEARER_TOKEN = f.read()

# setup the API request
endpoint = 'https://api.twitter.com/2/tweets/search/recent'
headers = {'authorization': f'Bearer {BEARER_TOKEN}'}
params = {
    #'query': '(tesla OR tsla OR elon musk) (lang:en)',
    'query': '(BTC OR btc OR bitcoin OR Bitcoin) (lang:en)',
    'max_results': '100',
    'tweet.fields': 'created_at,lang'
}

dtformat = '%Y-%m-%dT%H:%M:%SZ'  # the date format string required by twitter

def get_data(tweet):
    data = {
        'created_at': tweet['created_at'],
        'text': tweet['text']
    }
    return data

# we use this function to subtract 60 mins from our datetime string
def time_travel(now, mins):
    now = datetime.strptime(now, dtformat)
    back_in_time = now - timedelta(minutes=mins)
    return back_in_time.strftime(dtformat)
    
now = datetime.now()  # get the current datetime, this is our starting point
last_week = now - timedelta(days=6)  # datetime one week ago = the finish line
now = now.strftime(dtformat)  # convert now datetime to format for API

df = pd.DataFrame()  # initialize dataframe to store tweets

while True:
    
    if datetime.strptime(now, dtformat) < last_week:
        # if we have reached 7 days ago, break the loop
        break
        
    pre60 = time_travel(now, 230) 
    now = time_travel(now, 180)
    
    params['start_time'] = pre60
    params['end_time'] = now
    
    response = requests.get(endpoint,
                            params=params,
                            headers=headers) 
    now = pre60  
    
    for tweet in response.json()['data']:

        row = get_data(tweet)  
        df = df.append(row, ignore_index=True)

df.to_csv('BTC.csv')

In [27]:
df = pd.read_csv('BTC.csv')

## Clean text

In [28]:
def clean_text(text):
    """
    :param text: initial dirty tweet
    :return: cleaned tweet
    
    remove hashtag, remove tags, remove links,
    remove "RT" (retweet), substitute the "_" with a blank space
    """
    t = re.sub(r"#", "", text)
    t = re.sub(r"@[A-z]{1,}", "", t)
    t = re.sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)", "", t)
    t = re.sub(r"RT", "", t)
    t = re.sub(r"_", " ", t)
    
    return t

df_cleaned = df.copy()
df_cleaned['text'] = df_cleaned.text.apply(lambda x : clean_text(x))

## Get sentiment

In [44]:
# flair
sentiment_model = flair.models.TextClassifier.load('en-sentiment')
# vader
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_flair(text):
  sentence = flair.data.Sentence(text)
  sentiment_model.predict(sentence)

  # empty sentence
  if len(sentence.labels) == 0:
    return -1
  else:
    if sentence.labels[0].value == "POSITIVE":
      return 1
    elif sentence.labels[0].value == "NEGATIVE":
      return 0

def get_sentiment_vader(text):
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0.3:
        return 1
    else: 
        return 0

def get_sentiment_textblob(text):
  polarity = TextBlob(text).sentiment.polarity    
  if polarity >= 0.3:
      return 1
  else: 
      return 0

# add probability and sentiment predictions to tweets dataframe
df_cleaned['sentiment_flair'] = df_cleaned.text.apply(lambda x : get_sentiment_flair(x))
df_cleaned['sentiment_vader'] = df_cleaned.text.apply(lambda x : get_sentiment_vader(x))
df_cleaned['sentiment_blob'] = df_cleaned.text.apply(lambda x : get_sentiment_textblob(x))

2021-04-01 19:20:24,566 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


In [45]:
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,created_at,text,sentiment_flair,sentiment_vader,sentiment_blob
0,0,2021-04-01T14:09:24.000Z,His account must‘ve been hacked! bitcoin ToTh...,0,0,0
1,1,2021-04-01T14:09:24.000Z,: New Airdrop: RaifeidoDAO (RDAO)\nReward: 20...,1,1,0
2,2,2021-04-01T14:09:24.000Z,: 1 Bitcoin vs 1kg of Gold.\n\nThis one isn't...,0,1,0
3,3,2021-04-01T14:09:24.000Z,There’s a whole lot of wishful thinking going ...,0,0,0
4,4,2021-04-01T14:09:24.000Z,: Thinking about not tweeting about bitcoin t...,0,0,0


In [47]:
# remove -1 for Vader
df_cleaned = df_cleaned[df_cleaned["sentiment_flair"] >= 0].copy()
df_cleaned.to_csv('BTC_sentiment.csv')

In [67]:
df_cleaned = pd.read_csv('BTC_sentiment.csv')
df_cleaned['created_at'] = pd.to_datetime(df_cleaned['created_at']).dt.strftime('%Y-%m-%d')

# df_cleaned['created_at'] = pd.to_datetime(df_cleaned['created_at']).dt.strftime('%Y-%m-%dT%H') 
# group by hour just if there would be a huge sample

In [68]:
df_cleaned.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,created_at,text,sentiment_flair,sentiment_vader,sentiment_blob
0,0,0,2021-04-01,His account must‘ve been hacked! bitcoin ToTh...,0,0,0
1,1,1,2021-04-01,: New Airdrop: RaifeidoDAO (RDAO)\nReward: 20...,1,1,0
2,2,2,2021-04-01,: 1 Bitcoin vs 1kg of Gold.\n\nThis one isn't...,0,1,0
3,3,3,2021-04-01,There’s a whole lot of wishful thinking going ...,0,0,0
4,4,4,2021-04-01,: Thinking about not tweeting about bitcoin t...,0,0,0


## Stock comparison

### Get stock data

In [70]:
import yfinance as yf
from datetime import datetime

tsla = yf.Ticker("btc-usd")

tsla_stock = tsla.history(
    start = pd.to_datetime(df_cleaned['created_at']).dt.strftime('%Y-%m-%d').min(),
    end= pd.to_datetime(df_cleaned['created_at']).dt.strftime('%Y-%m-%d').max(), 
    interval='60m'
).reset_index()

### Get daily sentiment

In [71]:
df_grouped = df_cleaned.copy()
df_grouped['created_at'] = pd.to_datetime(df_grouped['created_at']).dt.strftime('%Y-%m-%d')

daily_flair_sentiment = (df_grouped.groupby(by="created_at").sum() / df_grouped.groupby(by="created_at").count())['sentiment_flair']
daily_vader_sentiment = (df_grouped.groupby(by="created_at").sum() / df_grouped.groupby(by="created_at").count())['sentiment_vader']
daily_blob_sentiment = (df_grouped.groupby(by="created_at").sum() / df_grouped.groupby(by="created_at").count())['sentiment_blob']

In [75]:
import plotly.offline as pyo
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=2, cols=1, subplot_titles=("BTC", "% Positive Tweets"))

# stock prices
fig.add_trace(go.Scatter(x=tsla_stock['Datetime'], y=tsla_stock['Close'], name='price'), row=1, col=1)

# sentiment
fig.add_trace(go.Scatter(x=daily_flair_sentiment.index, y=daily_flair_sentiment, name='Flair'), row=2, col=1)
fig.add_trace(go.Scatter(x=daily_vader_sentiment.index, y=daily_vader_sentiment, name='Vader'), row=2, col=1)
fig.add_trace(go.Scatter(x=daily_blob_sentiment.index, y=daily_blob_sentiment, name='TextBlob'), row=2, col=1)

fig.show()