# Twitter vs Stock Price Data Analysis
In this particular analysis, we're looking at GameStop stock since the start of the new year. 

In [36]:
import pandas as pd 
import numpy as np 
import plotly.express as px
import traceback

## Data Cleaning
Here we clean the scraped dataset by removing null values and removing non-alphanumeric tweets as we can't analyze those. We also remove non-English tweets for now but we can think about analyzing different languages separately.

In [39]:
# Load data
tweet_data = pd.read_csv('data/tweet_data.csv')
print(tweet_data.shape)
tweet_data.isnull().sum()

(59662, 41)


Unnamed: 0                          0
id                                  0
conversation_id                     0
created_at                          0
date                                0
timezone                            0
place                           59594
tweet                               0
language                            0
hashtags                            0
cashtags                            0
user_id                             0
user_id_str                         0
username                            0
name                                3
day                                 0
hour                                0
link                                0
urls                                0
photos                              0
video                               0
thumbnail                       51151
retweet                             0
nlikes                              0
nreplies                            0
nretweets                           0
quote_url   

In [40]:
# Delete all columns with null values
tweet_data.drop(columns=['place', 'thumbnail', 'quote_url', 'near', 'geo', 'source', 'user_rt_id', 'user_rt', 'retweet_id', 'retweet_date', 'translate', 'trans_src', 'trans_dest'], inplace=True)

In [42]:
# Remove all non-alphanumeric tweets (because we can't find sentiment for them anyway)
# We want to remove things like non-Latin character based languages.
# NB: This may also be removing tweets with emojis, which we want to keep.

# Test string
test = 'Voulez-vous coucher avec moi :)?'
print(test, ' is ASCII: ', test.isascii())

tweet_data['is_alpha'] = tweet_data['tweet'].apply(lambda x: x.isascii())
print(tweet_data['is_alpha'].value_counts())

tweet_data = tweet_data[tweet_data['is_alpha']]
tweet_data.drop(columns=['is_alpha'])
print('Shape of tweet_data: ', tweet_data.shape)

Voulez-vous coucher avec moi :)?  is ASCII:  True
True    41949
Name: is_alpha, dtype: int64
Shape of tweet_data:  (41949, 29)


In [43]:
# Check for tweets that are empty
tweet_data['empty'] = tweet_data['tweet'].apply(lambda x: True if x == " " or x == '' else False)
print('Number of empty tweets: \n')
print(tweet_data['empty'].value_counts())

Number of empty tweets: 

False    41949
Name: empty, dtype: int64


In [44]:
# Get Latin character based language distribution
# We use a library called langdetect to detect which language the text is in
# Currently, we can use this to only keep English tweets, but in the future
# we can separately analyze different languages as well if we have enough
# support for them

from langdetect import detect

def detect_safe(x):
    try:
        language = detect(x)
    except Exception as error:
        language = '//~!!~ERR~!!~//'
        print(x)
    return language

# NB: This process takes a LONG time!
tweet_data['lang'] = tweet_data['tweet'].apply(lambda x: detect_safe(x))

# Remove all tweets that cause an error
# Most of these errors are strings with only URLs I believe
tweet_data = tweet_data[tweet_data['lang'] != '//~!!~ERR~!!~//']
print(tweet_data.shape)

.@GameStop  https://t.co/QAPacESSVv
(41948, 31)


In [46]:
# Plot of language distribution
fig = px.histogram(tweet_data, x='lang', title='Tweet Languages')
fig.show()

In [47]:
# Drop all non-English tweets
tweet_data = tweet_data[tweet_data['lang'] == 'en']
tweet_data.shape

(39229, 31)

In [48]:
# Save this cleaned data set
tweet_data.to_csv('data/tweet_data_cleaned.csv')

## Correlation with Stock Ticker
Here we look at the correlations of the dataset with the actual daily stock price. For this, we're going to use the ewm_processor class in ewm_correlation.py. This class calculates correlation between weighted tweet counts (by sentiment) against the stock ticker close. 



In [65]:
import yfinance as yf
from textblob import TextBlob
from datetime import datetime, timedelta
from ewm_correlation import ewm_processor

# Load cleaned data set
tweet_data = pd.read_csv('data/tweet_data_cleaned.csv')

### Tweet Sentiment Analysis
Here we take the scraped tweets, clean them and calculate their sentiment using NLP pre-trained models such as TextBlob. TextBlob is a pre-trained SVM (I believe) that maps extracted vectors from words to sentiment scores, and produces an overall sentiment for a string of text.

We apply TextBlob to all tweets in the tweet_data DataFrame, and save their sentiment polarity and subjectivity.

In [66]:
# An example
example_string = "Tesla stock is great! It's going to go nowhere but up!"
sentiment = TextBlob(example_string).sentiment
print(sentiment)

Sentiment(polarity=1.0, subjectivity=0.75)


### Comparing Stock Price with tweet counts

In [97]:
# Get Stock Price Data
stock_data = yf.download('GME', interval='1d', start_date='2021-01-01')

# Get Sentiment Scores for tweets
tweet_data['tweet_sentiment'] = tweet_data['tweet'].apply(lambda x: TextBlob(x).sentiment)
tweet_data['tweet_sentiment_polarity'] = tweet_data['tweet_sentiment'].apply(lambda x: x.polarity)
tweet_data['tweet_sentiment_subjectivity'] = tweet_data['tweet_sentiment'].apply(lambda x: x.subjectivity)
tweet_data.drop(columns=['tweet_sentiment'], inplace=True)

[*********************100%***********************]  1 of 1 completed


In [98]:
# Number of Tweets vs Stock Price

# Converting datetime string to YYYY-MM-DD only format for grouping
tweet_data['date_day'] = tweet_data['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime("%Y-%m-%d"))
num_tweets = tweet_data.groupby('date_day').count()['id']

# Joining stock_data with num_tweets
# Major problem here is that the NYSE doesn't trade on weekends, but people do tweet on weekends
# Going to do a right join and replace the NaN values with 0, just so we can see clearly on the graph
stock_data = stock_data.join(num_tweets, how='right')
stock_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,id
date_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01,,,,,,,451
2021-01-02,,,,,,,802
2021-01-03,,,,,,,921
2021-01-04,19.0,19.1,17.15,17.25,17.25,10022500.0,897
2021-01-05,17.35,18.08,17.23,17.370001,17.370001,4961500.0,829


In [106]:
# Multiple Axes graph
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=plot_data.index, y=plot_data['Close'], name="GME daily close"),
    secondary_y=True, 
)

fig.add_trace(
    go.Scatter(x=plot_data.index, y=plot_data['id'], opacity=0.5, name="GME daily tweets"),
    secondary_y=False,
)

# Add figure title
fig.update_layout(
    title_text="GME Close vs Daily No. of Tweets"
)

# Set x-axis title
fig.update_xaxes(title_text="Date")

# Set y-axes titles
fig.update_yaxes(title_text="<b>primary</b> Num Tweets", secondary_y=False)
fig.update_yaxes(title_text="<b>secondary</b> Stock price", secondary_y=True)

fig.show()

print('Pearson Correlation between num tweets and Close: ', plot_data.corr()['id']['Close'])

Pearson Correlation between num tweets and Close:  0.5217080159504974


## Weighted Correlations
We create a 'weighted sentiment' score which is the weighted average of the count of tweets per day with respect to the sentiment polarity. Thus, a higher sentiment score will result in a higher score value for a day compared to another day with the same number of tweets. We take the exponential weighted mean of this score with respect to time-shifted data.

In [59]:
# This derived class updates the process_all_stocks function to use data from input rather than scrape
class ewm_processor_cleaned_data(ewm_processor):
    # base_processor ==> ewm_processor ==> ewm_processor_cleaned_data
    def __init__(self):
        super().__init__()

    """
    process_all_stocks: Over-arching function that downloads tweets, downloads stock prices,
    and produces correlation for all days in the shift range. This is going to be quite a slow
    process, depending on the size of data required to download and process. 

    args:
        stock_ticker (str) : The stock to be analyzed by this process
        start_date (datetime.datetime object): Starting date from which tweets and stock
                                            ticker data is to be downloaded and processed
        min_likes (int): Twint search requirement - defines the minimum number of likes 
                        required for a tweet to be picked up by the scraper
        shift_range (int): Range of days - [0, shift_range] - the time-shifting is to be
                        done for.
    
    output:
        (dict): A dictionary containing mappings of stock acronyms to their correlation lists.
    """
    def process_all_stocks(self, data, stock_ticker, start_date, shift_range):
        output = {}

        # Check - if the amount of shift is greater than the number of days
        # we are taking tweets for, we have a problem
        assert(shift_range < (datetime.now() - start_date).days)

        # Check - start date has to be less than or equal to the current date
        assert(start_date <= datetime.now())
            
        # Calculate sentiment
        data['tweet_sentiment'] = data['tweet'].apply(lambda x: TextBlob(x).sentiment)
        data['tweet_sentiment_polarity'] = data['tweet_sentiment'].apply(lambda x: x.polarity)
        data['tweet_sentiment_subjectivity'] = data['tweet_sentiment'].apply(lambda x: x.subjectivity)
        data.drop(columns=['tweet_sentiment'], inplace=True)

        # Keep ONLY positive tweets
        # data = data[data['tweet_sentiment_polarity'] > 0]

        # Scrape stock ticker data
        stock_data = yf.download(stock_ticker, interval='1d', start=start_date)

        # Get correlation between time-shifted tweet data and stock ticker data
        out = []
        for i in range(0, shift_range):
            start_shifted = start_date + timedelta(days=i)
            out.append(self.get_correlation(data, stock_data, start_shifted, i))

        output[stock_ticker] = out

        return output 

In [64]:
    stock_ticker = 'GME'
    start_date = datetime.strptime('2021-01-01', '%Y-%m-%d')
    processor = ewm_processor_cleaned_data()
    output = processor.process_all_stocks(tweet_data, stock_ticker, start_date, 20)

    output = pd.DataFrame.from_dict(output)
    output.to_csv('data/tweet_correlations_ewm_gamestop.csv')
    fig = px.line(output, x=output.index, y=['GME'], title='Weighted Correlation between past tweets and stock price')
    fig.update_xaxes(title_text='Days past',rangeslider_visible=True)
    fig.update_yaxes(title_text='Correlation')
    fig.show()

[*********************100%***********************]  1 of 1 completed
