In [42]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from datetime import datetime

# Load the tweet dataset from the local CSV file into a DataFrame.
data = pd.read_csv('tesla.csv')

In [43]:
# Peek at the first five raw tweet texts.
data['tweet'].head(5)

0                                              TSLA欲しい
1    “There’s so many semi trucks and pickup trucks...
2    If you are going to buy a Tesla, hold onto you...
3    Can’t remember the last time $TSLA closed afte...
4               Wake me up when $TSLA reaches $1,000 😴
Name: tweet, dtype: object

In [44]:
#######################
## Tweet Sentiment Analysis
#######################

# TextBlob scores each tweet and returns two values:
#   - polarity: the sentiment itself, from -1.0 (negative) to +1.0 (positive)
#   - subjectivity: how opinionated the text is, from 0.0 (objective) to
#     1.0 (subjective). It is NOT a confidence score for the polarity.

# Score every tweet once, then unpack the (polarity, subjectivity) pair into
# two columns without leaving a temporary column on `data`.
tweet_sentiment = data['tweet'].apply(lambda text: TextBlob(text).sentiment)
data['tweet_sentiment_polarity'] = tweet_sentiment.apply(lambda s: s.polarity)
data['tweet_sentiment_subjectivity'] = tweet_sentiment.apply(lambda s: s.subjectivity)

In [45]:
# Show the five tweets with the highest polarity, alongside both scores.
sentiment_columns = ['tweet', 'tweet_sentiment_polarity', 'tweet_sentiment_subjectivity']
(
    data[sentiment_columns]
    .sort_values(by='tweet_sentiment_polarity', ascending=False)
    .head()
)

Unnamed: 0,tweet,tweet_sentiment_polarity,tweet_sentiment_subjectivity
6386,The best investments I’ve made involve one tra...,1.0,0.3
932,Who wore it best: $TSLA or Burj Khalifa? http...,1.0,0.3
636,$tsla batteries are the best of all electric v...,1.0,0.3
1606,"And just like that, Q420 is trending to be the...",1.0,0.3
8395,President Trump Praises @ElonMusk ”One Of Our ...,1.0,1.0


In [46]:
### TEST: keep only tweets with strictly positive sentiment
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments in the following cells do not write into a slice
# (which pandas reports as SettingWithCopyWarning).
data = data[data['tweet_sentiment_polarity'] > 0].copy()

In [47]:
# Combine the separate 'date' and 'time' text columns into one timestamp column.
# pd.to_datetime parses the whole column at once with an explicit format
# instead of calling datetime.strptime row by row.
data['datetime'] = pd.to_datetime(
    data['date'] + " " + data['time'],
    format='%Y-%m-%d %H:%M:%S',
)

In [48]:
# List the columns now available: the raw tweet fields plus the derived
# sentiment and datetime columns added above.
data.columns

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest', 'tweet_sentiment_polarity',
       'tweet_sentiment_subjectivity', 'datetime'],
      dtype='object')

In [49]:
# Most recent tweets first.
recent_columns = ['username', 'tweet', 'datetime']
data[recent_columns].sort_values(by='datetime', ascending=False).head()

Unnamed: 0,username,tweet,datetime
1,gerberkawasaki,“There’s so many semi trucks and pickup trucks...,2021-01-05 09:48:52
3,sawyermerritt,Can’t remember the last time $TSLA closed afte...,2021-01-05 08:17:57
5,goddesstessaa,Hi guys! Anyone interested in joining a stocks...,2021-01-05 07:16:52
6,elonsworld,$TSLA should do a new 7-1 stock split,2021-01-05 06:09:36
7,alternatejones,"In 2014, Elon predicted that Tesla would *prod...",2021-01-05 06:06:56


In [50]:
# Age of each tweet relative to the moment this cell runs, so the values
# change on every execution.
snapshot_time = datetime.now()
data['time_from_now'] = snapshot_time - data['datetime']

In [51]:
# Youngest tweets first, with their engagement and polarity.
recency_columns = ['username', 'tweet', 'likes_count', 'tweet_sentiment_polarity', 'time_from_now']
(
    data[recency_columns]
    .sort_values(by='time_from_now', ascending=True)
    .head()
)

Unnamed: 0,username,tweet,likes_count,tweet_sentiment_polarity,time_from_now
1,gerberkawasaki,“There’s so many semi trucks and pickup trucks...,261,0.45,1 days 05:16:10.452720
3,sawyermerritt,Can’t remember the last time $TSLA closed afte...,177,0.18,1 days 06:47:05.452720
5,goddesstessaa,Hi guys! Anyone interested in joining a stocks...,208,0.25,1 days 07:48:10.452720
6,elonsworld,$TSLA should do a new 7-1 stock split,401,0.136364,1 days 08:55:26.452720
7,alternatejones,"In 2014, Elon predicted that Tesla would *prod...",379,0.5,1 days 08:58:06.452720


In [52]:
# Daily mean of polarity and of each engagement counter.
daily_metric_columns = ['tweet_sentiment_polarity', 'likes_count', 'replies_count', 'retweets_count']
data_dategrouped = data.groupby('date')[daily_metric_columns].mean()

In [53]:
# Number of tweets per day (non-null ids), attached next to the daily means.
tweets_per_day = data.groupby('date')['id'].count()
data_dategrouped['num_tweets'] = tweets_per_day
data_dategrouped.head()

Unnamed: 0_level_0,tweet_sentiment_polarity,likes_count,replies_count,retweets_count,num_tweets
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01,0.5,446.0,11.0,35.0,1
2020-01-02,0.223723,399.142857,24.857143,46.142857,7
2020-01-03,0.288614,535.6,25.75,73.0,20
2020-01-04,0.224613,445.714286,18.428571,52.0,7
2020-01-05,0.319618,1065.75,33.75,168.75,4


In [54]:
import plotly.express as px
# Line chart of the daily mean tweet polarity; the x-axis is the grouped
# frame's 'date' index.
fig = px.line(data_dategrouped, x=data_dategrouped.index, y='tweet_sentiment_polarity')
fig.show()

### Stock Ticker 

In [55]:
import yfinance as yf
from yahoofinancials import YahooFinancials  # NOTE(review): not referenced in any visible cell

# Daily TSLA price history requested from 2020-01-01 onward.
stock_df = yf.download('TSLA', interval='1d', start='2020-01-01')

[*********************100%***********************]  1 of 1 completed


In [56]:
# Align the daily tweet aggregates with the price rows by calendar date.
# The grouped frame is indexed by 'YYYY-MM-DD' strings while the price frame
# is indexed by dates, so convert explicitly rather than relying on join()
# to coerce strings to timestamps (that coercion is pandas-version dependent).
tweets_by_day = data_dategrouped.copy()
tweets_by_day.index = pd.to_datetime(tweets_by_day.index)
stock_df = stock_df.join(tweets_by_day)
stock_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,tweet_sentiment_polarity,likes_count,replies_count,retweets_count,num_tweets
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-12-31,81.0,84.258003,80.416,83.666,83.666,51428500,,,,,
2020-01-02,84.900002,86.139999,84.342003,86.052002,86.052002,47660500,0.223723,399.142857,24.857143,46.142857,7.0
2020-01-03,88.099998,90.800003,87.384003,88.601997,88.601997,88892500,0.288614,535.6,25.75,73.0,20.0
2020-01-06,88.094002,90.311996,88.0,90.307999,90.307999,50665000,0.299827,416.142857,18.857143,46.571429,7.0
2020-01-07,92.279999,94.325996,90.671997,93.811996,93.811996,89410500,0.196893,490.444444,25.333333,63.611111,18.0


In [57]:
# Summary statistics for the joined price + tweet-aggregate frame.
stock_df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,tweet_sentiment_polarity,likes_count,replies_count,retweets_count,num_tweets
count,256.0,256.0,256.0,256.0,256.0,256.0,254.0,254.0,254.0,254.0,254.0
mean,291.674032,299.935766,283.333531,292.647672,292.647672,60458840.0,0.279593,508.400509,29.830666,50.113787,12.740157
std,171.496363,175.675161,167.394476,172.970563,172.970563,46117010.0,0.087016,444.795017,15.007502,32.902996,10.947025
min,74.940002,80.972,70.101997,72.244003,72.244003,6362400.0,0.05,192.0,4.0,10.0,1.0
25%,148.156498,154.881004,143.027,149.747505,149.747505,22632000.0,0.224705,355.825,20.7625,33.15,5.0
50%,249.817001,260.57901,245.163994,257.453995,257.453995,50462650.0,0.277385,423.671429,26.80625,42.318182,9.0
75%,422.055,431.762497,410.944511,421.430008,421.430008,85806950.0,0.324746,531.988971,36.352273,55.383333,17.0
max,723.659973,744.48999,719.200012,735.109985,735.109985,304694000.0,0.75,6075.0,117.321429,342.125,53.0


In [58]:
# tweets per day x mean likes per tweet = total likes received that day
stock_df['total_likes'] = stock_df['num_tweets'].mul(stock_df['likes_count'])

In [59]:
# Close price and daily tweet count drawn against one shared y-axis,
# with a range slider under the x-axis for zooming.
fig = px.line(stock_df, x=stock_df.index, y=['Close', 'num_tweets'])
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [60]:
# Multiple Axes graph
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Single subplot that carries both a primary and a secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Opening price goes on the secondary axis, tweet volume on the primary axis
open_trace = go.Scatter(x=stock_df.index, y=stock_df['Open'], name="TSLA daily open")
tweets_trace = go.Scatter(
    x=stock_df.index,
    y=stock_df['num_tweets'],
    opacity=0.5,
    name="TSLA daily tweets <br> with >100 likes",
)
fig.add_trace(open_trace, secondary_y=True)
fig.add_trace(tweets_trace, secondary_y=False)

# Figure title, then x-axis and y-axis titles
fig.update_layout(title_text="TSLA Open vs number of tweets")
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="<b>primary</b> Num Tweets", secondary_y=False)
fig.update_yaxes(title_text="<b>secondary</b> Stock price", secondary_y=True)

fig.show()



In [61]:

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces: opening price on the secondary axis, total likes on the primary axis
fig.add_trace(
    go.Scatter(x=stock_df.index, y=stock_df['Open'], name="TSLA daily open"),
    secondary_y=True,
)

fig.add_trace(
    go.Scatter(x=stock_df.index, y=stock_df['total_likes'], opacity=0.5, name="TSLA total likes"),
    secondary_y=False,
)

# Add figure title (the price trace is 'Open', so the title names Open, not High)
fig.update_layout(
    title_text="TSLA Open vs total tweet likes"
)

# Set x-axis title
fig.update_xaxes(title_text="Date")

# Set y-axes titles (the primary axis plots total_likes, not sentiment)
fig.update_yaxes(title_text="<b>primary</b> Total Likes", secondary_y=False)
fig.update_yaxes(title_text="<b>secondary</b> Stock price", secondary_y=True)

fig.show()

In [62]:
# Pairwise Pearson correlations between same-day price and tweet metrics.
same_day_corr_columns = ['Close', 'High', 'likes_count', 'num_tweets', 'total_likes','tweet_sentiment_polarity']
stock_df[same_day_corr_columns].corr()

Unnamed: 0,Close,High,likes_count,num_tweets,total_likes,tweet_sentiment_polarity
Close,1.0,0.998985,0.038271,0.450286,0.361818,-0.076504
High,0.998985,1.0,0.038543,0.462524,0.371993,-0.072597
likes_count,0.038271,0.038543,1.0,0.058438,0.566535,0.009685
num_tweets,0.450286,0.462524,0.058438,1.0,0.808616,0.01562
total_likes,0.361818,0.371993,0.566535,0.808616,1.0,0.013858
tweet_sentiment_polarity,-0.076504,-0.072597,0.009685,0.01562,0.013858,1.0


### Forecasting Stock Price using Num Tweets and Sentiment

In [63]:
from datetime import timedelta

# Shifts tweet dates four days forward, so that we can match tweets with 'future' stock price.
data_forecasted = data.copy(deep=True)
data_forecasted['datetime'] = data_forecasted['datetime'] + timedelta(days=4)
# Sanity check: prints the per-row offset between shifted and original timestamps.
print(data_forecasted['datetime'] - data['datetime'])

1      4 days
3      4 days
5      4 days
6      4 days
7      4 days
        ...  
8783   4 days
8784   4 days
8785   4 days
8789   4 days
8793   4 days
Name: datetime, Length: 4127, dtype: timedelta64[ns]


In [64]:
# Re-derive the calendar date ('YYYY-MM-DD' text) from the shifted timestamps,
# using the vectorized .dt accessor instead of a per-row strftime call
data_forecasted['date'] = data_forecasted['datetime'].dt.strftime("%Y-%m-%d")

# Groups by the shifted date once: mean sentiment/engagement plus tweet count
shifted_by_date = data_forecasted.groupby('date')
data_forecasted_dategrouped = shifted_by_date[['tweet_sentiment_polarity', 'likes_count', 'replies_count', 'retweets_count']].mean()
data_forecasted_dategrouped['num_tweets'] = shifted_by_date['id'].count()
data_forecasted_dategrouped.head()

Unnamed: 0_level_0,tweet_sentiment_polarity,likes_count,replies_count,retweets_count,num_tweets
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-05,0.5,446.0,11.0,35.0,1
2020-01-06,0.223723,399.142857,24.857143,46.142857,7
2020-01-07,0.288614,535.6,25.75,73.0,20
2020-01-08,0.224613,445.714286,18.428571,52.0,7
2020-01-09,0.319618,1065.75,33.75,168.75,4


In [65]:
# Download the prices again from 2020-01-05. This rebinds stock_df, so the
# columns joined or derived in the previous section (e.g. total_likes) are gone.
stock_df = yf.download('TSLA', interval='1d', start='2020-01-05')
stock_df.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-06,88.094002,90.311996,88.0,90.307999,90.307999,50665000
2020-01-07,92.279999,94.325996,90.671997,93.811996,93.811996,89410500
2020-01-08,94.739998,99.697998,93.646004,98.428001,98.428001,155721500
2020-01-09,99.419998,99.760002,94.573997,96.267998,96.267998,142202000
2020-01-10,96.358002,96.987999,94.739998,95.629997,95.629997,64797500


In [66]:
# Align the time-shifted tweet aggregates with the price rows by calendar date.
# The grouped frame is indexed by 'YYYY-MM-DD' strings while the price frame
# is indexed by dates, so convert explicitly rather than relying on join()
# to coerce strings to timestamps (that coercion is pandas-version dependent).
shifted_tweets_by_day = data_forecasted_dategrouped.copy()
shifted_tweets_by_day.index = pd.to_datetime(shifted_tweets_by_day.index)
stock_df = stock_df.join(shifted_tweets_by_day)
stock_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,tweet_sentiment_polarity,likes_count,replies_count,retweets_count,num_tweets
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-01-06,88.094002,90.311996,88.0,90.307999,90.307999,50665000,0.223723,399.142857,24.857143,46.142857,7.0
2020-01-07,92.279999,94.325996,90.671997,93.811996,93.811996,89410500,0.288614,535.6,25.75,73.0,20.0
2020-01-08,94.739998,99.697998,93.646004,98.428001,98.428001,155721500,0.224613,445.714286,18.428571,52.0,7.0
2020-01-09,99.419998,99.760002,94.573997,96.267998,96.267998,142202000,0.319618,1065.75,33.75,168.75,4.0
2020-01-10,96.358002,96.987999,94.739998,95.629997,95.629997,64797500,0.299827,416.142857,18.857143,46.571429,7.0


In [67]:
# Multiple Axes graph
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Single subplot that carries both a primary and a secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# (trace, goes_on_secondary_axis): price on the secondary axis, then the
# time-shifted tweet counts on the primary axis
traces = [
    (go.Scatter(x=stock_df.index, y=stock_df['Open'], name="TSLA daily open"), True),
    (go.Scatter(x=stock_df.index, y=stock_df['num_tweets'], opacity=0.5, name="TSLA daily tweets <br> with >100 likes <br> -- timeshifted by 4 days"), False),
]
for trace, on_secondary in traces:
    fig.add_trace(trace, secondary_y=on_secondary)

# Figure title, then x-axis and y-axis titles
fig.update_layout(title_text="TSLA Open vs number of tweets")
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="<b>primary</b> Num Tweets", secondary_y=False)
fig.update_yaxes(title_text="<b>secondary</b> Stock price", secondary_y=True)

fig.show()

In [68]:
# Pairwise Pearson correlations between price and the time-shifted tweet metrics.
shifted_corr_columns = ['Close', 'High', 'likes_count', 'retweets_count','num_tweets', 'tweet_sentiment_polarity']
stock_df[shifted_corr_columns].corr()

Unnamed: 0,Close,High,likes_count,retweets_count,num_tweets,tweet_sentiment_polarity
Close,1.0,0.99897,0.024453,-0.226579,0.447558,-0.030175
High,0.99897,1.0,0.022144,-0.228549,0.444244,-0.028565
likes_count,0.024453,0.022144,1.0,0.825502,-0.000123,0.044739
retweets_count,-0.226579,-0.228549,0.825502,1.0,-0.092864,0.087579
num_tweets,0.447558,0.444244,-0.000123,-0.092864,1.0,-0.008921
tweet_sentiment_polarity,-0.030175,-0.028565,0.044739,0.087579,-0.008921,1.0


In [69]:
# Scatter of the time-shifted daily tweet count (x) against the close price (y).
fig = px.scatter(stock_df, x=stock_df['num_tweets'], y='Close', title='Num of Tweets (time-shifted by four days) vs Close Stock price for TSLA')
fig.update_xaxes(title_text='Number of Tweets',rangeslider_visible=True)
fig.update_yaxes(title_text='Close stock price')
fig.show()

In [70]:
# Drop every row that has at least one missing value, in place, then
# confirm that no nulls remain in any column.
stock_df.dropna(axis=0, how='any',inplace=True)
stock_df.isnull().sum()

Open                        0
High                        0
Low                         0
Close                       0
Adj Close                   0
Volume                      0
tweet_sentiment_polarity    0
likes_count                 0
replies_count               0
retweets_count              0
num_tweets                  0
dtype: int64

In [71]:
### Finding curve of best fit
from scipy.optimize import curve_fit

def objective(x, a, b, c):
    """Quadratic model a*x**2 + b*x + c evaluated at x."""
    quadratic_term = a*x**2
    linear_term = b*x
    return quadratic_term + linear_term + c

# Least-squares fit of the close price as a quadratic in the shifted tweet count
tweet_counts = stock_df['num_tweets'].to_numpy()
close_prices = stock_df['Close'].to_numpy()
popt, _ = curve_fit(objective, tweet_counts, close_prices)
popt

array([ -0.27343181,  20.01141184, 150.55457272])

In [72]:
# Evaluate the fitted quadratic on the whole column at once; objective() only
# uses arithmetic operators, so it accepts a Series and no per-row apply is needed.
stock_df['line_of_best_fit'] = objective(stock_df['num_tweets'], *popt)

In [73]:
# Same scatter as before, with the fitted quadratic values added as a second series.
fig = px.scatter(stock_df, x=stock_df['num_tweets'], y=['Close', 'line_of_best_fit'], title='Num of Tweets (time-shifted by four days) vs Close Stock price for TSLA')
fig.update_xaxes(title_text='Number of Tweets',rangeslider_visible=True)
fig.update_yaxes(title_text='Close stock price')
fig.show()

### Identifying the time shift (in days) that gives the best correlation between stock price and number of tweets

In [74]:
def get_correlation(data, stock='TSLA', start_date='2020-01-01', day_difference=1, prices=None):
    """
    get_correlation: Returns the Pearson correlation between the time-shifted daily number of
    tweets and the stock's closing price. Tweet timestamps are moved forward by day_difference
    days, so the tweet count from day_difference days earlier is compared with each day's close.

    By default the closing prices are downloaded on every call. Callers that evaluate many
    shifts can download once and pass the result as `prices` to skip the download.

    args:
        data (pd.DataFrame): DataFrame containing tweet information; only the 'datetime'
                             (timestamp) and 'id' columns are used
        stock (string): Stock Ticker of the stock we are analyzing (ignored when prices is given)
        start_date (string): 'YYYY-MM-DD' date to get stock prices from; it is shifted by
                             day_difference as well
        day_difference (int or float): The amount of the time-shift (in days)
        prices (pd.Series or pd.DataFrame, optional): Pre-downloaded closing prices indexed by
                             date. When None, prices are fetched with yf.download.

    returns:
        float: correlation coefficient, or NaN when fewer than two dates have both a tweet
               count and a closing price.
    """
    # float() lets numpy scalars (e.g. from np.arange / np.linspace) through;
    # timedelta rejects numpy integer types.
    shift = timedelta(days=float(day_difference))
    start = (datetime.strptime(start_date, '%Y-%m-%d') + shift).strftime('%Y-%m-%d')

    downloaded = prices is None
    if downloaded:
        prices = yf.download(stock, interval='1d', start=start)['Close']
    if isinstance(prices, pd.DataFrame):
        # NOTE(review): some yfinance versions appear to return a one-column
        # frame for ['Close']; take that single column -- confirm against the installed version.
        prices = prices.iloc[:, 0]

    # Normalise to a tz-naive, date-indexed Series named 'Close' so that the
    # join below matches on real timestamps rather than on strings.
    close = pd.Series(prices.to_numpy(), index=pd.DatetimeIndex(prices.index), name='Close')
    if close.index.tz is not None:
        close.index = close.index.tz_localize(None)
    if not downloaded:
        # Mirror the download path, which only requests prices from `start` on.
        close = close[close.index >= pd.Timestamp(start)]

    # Count tweets per shifted calendar day (time of day removed).
    shifted_day = (data['datetime'] + shift).dt.normalize()
    num_tweets = data['id'].groupby(shifted_day).count().to_frame('id')

    # Keep only the days that have both a tweet count and a closing price.
    merged = num_tweets.join(close, how='inner')
    if len(merged) < 2:
        # A correlation needs at least two paired observations.
        return np.nan

    return np.corrcoef(merged['id'], merged['Close'])[0, 1]

In [80]:
# Shifts of 0, 1, ..., 180 days (as floats) and the correlation obtained for each
x = np.linspace(0, 180, 181)
coeff = [get_correlation(data, day_difference=shift_days) for shift_days in x]

[  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.
  14.  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.  27.
  28.  29.  30.  31.  32.  33.  34.  35.  36.  37.  38.  39.  40.  41.
  42.  43.  44.  45.  46.  47.  48.  49.  50.  51.  52.  53.  54.  55.
  56.  57.  58.  59.  60.  61.  62.  63.  64.  65.  66.  67.  68.  69.
  70.  71.  72.  73.  74.  75.  76.  77.  78.  79.  80.  81.  82.  83.
  84.  85.  86.  87.  88.  89.  90.  91.  92.  93.  94.  95.  96.  97.
  98.  99. 100. 101. 102. 103. 104. 105. 106. 107. 108. 109. 110. 111.
 112. 113. 114. 115. 116. 117. 118. 119. 120. 121. 122. 123. 124. 125.
 126. 127. 128. 129. 130. 131. 132. 133. 134. 135. 136. 137. 138. 139.
 140. 141. 142. 143. 144. 145. 146. 147. 148. 149. 150. 151. 152. 153.
 154. 155. 156. 157. 158. 159. 160. 161. 162. 163. 164. 165. 166. 167.
 168. 169. 170. 171. 172. 173. 174. 175. 176. 177. 178. 179. 180.]
[*********************100%***********************]  1 of 1 completed
[***********

In [81]:
# Correlation coefficient as a function of the time shift; x is cast to int
# so the axis shows whole days.
fig = px.line(x=x.astype(int), y=coeff, title='Comparing Num Positive Tweets Correlation with Stock price vs <br> amount of days the tweets are time-shifted')
fig.update_xaxes(title_text='Num. of days time-shifted')
fig.update_yaxes(title_text="Pearson's correlation coefficient")
fig.show()