In [1]:
import pandas as pd

In [57]:
# Load the scraped tweets (JSON Lines: one JSON record per line) and list the columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
tweets_path = '/Users/gabrieltaylor/Downloads/user-tweets.jsonl'
df = pd.read_json(tweets_path, lines=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12730 entries, 0 to 12729
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Text            12730 non-null  object
 1   UserName        12730 non-null  object
 2   LinkToTweet     12730 non-null  object
 3   TweetEmbedCode  12730 non-null  object
 4   CreatedAt       12730 non-null  object
dtypes: object(5)
memory usage: 497.4+ KB


### Compute Sentiment

In [21]:
import re

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon resource that the sentiment cells below rely on.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/gabrieltaylor/nltk_data...


In [17]:
# Remove @mentions and http(s) links (each runs up to the next whitespace) from every tweet.
# Surrounding whitespace is left as-is, so a tweet that began with a mention keeps a leading space.
_mention_or_url = re.compile(r"(?:\@|https?\://)\S+")
df['Text'] = df['Text'].map(lambda tweet: _mention_or_url.sub("", tweet))

In [23]:
# Spot-check the cleaned text of the first tweet (row label 0).
df.loc[0, 'Text']

' I love the thought of a car drifting apparently endlessly through space and perhaps being discovered by an alien race millions of years in the future'

In [22]:
# Create the analyzer once and try it on the first tweet; the result is a dict of
# 'neg' / 'neu' / 'pos' / 'compound' scores.
sia = SentimentIntensityAnalyzer()
first_tweet = df.loc[0, 'Text']
sia.polarity_scores(first_tweet)

{'neg': 0.0, 'neu': 0.846, 'pos': 0.154, 'compound': 0.6369}

In [58]:
# Score every tweet and keep only the 'compound' value as the per-tweet sentiment.
df['Sentiment'] = df['Text'].map(lambda tweet: sia.polarity_scores(tweet)['compound'])

In [54]:
# Preview the first five rows, which now include the Sentiment column.
df.head(n=5)

Unnamed: 0,Text,UserName,LinkToTweet,TweetEmbedCode,CreatedAt,Sentiment
0,@highqualitysh1t I love the thought of a car d...,elonmusk,http://twitter.com/elonmusk/status/93704198630...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2021-12-04,0.6369
1,@novaspivack Asimov's Foundation books should ...,elonmusk,http://twitter.com/elonmusk/status/93709071522...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2021-12-04,0.5859
2,@novaspivack That's certainly the right way to...,elonmusk,http://twitter.com/elonmusk/status/93710961569...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2021-12-04,0.34
3,To preserve the transcendent majesty &amp; spe...,elonmusk,http://twitter.com/elonmusk/status/93739733099...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2021-12-04,-0.3182
4,@harrisonlingren @JW8888888 Busted,elonmusk,http://twitter.com/elonmusk/status/93739781363...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2021-12-04,0.0


### Parse Tweet Timestamps Into Dates

In [30]:
from datetime import datetime 

In [41]:
# Look at the first timestamp once the comma and every "at" substring are removed.
first_created_raw = df['CreatedAt'][0]
first_created_raw.replace(",", "").replace("at", "")

'December 02 2017  07:33PM'

In [60]:
# Dry run of the conversion on row 0: clean the string, parse it, keep only the calendar date.
first_created_clean = df['CreatedAt'][0].replace(",", "").replace("at", "")
first_created_dt = datetime.strptime(first_created_clean, '%B %d %Y %I:%M%p')
first_created_dt.date()

datetime.date(2017, 12, 2)

In [61]:
def _created_at_to_date(raw):
    """Turn one raw CreatedAt string into a ``datetime.date``.

    The comma and every "at" substring are removed first, then the remainder is
    parsed as '%B %d %Y %I:%M%p' and the time of day is discarded.
    """
    cleaned = raw.replace(",", "").replace("at", "")
    return datetime.strptime(cleaned, '%B %d %Y %I:%M%p').date()


# Overwrites CreatedAt in place: after this cell the column holds date objects, not
# strings, so the cell cannot be re-run without reloading df.
df['CreatedAt'] = df['CreatedAt'].map(_created_at_to_date)

In [62]:
# Re-inspect the frame after adding Sentiment and converting CreatedAt.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12730 entries, 0 to 12729
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Text            12730 non-null  object 
 1   UserName        12730 non-null  object 
 2   LinkToTweet     12730 non-null  object 
 3   TweetEmbedCode  12730 non-null  object 
 4   CreatedAt       12730 non-null  object 
 5   Sentiment       12730 non-null  float64
dtypes: float64(1), object(5)
memory usage: 596.8+ KB


### Aggregate Sentiment By Date

In [90]:
# Mean tweet sentiment per calendar day; the key column is renamed to 'Date' so it
# matches the column the price table is merged on.
df_by_day = (
    df.groupby('CreatedAt')['Sentiment']
    .mean()
    .reset_index()
    .rename(columns={'CreatedAt': 'Date'})
)

In [92]:
# Check the type/value of the first aggregated date key.
df_by_day.loc[0, 'Date']

datetime.date(2017, 12, 2)

### Stock Prices

In [78]:
# Load the TSLA price history from CSV and list its columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
prices_path = "/Users/gabrieltaylor/Downloads/TSLA.csv"
prices = pd.read_csv(prices_path)
prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1008 non-null   object 
 1   Open       1008 non-null   float64
 2   High       1008 non-null   float64
 3   Low        1008 non-null   float64
 4   Close      1008 non-null   float64
 5   Adj Close  1008 non-null   float64
 6   Volume     1008 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 55.2+ KB


In [93]:
# Inspect the raw format of the first price date.
prices.loc[0, 'Date']

'2017-12-04'

In [94]:
# Dry run on row 0: parse the 'YYYY-MM-DD' string and keep only the date part.
first_price_dt = datetime.strptime(prices['Date'][0], '%Y-%m-%d')
first_price_dt.date()

datetime.date(2017, 12, 4)

In [95]:
def _price_date_to_date(raw):
    """Parse one '%Y-%m-%d' string from the price table into a ``datetime.date``."""
    return datetime.strptime(raw, '%Y-%m-%d').date()


# Overwrites Date in place with date objects so it has the same type as df_by_day['Date'].
prices['Date'] = prices['Date'].map(_price_date_to_date)

In [114]:
# Inner join on Date: only days present in BOTH the price table and the daily
# sentiment table survive; dates found in just one of them are dropped.
final_df = prices.merge(df_by_day, on='Date', how='inner')

In [115]:
# Pairwise Pearson correlation between the price columns and daily mean sentiment.
# 'Date' holds datetime.date objects (object dtype). Since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns by default and raises on them, so restrict the
# frame to numeric columns explicitly; this works on older pandas versions as well.
final_df.select_dtypes(include='number').corr()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Sentiment
Open,1.0,0.999622,0.999469,0.999032,0.999032,-0.258575,-0.047899
High,0.999622,1.0,0.999375,0.999557,0.999557,-0.251106,-0.04941
Low,0.999469,0.999375,1.0,0.999565,0.999565,-0.265529,-0.048124
Close,0.999032,0.999557,0.999565,1.0,1.0,-0.257407,-0.050089
Adj Close,0.999032,0.999557,0.999565,1.0,1.0,-0.257407,-0.050089
Volume,-0.258575,-0.251106,-0.265529,-0.257407,-0.257407,1.0,0.01773
Sentiment,-0.047899,-0.04941,-0.048124,-0.050089,-0.050089,0.01773,1.0
