In [2]:
import pandas as pd

In [3]:
# Load the exported tweets (JSON Lines: one JSON object per line) and print a
# column/dtype summary of the resulting frame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
TWEETS_PATH = '/Users/gabrieltaylor/Downloads/user-tweets.jsonl'
df = pd.read_json(TWEETS_PATH, lines=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12730 entries, 0 to 12729
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Text            12730 non-null  object
 1   UserName        12730 non-null  object
 2   LinkToTweet     12730 non-null  object
 3   TweetEmbedCode  12730 non-null  object
 4   CreatedAt       12730 non-null  object
dtypes: object(5)
memory usage: 497.4+ KB


### Compute Sentiment

In [4]:
import nltk
# Fetch the 'vader_lexicon' resource into the local nltk_data directory.
# The log below reports it as already up-to-date when it is present.
# Presumably this is the lexicon SentimentIntensityAnalyzer loads - TODO confirm.
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
# `re` is used by the tweet clean-up cell that follows.
import re

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/gabrieltaylor/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
# Remove @mentions and http(s) links from every tweet before scoring.
# A match runs from the '@' (or the URL scheme) up to the next whitespace, so
# the whitespace around a removed token is left in place.
mention_or_link = re.compile(r"(?:\@|https?\://)\S+")
df['Text'] = df['Text'].map(lambda tweet: mention_or_link.sub("", tweet))

In [6]:
# Show the first cleaned tweet (row label 0) to check the clean-up result.
df.loc[0, 'Text']

' I love the thought of a car drifting apparently endlessly through space and perhaps being discovered by an alien race millions of years in the future'

In [7]:
# Create the analyzer once; later cells reuse `sia` for the whole column.
sia = SentimentIntensityAnalyzer()

# Sanity check: score the first cleaned tweet and display the full score dict
# ('neg', 'neu', 'pos', 'compound').
first_tweet = df.loc[0, 'Text']
sia.polarity_scores(first_tweet)

{'neg': 0.0, 'neu': 0.846, 'pos': 0.154, 'compound': 0.6369}

In [8]:
def compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return sia.polarity_scores(text)['compound']


# Score every cleaned tweet and store one compound value per row.
df['Sentiment'] = df['Text'].apply(compound_score)

In [9]:
# Preview the first rows to confirm the new 'Sentiment' column sits next to the
# cleaned 'Text'.
df.head()

Unnamed: 0,Text,UserName,LinkToTweet,TweetEmbedCode,CreatedAt,Sentiment
0,I love the thought of a car drifting apparent...,elonmusk,http://twitter.com/elonmusk/status/93704198630...,"<blockquote class=""twitter-tweet""><p lang=""en""...","December 02, 2017 at 07:33PM",0.6369
1,Asimov's Foundation books should def be part ...,elonmusk,http://twitter.com/elonmusk/status/93709071522...,"<blockquote class=""twitter-tweet""><p lang=""en""...","December 02, 2017 at 10:46PM",0.5859
2,That's certainly the right way to go to store...,elonmusk,http://twitter.com/elonmusk/status/93710961569...,"<blockquote class=""twitter-tweet""><p lang=""en""...","December 03, 2017 at 12:01AM",0.34
3,To preserve the transcendent majesty &amp; spe...,elonmusk,http://twitter.com/elonmusk/status/93739733099...,"<blockquote class=""twitter-tweet""><p lang=""en""...","December 03, 2017 at 07:05PM",-0.3182
4,Busted,elonmusk,http://twitter.com/elonmusk/status/93739781363...,"<blockquote class=""twitter-tweet""><p lang=""en""...","December 03, 2017 at 07:07PM",0.0


### Convert Tweet Timestamps to Dates

In [10]:
from datetime import datetime 

In [11]:
# Preview the separator clean-up on the first raw timestamp.
# str.replace removes every occurrence of its argument, so "at" is dropped
# wherever it appears in the string, leaving a double space where " at " was.
sample_stamp = df['CreatedAt'][0]
sample_stamp.replace(",", "").replace("at", "")

'December 02 2017  07:33PM'

In [12]:
# Parse the first raw timestamp (e.g. 'December 02, 2017 at 07:33PM') into a
# calendar date.
# The ',' and 'at' separators are matched literally by the format string rather
# than being deleted beforehand with str.replace, which would remove "at"
# anywhere in the text. A string that does not have this exact layout raises
# ValueError.
datetime.strptime(df['CreatedAt'][0], '%B %d, %Y at %I:%M%p').date()

datetime.date(2017, 12, 2)

In [13]:
# Convert every 'CreatedAt' string (e.g. 'December 02, 2017 at 07:33PM') into a
# calendar date so tweets can later be grouped per day.
# - The format string matches the ',' and 'at' separators literally, so nothing
#   has to be stripped with str.replace first (which would delete "at" anywhere
#   in the value).
# - pd.to_datetime parses the whole column in one call instead of looping in
#   Python; a value that does not fit the format raises ValueError.
# - `.dt.date` drops the time of day and yields datetime.date objects, so the
#   column stays dtype object.
df['CreatedAt'] = pd.to_datetime(df['CreatedAt'], format='%B %d, %Y at %I:%M%p').dt.date

In [14]:
# Re-check the schema: 'Sentiment' should now be listed as float64, while
# 'CreatedAt' holds date objects and is therefore still reported as object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12730 entries, 0 to 12729
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Text            12730 non-null  object 
 1   UserName        12730 non-null  object 
 2   LinkToTweet     12730 non-null  object 
 3   TweetEmbedCode  12730 non-null  object 
 4   CreatedAt       12730 non-null  object 
 5   Sentiment       12730 non-null  float64
dtypes: float64(1), object(5)
memory usage: 596.8+ KB


### Aggregate Sentiment By Date

In [16]:
def sum_square(x):
    """Return the sum of the squared values in ``x``.

    ``x`` must support element-wise ``** 2`` and be iterable (for example a
    pandas Series). For real-valued input every squared term is non-negative,
    so the sign of the individual values does not affect the total.
    """
    squared = x ** 2
    return sum(squared)

In [17]:
# Collapse the tweets to one row per calendar day: 'Sentiment' becomes the sum
# of squared per-tweet scores for that day, and the grouping key is exposed as
# a regular column named 'Date'.
df_by_day = (
    df.groupby('CreatedAt')
    .agg({'Sentiment': sum_square})
    .reset_index()
    .rename(columns={'CreatedAt': 'Date'})
)

In [18]:
# Display the per-day aggregate (one row per date that has at least one tweet).
df_by_day

Unnamed: 0,Date,Sentiment
0,2017-12-02,0.748920
1,2017-12-03,1.090182
2,2017-12-07,0.000000
3,2017-12-08,0.323369
4,2017-12-11,1.341099
...,...,...
1341,2021-11-30,2.011338
1342,2021-12-01,1.539978
1343,2021-12-02,0.462352
1344,2021-12-03,3.524062


### Stock Prices

In [19]:
# Load the TSLA price history and print a column/dtype summary.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
PRICES_PATH = "/Users/gabrieltaylor/Downloads/TSLA.csv"
prices = pd.read_csv(PRICES_PATH)
prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1008 non-null   object 
 1   Open       1008 non-null   float64
 2   High       1008 non-null   float64
 3   Low        1008 non-null   float64
 4   Close      1008 non-null   float64
 5   Adj Close  1008 non-null   float64
 6   Volume     1008 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 55.2+ KB


In [93]:
# Inspect the raw date string of the first price row (row label 0).
prices.loc[0, 'Date']

'2017-12-04'

In [20]:
# Try the 'YYYY-MM-DD' parse on the first price row before applying it to the
# whole column.
first_price_day = prices.loc[0, 'Date']
datetime.strptime(first_price_day, '%Y-%m-%d').date()

datetime.date(2017, 12, 4)

In [21]:
def parse_iso_day(text):
    """Parse a 'YYYY-MM-DD' string into a datetime.date."""
    return datetime.strptime(text, '%Y-%m-%d').date()


# Convert the price dates to datetime.date objects, the same type the tweet
# aggregate uses for its 'Date' column, so the two frames can be joined on it.
prices['Date'] = prices['Date'].map(parse_iso_day)

In [22]:
# Join prices with the per-day sentiment on 'Date'. An inner join keeps only
# dates present in both frames, so days without a price row or without any
# tweets are dropped.
final_df = prices.merge(df_by_day, how='inner', on='Date')

In [23]:
# Persist the merged frame for downstream analysis.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# unnamed leading column; pass index=False if consumers do not need it.
# NOTE(review): absolute, machine-specific output path.
OUTPUT_PATH = '/Users/gabrieltaylor/Python/STAT429/Elon.csv'
final_df.to_csv(OUTPUT_PATH)