Purpose: Use Vader sentiment analyzer to collect sentiment for each tweet it the dataframe. (Will repeat multiple times for each year)

In [1]:
import pandas as pd
from textblob import TextBlob
import re
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import time 

In [2]:
path = "/Users/cocoramgopal/code/cryptocurrency1/data/raw/"

In [3]:
#Example using 2014 data 
original_df = pd.read_csv(path+"2014Data.csv")
original_df.dtypes


created_at       object
text             object
like_count        int64
retweet_count     int64
quote_count       int64
reply_count       int64
dtype: object

In [31]:
#Drop unecessary columns and convert text to string 
original_df = original_df.drop(['retweet_count', 'reply_count', 'quote_count', 'like_count'], axis = 1)
original_df['text_cleaned_string'] = pd.Series(original_df['text'], dtype = pd.StringDtype())


Unnamed: 0,created_at,text,text_cleaned_string
0,2014-01-30 23:58:43+00:00,@BitcoinPosse The bitcoin pins are and I wi...,@BitcoinPosse The bitcoin pins are and I wi...
1,2014-01-30 23:56:16+00:00,"In the last 10 mins, there were arb opps spann...","In the last 10 mins, there were arb opps spann..."
2,2014-01-30 23:55:38+00:00,"""I can't believe how much business I've gotten...","""I can't believe how much business I've gotten..."
3,2014-01-30 23:55:19+00:00,@dtcb @fascinated bitcoin as real life,@dtcb @fascinated bitcoin as real life
4,2014-01-30 23:54:48+00:00,Hey @BitSprinkle sprinkle me some #Bitcoin 12c...,Hey @BitSprinkle sprinkle me some #Bitcoin 12c...
...,...,...,...
716144,2014-12-01 00:00:15+00:00,1 #BTC (#Bitcoin) quotes:\n$376.08/$376.50 #Bi...,1 #BTC (#Bitcoin) quotes: $376.08/$376.50 #Bit...
716145,2014-12-01 00:00:05+00:00,Bitcoin's distributed asset ledger is the most...,Bitcoin's distributed asset ledger is the most...
716146,2014-12-01 00:00:05+00:00,One Bitcoin now worth $376.984. Market Cap $5....,One Bitcoin now worth $376.984. Market Cap $5....
716147,2014-12-01 00:00:02+00:00,The current BTC (#bitcoin) value in USD is $37...,The current BTC (#bitcoin) value in USD is $37...


In [32]:
#Convert Date to Datetime 
original_df['date_object'] = pd.to_datetime(original_df['created_at'], infer_datetime_format=True).dt.tz_localize(None)
original_df['datetime_striped'] = original_df['date_object'].dt.strftime('%b-%Y')
original_df['date_object'] =  original_df.date_object.dt.strftime("%Y-%m-%d") 

In [33]:
#Create new clean dataframe for sentiment analysis
clean_df = pd.DataFrame(original_df['text_cleaned_string'])
clean_df['month_year'] = original_df['datetime_striped']
clean_df['date'] =original_df['date_object']

In [35]:
#Function to process and clean tweets for analysis 

def process_tweets(tweet): 
    tweet = str(tweet)
    
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

	# remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
	
	# remove hashtags
	# only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    tweet = re.sub('r-', '', tweet)
    
    #remove html tags from tweets
    clean = re.compile('<.*?>')
    tweet = re.sub(clean, '', tweet)
    
    #remove mentions
    tweet = re.sub(r"@(\w+)", ' ', tweet, flags=re.MULTILINE)
    
    #remove /n 
    tweet=re.sub("\n","",tweet)
    
    #remove punctuation
    apostrophe_handled = re.sub("'", "'", tweet) 
    
    text  = re.sub("[^-9A-Za-z\s]", "" , apostrophe_handled)
    text = text.lower()
    return text

In [36]:
clean_df['text_cleaned_string'] = clean_df['text_cleaned_string'].apply(process_tweets)

In [37]:
#Function that applies the TextBlob sentiment analysis to each tweet 

def tb_enrich(ls):
    #Enriches a column of text with TextBlob Sentiment Analysis outputs
    tb_polarity = []
    tb_subject = []

    for tweet in ls:
        sentiment = TextBlob(tweet).sentiment
        tb_polarity.append(sentiment[0])
        tb_subject.append(sentiment[1])
    

    return tb_polarity, tb_subject

In [38]:
#Function called to determine overall sentiment 
def get_sentiment(ls):
    
    sentiment_list = []
    
    for tweet in ls: 
        if tweet > 0.00:
            sentiment = 'positive'
        elif tweet < 0.00:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'
        sentiment_list.append(sentiment)
    return sentiment_list 

In [39]:
clean_df["Polarity"], clean_df["Subjectivity"] = tb_enrich(list(clean_df['text_cleaned_string']))
clean_df['polarity_sentiment'] = get_sentiment(list(clean_df['Polarity']))

In [40]:
clean_df['polarity_sentiment'].value_counts()

neutral     394894
positive    229366
negative     91889
Name: polarity_sentiment, dtype: int64

In [41]:
#Get sentiment scores from Vader Sentiment Analyzer 
clean_df['scores'] = clean_df['text_cleaned_string'].apply(lambda tweet: SentimentIntensityAnalyzer().polarity_scores(tweet))

In [42]:
#Create a new column for each of the returned scores -- positive, neutral, negative, compound 
clean_df['compound']  = clean_df['scores'].apply(lambda score_dict: score_dict['compound'])
clean_df['neutral']  = clean_df['scores'].apply(lambda score_dict: score_dict['neu'])
clean_df['negative']  = clean_df['scores'].apply(lambda score_dict: score_dict['neg'])
clean_df['positive']  = clean_df['scores'].apply(lambda score_dict: score_dict['pos'])

#Get overall sentiment (positive, negative or neutral)
clean_df['sentiment'] = get_sentiment(list(clean_df['compound']))

In [44]:
clean_df.to_csv('TestClean2014Data.csv', sep = ',', index = False)

In [169]:
#Compute average scores delivered by Vader for each month and combine into one dataframe 

scores_df = clean_df.groupby('month_year')['positive'].mean().reset_index(name = 'avg_positive')
scores_df2 = clean_df.groupby('month_year')['neutral'].mean().reset_index(name = 'avg_neutral')
scores_df3 = clean_df.groupby('month_year')['negative'].mean().reset_index(name = 'avg_negative')
scores_df4 = clean_df.groupby('month_year')['compound'].mean().reset_index(name = 'avg_compound')

merged_score1 = pd.merge(scores_df, scores_df2, on = 'month_year')
merged_score2 = merged_score1.merge(scores_df3, on = 'month_year')
merged_score3 = merged_score2.merge(scores_df4, on = 'month_year')

In [176]:
merged_score3['datetime'] = pd.to_datetime(merged_score3['month_year'])
merged_score3 = merged_score3.sort_values(by='datetime').reset_index(drop=True)

In [178]:
#Compute average polarity and subjectivity 
polarity_df = clean_df.groupby('month_year')['Polarity'].mean().reset_index(name = 'avg_polarity')
subjectivity_df = clean_df.groupby('month_year')['Subjectivity'].mean().reset_index(name = 'avg_subjectivity')

merged_polarity = pd.merge(polarity_df, subjectivity_df, on = 'month_year')
merged_polarity['datetime'] = pd.to_datetime(merged_polarity['month_year'])
merged_polarity = merged_polarity.sort_values(by='datetime')

Unnamed: 0,month_year,avg_polarity,avg_subjectivity,datetime
4,Jan-2020,0.087531,0.370085,2020-01-01
3,Feb-2020,0.088812,0.366552,2020-02-01
7,Mar-2020,0.07543,0.349152,2020-03-01
0,Apr-2020,0.051916,0.380047,2020-04-01
8,May-2020,0.063667,0.378551,2020-05-01
6,Jun-2020,0.060581,0.386042,2020-06-01
5,Jul-2020,0.076816,0.351162,2020-07-01
1,Aug-2020,0.092482,0.361673,2020-08-01
11,Sep-2020,0.10453,0.381311,2020-09-01
10,Oct-2020,0.090738,0.357288,2020-10-01


In [179]:
#Merge all average scores together 
merged = pd.merge(merged_score3, merged_polarity[['month_year', 'avg_polarity']], on= 'month_year', how='left')
merged1 = pd.merge(merged, merged_polarity[['month_year', 'avg_subjectivity']], on= 'month_year', how='left')

In [181]:
merged1.to_csv('2017_avg_scores.csv', sep = ',', index = False)