In [56]:
import numpy as np
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [57]:
# Load the preprocessed tweets and discard the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier to_csv -- confirm upstream).
tw_df = pd.read_csv('../data/3-preprocessed_tweets.csv').drop(columns='Unnamed: 0')
tw_df.head()

Unnamed: 0,timestamp,text
0,2016-01-01 00:00:04+00:00,Current price 43105 BTCUSD btc bitcoin 2015123...
1,2016-01-01 00:00:05+00:00,Current price 39605 € BTCEUR btc bitcoin 20160...
2,2016-01-01 00:00:05+00:00,Current price 29177 £ BTCGBP btc bitcoin 20160...
3,2016-01-01 00:00:53+00:00,RDD BTC exchange Cryptsy 00000000 4 Bittrex 00...
4,2016-01-01 00:02:32+00:00,RDD BTC exchange Cryptsy 00000000 4 Bittrex 00...


In [58]:
# Count missing values per column before any scoring is done.
tw_df.isna().sum()

timestamp    0
text         0
dtype: int64

In [59]:
# TextBlob subjectivity lies within [0, 1] and reflects personal opinion and
# judgment: 0 is very objective, 1 is very subjective.
def _subjectivity(tweet):
    """Return TextBlob's subjectivity score for a single tweet text."""
    return TextBlob(tweet).sentiment.subjectivity


tw_df['subjectivity'] = tw_df['text'].apply(_subjectivity)
tw_df.head()

Unnamed: 0,timestamp,text,subjectivity
0,2016-01-01 00:00:04+00:00,Current price 43105 BTCUSD btc bitcoin 2015123...,0.4
1,2016-01-01 00:00:05+00:00,Current price 39605 € BTCEUR btc bitcoin 20160...,0.4
2,2016-01-01 00:00:05+00:00,Current price 29177 £ BTCGBP btc bitcoin 20160...,0.4
3,2016-01-01 00:00:53+00:00,RDD BTC exchange Cryptsy 00000000 4 Bittrex 00...,0.4
4,2016-01-01 00:02:32+00:00,RDD BTC exchange Cryptsy 00000000 4 Bittrex 00...,0.4


In [60]:
# Check the dtype and non-null count of every column, including the newly added
# 'subjectivity' score.
tw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702027 entries, 0 to 702026
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   timestamp     702027 non-null  object 
 1   text          702027 non-null  object 
 2   subjectivity  702027 non-null  float64
dtypes: float64(1), object(2)
memory usage: 16.1+ MB


In [61]:
tw_df.tail() 
# Observation: purely informational tweets (e.g. the volume report in row 702022)
# get a subjectivity of 0. However, row 702024, which predicts the next pump and
# is therefore likely subjective, also scores 0 -- TextBlob does not capture the
# subjectivity in that case.

Unnamed: 0,timestamp,text,subjectivity
702022,2017-12-30 23:39:27+00:00,ETHLend LEND volume 1054 1 hour Volume 453505 ...,0.0
702023,2017-12-30 23:47:11+00:00,Gaming PCBitcoin Miner run game 1080p ultra se...,0.4
702024,2017-12-30 23:51:39+00:00,Next BigPump tomorrow 1300 GMT Want benefit Pu...,0.0
702025,2017-12-30 23:56:00+00:00,Bitcoin eliminates bullshit world It start exp...,1.0
702026,2017-12-30 23:57:52+00:00,Gooood Morning Asia 0900 Time buy cryptocurren...,0.0


In [62]:
# VADER sentiment analysis (suitable for English text).
sentiment_analyser = SentimentIntensityAnalyzer()

# Score every tweet exactly once. polarity_scores returns a dict with the keys
# 'neg', 'neu', 'pos' and 'compound'; calling it separately for each output
# column would repeat the same work four times per tweet.
vader_scores = pd.DataFrame(
    tw_df['text'].apply(sentiment_analyser.polarity_scores).tolist(),
    index=tw_df.index,  # keep scores aligned with the tweets they belong to
    columns=['neg', 'neu', 'pos', 'compound'],  # fixed columns, even for an empty frame
)

# Column-wise assignment (rather than a join) keeps the cell safe to re-run.
tw_df['compound'] = vader_scores['compound']
tw_df['pos_pol'] = vader_scores['pos']
tw_df['neg_pol'] = vader_scores['neg']
tw_df['neu_pol'] = vader_scores['neu']
tw_df.head()

Unnamed: 0,timestamp,text,subjectivity,compound,pos_pol,neg_pol,neu_pol
0,2016-01-01 00:00:04+00:00,Current price 43105 BTCUSD btc bitcoin 2015123...,0.4,0.0,0.0,0.0,1.0
1,2016-01-01 00:00:05+00:00,Current price 39605 € BTCEUR btc bitcoin 20160...,0.4,0.0,0.0,0.0,1.0
2,2016-01-01 00:00:05+00:00,Current price 29177 £ BTCGBP btc bitcoin 20160...,0.4,0.0,0.0,0.0,1.0
3,2016-01-01 00:00:53+00:00,RDD BTC exchange Cryptsy 00000000 4 Bittrex 00...,0.4,0.0,0.0,0.0,1.0
4,2016-01-01 00:02:32+00:00,RDD BTC exchange Cryptsy 00000000 4 Bittrex 00...,0.4,0.0,0.0,0.0,1.0


In [63]:
# A tweet whose VADER neutral share is exactly 1 carries no positive or negative
# opinion, so it is excluded from the sentiment aggregates computed below.
# .copy() makes the filtered result an independent frame, so later column
# assignments on tw_df do not raise SettingWithCopyWarning.
tw_df = tw_df.loc[tw_df['neu_pol'] != 1].copy()

In [64]:
# Preview the first rows that remain after the fully neutral tweets were removed.
tw_df.head()

Unnamed: 0,timestamp,text,subjectivity,compound,pos_pol,neg_pol,neu_pol
5,2016-01-01 00:08:47+00:00,In 10 min arb opps spanning 6 exchange pair yi...,0.0,0.4404,0.162,0.0,0.838
6,2016-01-01 00:10:07+00:00,Buying USD 100 Bitcoin 1 year ago THEN USD 313...,0.1,0.4939,0.208,0.0,0.792
7,2016-01-01 00:18:47+00:00,In 10 min arb opps spanning 6 exchange pair yi...,0.0,0.4404,0.162,0.0,0.838
8,2016-01-01 00:28:48+00:00,In 10 min arb opps spanning 5 exchange pair yi...,0.0,0.4404,0.162,0.0,0.838
9,2016-01-01 00:38:48+00:00,In 10 min arb opps spanning 6 exchange pair yi...,0.0,0.4404,0.162,0.0,0.838


In [65]:
# Check how many rows survived the neutral-tweet filter and the dtypes of the
# sentiment columns.
tw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 343973 entries, 5 to 702025
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   timestamp     343973 non-null  object 
 1   text          343973 non-null  object 
 2   subjectivity  343973 non-null  float64
 3   compound      343973 non-null  float64
 4   pos_pol       343973 non-null  float64
 5   neg_pol       343973 non-null  float64
 6   neu_pol       343973 non-null  float64
dtypes: float64(5), object(2)
memory usage: 21.0+ MB


In [66]:
# Parse the timestamp strings into datetimes so the frame can be resampled.
# .assign builds a new frame instead of writing into tw_df in place, which avoids
# SettingWithCopyWarning when tw_df is a filtered slice of a larger frame.
tw_df = tw_df.assign(timestamp=pd.to_datetime(tw_df['timestamp']))

# Daily mean of every sentiment score, written to CSV.
daily_mean = dict.fromkeys(
    ['subjectivity', 'compound', 'pos_pol', 'neg_pol', 'neu_pol'], 'mean'
)
tw_d = tw_df.resample('D', on='timestamp').agg(daily_mean)
tw_d.to_csv('../data/4-tweets-polarity-day.csv')
tw_d.head()

Unnamed: 0_level_0,subjectivity,compound,pos_pol,neg_pol,neu_pol
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01 00:00:00+00:00,0.268975,0.30123,0.152325,0.036704,0.811015
2016-01-02 00:00:00+00:00,0.301871,0.388479,0.173094,0.014299,0.812633
2016-01-03 00:00:00+00:00,0.369974,0.428669,0.18662,0.005385,0.808005
2016-01-04 00:00:00+00:00,0.364042,0.393258,0.179719,0.009154,0.81114
2016-01-05 00:00:00+00:00,0.431407,0.399047,0.185405,0.00822,0.806382


In [67]:
# Hourly mean of every sentiment score, written to CSV.
# Relies on tw_df['timestamp'] already holding datetimes.
hourly_mean = dict.fromkeys(
    ['subjectivity', 'compound', 'pos_pol', 'neg_pol', 'neu_pol'], 'mean'
)
hourly_bins = tw_df.resample('H', on='timestamp')
tw_h = hourly_bins.agg(hourly_mean)
tw_h.to_csv('../data/4-tweets-polarity-hour.csv')
tw_h.head()

Unnamed: 0_level_0,subjectivity,compound,pos_pol,neg_pol,neu_pol
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01 00:00:00+00:00,0.014286,0.448043,0.168571,0.0,0.831429
2016-01-01 01:00:00+00:00,0.166667,0.4404,0.173,0.0,0.827
2016-01-01 02:00:00+00:00,0.0,0.4043,0.164125,0.0,0.835875
2016-01-01 03:00:00+00:00,0.101667,0.3789,0.16175,0.0175,0.82075
2016-01-01 04:00:00+00:00,0.213333,0.359011,0.156667,0.021,0.822333
