In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import random
import datetime
import re
from collections import Counter
# from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#stuff from nltk
import nltk
from nltk.tokenize import word_tokenize #<- For finding tokens (small divisions) from a large sample of text
from nltk.corpus import stopwords #<- For calling the know stopwords in english (e.g, articles, connectors)
from nltk.corpus import wordnet #<- For calling a lexical database in eglish with meanings, synonyms, antonyms, and more 
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentAnalyzer

########## progress bar
from tqdm.notebook import tqdm, trange
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
import time

sid_analyzer = SentimentIntensityAnalyzer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

##### emotions
import text2emotion as te



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon resource before building an analyser from it.
nltk.download('vader_lexicon')

# Stock (unmodified) VADER analyser.
sid = SentimentIntensityAnalyzer()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Crypto / trading slang valences layered on top of the stock VADER lexicon.
# Positive numbers push the compound score up, negative numbers push it down.
#
# NOTE(review): VADER scores one whitespace-separated token at a time, so the
# multi-word keys below ('all time high', 'bear market', ...) are not expected
# to match through the lexicon alone — confirm against the installed NLTK
# version before relying on them.
new_words = {
    'bear': -1.3,
    'bears': -1.5,
    'bearish': -3.4,
    'bulls':1.9,
    'bull':1.8,
    'bullish':2.3,
    'whales':-1.1,
    'support':1,
    'resistance':-0.3,
    'short':-0.8,
    'shorts':-0.8,
    'long':1.3,
    'longs':1.3,
    'bounce':1.1,
    'rekt':-2.2,
    'arbitrage':0.4,
    'manipulation':-2.7,
    'bot':-0.9,
    'strategy':1.5,
    'SEC':-1.0,
    'regulations':-1.2,
    'FUD':-1.9,
    'ICO':-0.4,
    'CNBC':-2.1,
    'Fox':-2.1,
    'cramer':-1.0,
    'hodl':2.357,
    'holder':2.357,
    'bag':1.0,
    'bagholder':-1.0,
    'paperhands':-1.0,
    'diamondhands':1.0,
    'all time high':3.0,
    'ATH':3.0,
    'all time low':-3.0,
    'ATL':-3.0,
    "Bull Market": 2.3,
    'bear market': -2.3,
    "All time high": 2.3,
    "Trading analysis": 1.0,
    "Short squeeze": 0.6,
    "Closing a long": 1.6,
    "Closing a short": 0.5,
    "Opening a long": 1.3,
    "Opening a short": -0.9,
    "flip a coin": 0.6,
    'Jerome Powell': -1.0,
    'jpow':-1.0,
    'bull trap':-1.0,
    'bear trap':1.0,
    'crab':-0.5,
    'crabbing':-0.5,
    'pump':2.0,
    'pumping':2.0,
    'dump':-2.0,
    'dumping':-2.0,
    'pamp':2.0,
    'damp':-2.0,
    'tendies':2.0,
    'print': 2.0,
    'printing':2.0,
    'shitcoin':-3.0,
    'vaporware':-3.0,
}

SIA = SentimentIntensityAnalyzer()

# VADER lower-cases every token before looking it up in the lexicon, so a key
# that contains capitals ('SEC', 'FUD', 'ATH', ...) can never be hit. Normalise
# the keys before merging. When two spellings collapse to the same key
# ('all time high' / 'All time high') the first one listed wins.
normalised_words = {}
for phrase, valence in new_words.items():
    normalised_words.setdefault(phrase.lower(), valence)

SIA.lexicon.update(normalised_words)

In [None]:
# Smoke test: score a single word that was added to the lexicon above.
reply3 = 'bear'
SIA.polarity_scores(reply3)


### We have to start again from the initially cleaned dataset, including the neutral comments, because comments that were neutral under the old lexicon may no longer be neutral under the new one

#### Before running the new lexicon, I expect to find:
1. fewer neutral comments;
2. more polarised comments, because the new phrases will shift both distributions towards their extremes of 1 and -1;
3. more sentiment being captured overall and, I hope, therefore more accurate binning of comments.

In [None]:
# Columns to read from the raw comment export. Only the keys are used: the
# mapping is handed to `usecols`, not `dtype`, so the value types listed here
# are not applied while parsing.
dt_cols = {"author": str, "body": str, "created_utc": str, "score": float}
df = pd.read_csv('df.csv', usecols=list(dt_cols), low_memory=False)
df

In [None]:
def clean_removed_deleted(df):
    """Drop rows that carry no usable comment text.

    '[deleted]' marks a post deleted by its poster and '[removed]' one taken
    down by moderators (automatically or manually); neither gives us anything
    of value for sentiment analysis.

    Input = uncleaned reddit data dataframe with a column called 'body'.

    Output = new dataframe without rows that contain any missing value and
    without rows whose 'body' is exactly '[removed]' or '[deleted]'. The
    original index labels of the surviving rows are kept and the input frame
    is not modified.
    """

    # NA's
    df = df.dropna(how='any')

    # Compare on the string form of each body, done once for the whole column
    # instead of calling a Python lambda per row.
    body_text = df['body'].astype(str)

    # Removing '[removed]' entries which create noise, account for about 8% of data
    print('dropping [removed] posts')
    is_removed = body_text == "[removed]"

    # Removing '[deleted]' entries to remove noise
    print(' dropping [deleted] posts')
    is_deleted = body_text == "[deleted]"

    # A boolean mask filters by position, so it stays correct even if the
    # index happens to contain duplicate labels.
    cleaned = df.loc[~(is_removed | is_deleted)].copy()

    return cleaned

# Clean a deep copy so the frame loaded from df.csv stays available unchanged.
cleaned_comments  = clean_removed_deleted(df.copy(deep=True))
cleaned_comments

In [None]:
# Author names whose comments are excluded from the analysis: known bot /
# moderation accounts plus the '[deleted]' and '[removed]' placeholders.
bot_list = ['AutoModerator', '___alexa___', 'SwapzoneIO', 'ccModBot', 'coinfeeds-bot', 'CryptoMods','[deleted]', '[removed]']

# Kept for reference: listing the most frequent authors.
# users = df["author"].value_counts()
# users.head(50)

def remove_bots(df, bot_list, column_name):
    """Filter out every row whose author appears in a list of bot accounts.

    inputs = dataframe, list of bot account names, and the name (a string) of
    the dataframe column holding the author of each comment

    output = the rows of the dataframe whose author is not in the bot list
    """
    written_by_bot = df[column_name].isin(bot_list)
    humans_only = df[~written_by_bot]
    return humans_only

# Filter on the 'author' column, working from a deep copy of the cleaned comments.
removed_bots = remove_bots(df=cleaned_comments.copy(deep=True), bot_list = bot_list, column_name = 'author')
removed_bots

In [None]:
def clean_hyperlinks(df, column_name):
    """Remove hyperlinks from a text column.

    Anything that starts with 'http://', 'https://' or 'www.' at a word
    boundary is deleted up to the next whitespace character. The link is
    replaced by the empty string, so the whitespace around it is left as it
    was. Matching is case-sensitive.

    Input = Dataframe, name of the text column to remove hyperlinks from

    Output = the same Dataframe object, with the text column overwritten in
    place (pass a copy if the original text must be preserved)
    """
    # The \b anchors and the explicit '://' / '.' stop the pattern from firing
    # in the middle of ordinary words: the unanchored 'www\S+' would turn
    # 'awwww' into 'a', and 'http\S+' would delete the plain word 'https'.
    url_pattern = r'\bhttps?://\S+|\bwww\.\S+'

    df[column_name] = df[column_name].replace(url_pattern, '', regex=True)

    return df

# clean_hyperlinks assigns into the frame it receives, so give it a private copy.
df1 = removed_bots.copy(deep=True)

df2 = clean_hyperlinks(df1, column_name = 'body')
df2

In [None]:
def get_datetime(df):
    """Add a calendar-date column derived from the reddit 'created_utc' stamp.

    Reddit does not provide a datetime format; comments carry a 'created_utc'
    value that is interpreted here as seconds since the Unix epoch.

    Input = Pandas dataframe with a 'created_utc' column whose values can be
    cast to int.

    Output = the same dataframe object, modified in place: 'created_utc' is
    cast to int and a new column called 'date' holds the corresponding
    `datetime.date` for every row.
    """
    df['created_utc'] = df['created_utc'].astype(int)

    print('creating datetime column')
    # One vectorised conversion for the whole column rather than building a
    # Timestamp per row in a Python loop; `.dt.date` drops the time of day.
    df['date'] = pd.to_datetime(df['created_utc'], unit='s').dt.date

    return df

# df1 = removed_bots.copy(deep=True)

# NOTE(review): from this cell on, the name `datetime` refers to this dataframe
# and no longer to the `datetime` module imported in the first cell. A later
# cell reads this variable by name, so rename both together if it is changed.
# get_datetime writes into df2, so `datetime` and `df2` are the same object.
datetime = get_datetime(df2)
datetime#.info()

In [None]:
def get_sentiment(text:str, analyser, desired_type:str='pos'):
    """Return one component of the sentiment scores for a piece of text.

    text: the text to score.
    analyser: object exposing `polarity_scores(text)` that returns a mapping
        of score names to values (e.g. the analyser built with the crypto
        lexicon).
    desired_type: key to read from that mapping, such as 'pos', 'neg', 'neu'
        or 'compound'.

    Raises KeyError if `desired_type` is not one of the returned keys.
    """
    # Score with the analyser that was passed in; the argument used to be
    # ignored in favour of the module-level `SIA`.
    sentiment_score = analyser.polarity_scores(text)
    return sentiment_score[desired_type]

def get_sentiment_scores(df,data_column):
    """Score every entry of a text column and store the compound sentiment.

    df: dataframe holding the text; it is modified in place and returned.
    data_column: name of the column whose values (cast to str) are scored.

    Adds the column 'Compound Sentiment Score', computed with the
    module-level `SIA` analyser. A progress bar is shown while scoring.
    """
    print('Getting compound sentiment')

    def _compound(comment):
        return get_sentiment(comment, SIA, 'compound')

    comment_text = df[data_column].astype(str)
    df['Compound Sentiment Score'] = comment_text.progress_apply(_compound)

    # The 'pos', 'neg' and 'neu' components were previously computed the same
    # way (with `sid_analyzer`) into 'Positive/Negative/Neutral Sentiment
    # Score' columns; only the compound score is kept.
    return df

# df_small

# `datetime` here is the dataframe produced by get_datetime, not the module.
# NOTE: this rebinds `df`, which until now held the raw frame read from df.csv.
df = get_sentiment_scores(datetime, 'body')
df

In [None]:
# Snapshot of the scored comments (crypto lexicon) under a descriptive name.
sentiment_df_crypto = df.copy(deep=True)
sentiment_df_crypto

In [None]:
# Comments whose compound score is exactly 0.0 carry no sentiment signal;
# collect their index labels and drop them.
is_neutral = sentiment_df_crypto['Compound Sentiment Score'].eq(0.0)
neutral = sentiment_df_crypto.loc[is_neutral].index
df_clean = sentiment_df_crypto.drop(index=neutral)
df_clean

In [None]:
# df_clean.to_csv('sentiment_df_crypto.csv', header=True, index=False, columns=list(sentiment_df_crypto.axes[1]))

# Crypto lexicon with 0.25 threshold

In [2]:
# Reload the scored, non-neutral comments from disk instead of recomputing them.
# The export that writes this file is commented out above, so the CSV must
# already exist next to the notebook.
df_clean = pd.read_csv('sentiment_df_crypto.csv', parse_dates=['date'])
df_clean

Unnamed: 0,author,body,created_utc,score,date,Compound Sentiment Score
0,TechnoMagik,I'm not sure how you eliminate spread.. If I a...,1368332818,1.0,2013-05-12,0.7946
1,mytwobitcents,fixed thanks,1368321753,2.0,2013-05-12,0.4404
2,sex_with_a_goat,"The Spanish one is wrong, we don't use 'y' wit...",1368318717,2.0,2013-05-12,-0.4767
3,davidpbrown,"Yes, Russian Trolls are the most obvious answer.",1368298185,2.0,2013-05-11,0.4019
4,bigglejones,"A ""consultant"" asking advice on how to be a co...",1368257850,2.0,2013-05-11,0.2883
...,...,...,...,...,...,...
4596340,damnusernamegotcutof,Have mercy oh crypto gods for I can only get s...,1612828904,2.0,2021-02-09,0.3612
4596341,larrydavid4eyedfuck,"yes thats correct, I own the ethereum wallet t...",1612828903,1.0,2021-02-09,0.7782
4596342,ChocolateMorsels,"This is insanity. Thank you Elon, very cool!",1612828897,3.0,2021-02-09,0.1742
4596343,Jooylo,"$1,000,000 would make the market cap of BTC at...",1612828896,2.0,2021-02-09,0.3384


In [3]:
# Check column dtypes after the reload ('date' should be parsed as datetime64).
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4596345 entries, 0 to 4596344
Data columns (total 6 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   author                    object        
 1   body                      object        
 2   created_utc               int64         
 3   score                     float64       
 4   date                      datetime64[ns]
 5   Compound Sentiment Score  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 210.4+ MB


In [4]:
# Split the comments into a clearly positive and a clearly negative set using a
# +/-0.25 cut-off on the compound score.
df_positive = df_clean.copy()
df_negative = df_clean.copy()

# Positive side: discard everything scoring at or below +0.25.
not_positive_enough = df_positive['Compound Sentiment Score'].le(0.25)
df_positive_drop = df_positive.loc[not_positive_enough].index
df_clean_pos = df_positive.drop(index=df_positive_drop)

# Negative side: discard everything scoring at or above -0.25.
not_negative_enough = df_negative['Compound Sentiment Score'].ge(-0.25)
df_negative_drop = df_negative.loc[not_negative_enough].index
df_clean_neg = df_negative.drop(index=df_negative_drop)

# Negatives first, then positives. The variable keeps its historical "_05"
# suffix even though the cut-off used in this cell is 0.25.
nonneutral_05 = pd.concat([df_clean_neg, df_clean_pos])
nonneutral_05

Unnamed: 0,author,body,created_utc,score,date,Compound Sentiment Score
2,sex_with_a_goat,"The Spanish one is wrong, we don't use 'y' wit...",1368318717,2.0,2013-05-12,-0.4767
5,davidpbrown,20% not in cold storage = ?a sizeable amount o...,1368255514,1.0,2013-05-11,-0.2732
7,sgodsdogs,it seems there's been a lot of talking heads i...,1368249064,5.0,2013-05-11,-0.5574
20,hyh123,I see. But copycats of the world don't seem to...,1368117303,1.0,2013-05-09,-0.8221
23,AltClubGirls,Sorry that setting was changed by mistake. It ...,1368080817,1.0,2013-05-09,-0.4019
...,...,...,...,...,...,...
4596337,ReformedPony,what coin/coins would you guys put 500-1k into...,1612828916,3.0,2021-02-09,0.4404
4596338,rbmichael,How long until Amazon/Bezos jump in on it?,1612828915,1.0,2021-02-09,0.3182
4596340,damnusernamegotcutof,Have mercy oh crypto gods for I can only get s...,1612828904,2.0,2021-02-09,0.3612
4596341,larrydavid4eyedfuck,"yes thats correct, I own the ethereum wallet t...",1612828903,1.0,2021-02-09,0.7782


In [5]:
# nonneutral_05.to_csv('sentiment_df_crypto_25.csv', header=True, index=False, columns=list(nonneutral_05.axes[1]))

In [6]:
# Daily median compound score for each polarity. Calendar days without any
# comment come out as NaN and are filled by linear interpolation.
test_phrase_median_25_pos = (
    df_clean_pos
    .groupby([pd.Grouper(key='date', freq='D')])['Compound Sentiment Score']
    .median()
    .interpolate(method='linear')
)
test_phrase_median_25_neg = (
    df_clean_neg
    .groupby([pd.Grouper(key='date', freq='D')])['Compound Sentiment Score']
    .median()
    .interpolate(method='linear')
)

test_phrase_median_25_neg, test_phrase_median_25_pos

(date
 2013-03-12   -0.340000
 2013-03-13   -0.685200
 2013-03-14   -0.469850
 2013-03-15   -0.361200
 2013-03-16   -0.347367
                 ...   
 2021-02-10   -0.493900
 2021-02-11   -0.476700
 2021-02-12   -0.476700
 2021-02-13   -0.493900
 2021-02-14   -0.493900
 Freq: D, Name: Compound Sentiment Score, Length: 2897, dtype: float64,
 date
 2013-03-11    0.5916
 2013-03-12    0.6249
 2013-03-13    0.5178
 2013-03-14    0.4257
 2013-03-15    0.5859
                ...  
 2021-02-10    0.5859
 2021-02-11    0.5859
 2021-02-12    0.5859
 2021-02-13    0.5719
 2021-02-14    0.5626
 Freq: D, Name: Compound Sentiment Score, Length: 2898, dtype: float64)

In [7]:
# Trim both daily series so they begin on the first day of the Bitcoin price
# table (2013-04-29). Slicing by date label replaces the positional offsets
# 48 / 49, which were only right because the two series happen to start on
# 2013-03-12 and 2013-03-11 respectively.
first_price_day = '2013-04-29'
test_phrase_median_25_app = test_phrase_median_25_neg.loc[first_price_day:]
test_phrase_median_25_appp = test_phrase_median_25_pos.loc[first_price_day:]
test_phrase_median_25_appp,test_phrase_median_25_app

(date
 2013-04-29    0.4588
 2013-04-30    0.8608
 2013-05-01    0.6345
 2013-05-02    0.7384
 2013-05-03    0.6936
                ...  
 2021-02-10    0.5859
 2021-02-11    0.5859
 2021-02-12    0.5859
 2021-02-13    0.5719
 2021-02-14    0.5626
 Freq: D, Name: Compound Sentiment Score, Length: 2849, dtype: float64,
 date
 2013-04-29   -0.5923
 2013-04-30   -0.5286
 2013-05-01   -0.5126
 2013-05-02   -0.4674
 2013-05-03   -0.4222
                ...  
 2021-02-10   -0.4939
 2021-02-11   -0.4767
 2021-02-12   -0.4767
 2021-02-13   -0.4939
 2021-02-14   -0.4939
 Freq: D, Name: Compound Sentiment Score, Length: 2849, dtype: float64)

In [8]:
# Daily Bitcoin prices; keep the first 2849 rows so the table has the same
# length as the trimmed sentiment series.
sep_sentiment = pd.read_csv('coin_Bitcoin.csv')
sep_sentiment_full = sep_sentiment.iloc[:2849]


In [9]:
# Independent copy of the price rows to attach sentiment columns to.
# NOTE: this rebinds `df` to the price table.
df = sep_sentiment_full.copy(deep=True)

In [10]:
# Attach the sentiment series by position (raw arrays, index ignored).
# NOTE(review): this relies on both series having one value per price row, in
# the same date order as the price table — confirm before reuse.
df['negative sentiment'] = test_phrase_median_25_app.to_numpy()
df['positive sentiment'] = test_phrase_median_25_appp.to_numpy()

In [11]:
# df = pd.DataFrame((sep_sentiment_full, test_phrase_median_25_app))

In [12]:
# Inspect the price table with the two sentiment columns attached.
df

Unnamed: 0,SNo,Name,Symbol,Date,High,Low,Open,Close,Volume,Marketcap,negative sentiment,positive sentiment
0,1,Bitcoin,BTC,2013-04-29 23:59:59,147.488007,134.000000,134.444000,144.539993,0.000000e+00,1.603769e+09,-0.5923,0.4588
1,2,Bitcoin,BTC,2013-04-30 23:59:59,146.929993,134.050003,144.000000,139.000000,0.000000e+00,1.542813e+09,-0.5286,0.8608
2,3,Bitcoin,BTC,2013-05-01 23:59:59,139.889999,107.720001,139.000000,116.989998,0.000000e+00,1.298955e+09,-0.5126,0.6345
3,4,Bitcoin,BTC,2013-05-02 23:59:59,125.599998,92.281898,116.379997,105.209999,0.000000e+00,1.168517e+09,-0.4674,0.7384
4,5,Bitcoin,BTC,2013-05-03 23:59:59,108.127998,79.099998,106.250000,97.750000,0.000000e+00,1.085995e+09,-0.4222,0.6936
...,...,...,...,...,...,...,...,...,...,...,...,...
2844,2845,Bitcoin,BTC,2021-02-10 23:59:59,47145.568199,43881.152599,46469.761202,44918.184492,8.730109e+10,8.366169e+11,-0.4939,0.5859
2845,2846,Bitcoin,BTC,2021-02-11 23:59:59,48463.467125,44187.762351,44898.711611,47909.331195,8.138891e+10,8.923649e+11,-0.4767,0.5859
2846,2847,Bitcoin,BTC,2021-02-12 23:59:59,48745.733800,46424.977818,47877.034375,47504.851183,7.655504e+10,8.848741e+11,-0.4767,0.5859
2847,2848,Bitcoin,BTC,2021-02-13 23:59:59,48047.744591,46392.282333,47491.202555,47105.517473,7.025046e+10,8.774789e+11,-0.4939,0.5719


In [13]:
# Persist prices + sentiment (crypto lexicon, 0.25 threshold): every column,
# with a header row and without the index.
df.to_csv('sentiment_seperate_25thresh_crypto.csv', index=False)

# 0.05 threshold with crypto lexicon

In [15]:
# Load the scored comments used for the 0.05-threshold run (crypto lexicon).
# NOTE(review): nothing visible in this notebook writes this file — confirm how
# it was produced.
# NOTE: `df2` was previously the hyperlink-cleaned comment frame; it is rebound here.
df2 = pd.read_csv('sentiment_df_crypto_05.csv', parse_dates=['date'])

df2

Unnamed: 0,author,body,created_utc,score,date,Compound Sentiment Score
0,sex_with_a_goat,"The Spanish one is wrong, we don't use 'y' wit...",1368318717,2.0,2013-05-12,-0.4767
1,davidpbrown,20% not in cold storage = ?a sizeable amount o...,1368255514,1.0,2013-05-11,-0.2732
2,sgodsdogs,it seems there's been a lot of talking heads i...,1368249064,5.0,2013-05-11,-0.5574
3,hyh123,I see. But copycats of the world don't seem to...,1368117303,1.0,2013-05-09,-0.8221
4,AltClubGirls,Sorry that setting was changed by mistake. It ...,1368080817,1.0,2013-05-09,-0.4019
...,...,...,...,...,...,...
4506949,Cold_Goose_4242,I want 75k end of week,1612828909,2.0,2021-02-09,0.0772
4506950,damnusernamegotcutof,Have mercy oh crypto gods for I can only get s...,1612828904,2.0,2021-02-09,0.3612
4506951,larrydavid4eyedfuck,"yes thats correct, I own the ethereum wallet t...",1612828903,1.0,2021-02-09,0.7782
4506952,ChocolateMorsels,"This is insanity. Thank you Elon, very cool!",1612828897,3.0,2021-02-09,0.1742


In [16]:
# Split the comments into a positive and a negative set using a +/-0.05
# cut-off on the compound score.
df_positive = df2.copy()
df_negative = df2.copy()

# Positive side: discard everything scoring at or below +0.05.
not_positive_enough = df_positive['Compound Sentiment Score'].le(0.05)
df_positive_drop = df_positive.loc[not_positive_enough].index
df_clean_pos = df_positive.drop(index=df_positive_drop)

# Negative side: discard everything scoring at or above -0.05.
not_negative_enough = df_negative['Compound Sentiment Score'].ge(-0.05)
df_negative_drop = df_negative.loc[not_negative_enough].index
df_clean_neg = df_negative.drop(index=df_negative_drop)

# Negatives first, then positives.
nonneutral_05 = pd.concat([df_clean_neg, df_clean_pos])
nonneutral_05

Unnamed: 0,author,body,created_utc,score,date,Compound Sentiment Score
0,sex_with_a_goat,"The Spanish one is wrong, we don't use 'y' wit...",1368318717,2.0,2013-05-12,-0.4767
1,davidpbrown,20% not in cold storage = ?a sizeable amount o...,1368255514,1.0,2013-05-11,-0.2732
2,sgodsdogs,it seems there's been a lot of talking heads i...,1368249064,5.0,2013-05-11,-0.5574
3,hyh123,I see. But copycats of the world don't seem to...,1368117303,1.0,2013-05-09,-0.8221
4,AltClubGirls,Sorry that setting was changed by mistake. It ...,1368080817,1.0,2013-05-09,-0.4019
...,...,...,...,...,...,...
4506949,Cold_Goose_4242,I want 75k end of week,1612828909,2.0,2021-02-09,0.0772
4506950,damnusernamegotcutof,Have mercy oh crypto gods for I can only get s...,1612828904,2.0,2021-02-09,0.3612
4506951,larrydavid4eyedfuck,"yes thats correct, I own the ethereum wallet t...",1612828903,1.0,2021-02-09,0.7782
4506952,ChocolateMorsels,"This is insanity. Thank you Elon, very cool!",1612828897,3.0,2021-02-09,0.1742


In [17]:
# Daily median compound score for each polarity. Calendar days without any
# comment come out as NaN and are filled by linear interpolation.
# The "_25" in the variable names is historical; the inputs in this section
# were split at 0.05.
test_phrase_median_25_pos = (
    df_clean_pos
    .groupby([pd.Grouper(key='date', freq='D')])['Compound Sentiment Score']
    .median()
    .interpolate(method='linear')
)
test_phrase_median_25_neg = (
    df_clean_neg
    .groupby([pd.Grouper(key='date', freq='D')])['Compound Sentiment Score']
    .median()
    .interpolate(method='linear')
)


In [18]:
# Trim both daily series so they begin on the first day of the Bitcoin price
# table (2013-04-29). Slicing by date label replaces the positional offsets
# 48 / 49, which silently misalign the series if either one starts on a
# different day than it did for the first run.
first_price_day = '2013-04-29'
test_phrase_median_25_app = test_phrase_median_25_neg.loc[first_price_day:]
test_phrase_median_25_appp = test_phrase_median_25_pos.loc[first_price_day:]
test_phrase_median_25_appp

date
2013-04-29    0.40510
2013-04-30    0.86080
2013-05-01    0.54675
2013-05-02    0.69350
2013-05-03    0.69360
               ...   
2021-02-10    0.52830
2021-02-11    0.53670
2021-02-12    0.52670
2021-02-13    0.51060
2021-02-14    0.50930
Freq: D, Name: Compound Sentiment Score, Length: 2849, dtype: float64

In [19]:
# Daily Bitcoin prices; keep the first 2849 rows so the table has the same
# length as the trimmed sentiment series, then take an independent copy.
# NOTE: this rebinds `df` to the price table.
sep_sentiment = pd.read_csv('coin_Bitcoin.csv')
sep_sentiment_full = sep_sentiment.iloc[:2849]
df = sep_sentiment_full.copy()

In [20]:
# Attach the sentiment series by position (raw arrays, index ignored).
# NOTE(review): this relies on both series having one value per price row, in
# the same date order as the price table — confirm before reuse.
df['negative sentiment'] = test_phrase_median_25_app.to_numpy()
df['positive sentiment'] = test_phrase_median_25_appp.to_numpy()

In [21]:
# Inspect the price table with the two sentiment columns attached.
df

Unnamed: 0,SNo,Name,Symbol,Date,High,Low,Open,Close,Volume,Marketcap,negative sentiment,positive sentiment
0,1,Bitcoin,BTC,2013-04-29 23:59:59,147.488007,134.000000,134.444000,144.539993,0.000000e+00,1.603769e+09,-0.2477,0.40510
1,2,Bitcoin,BTC,2013-04-30 23:59:59,146.929993,134.050003,144.000000,139.000000,0.000000e+00,1.542813e+09,-0.3423,0.86080
2,3,Bitcoin,BTC,2013-05-01 23:59:59,139.889999,107.720001,139.000000,116.989998,0.000000e+00,1.298955e+09,-0.3754,0.54675
3,4,Bitcoin,BTC,2013-05-02 23:59:59,125.599998,92.281898,116.379997,105.209999,0.000000e+00,1.168517e+09,-0.3988,0.69350
4,5,Bitcoin,BTC,2013-05-03 23:59:59,108.127998,79.099998,106.250000,97.750000,0.000000e+00,1.085995e+09,-0.4222,0.69360
...,...,...,...,...,...,...,...,...,...,...,...,...
2844,2845,Bitcoin,BTC,2021-02-10 23:59:59,47145.568199,43881.152599,46469.761202,44918.184492,8.730109e+10,8.366169e+11,-0.4019,0.52830
2845,2846,Bitcoin,BTC,2021-02-11 23:59:59,48463.467125,44187.762351,44898.711611,47909.331195,8.138891e+10,8.923649e+11,-0.4019,0.53670
2846,2847,Bitcoin,BTC,2021-02-12 23:59:59,48745.733800,46424.977818,47877.034375,47504.851183,7.655504e+10,8.848741e+11,-0.4019,0.52670
2847,2848,Bitcoin,BTC,2021-02-13 23:59:59,48047.744591,46392.282333,47491.202555,47105.517473,7.025046e+10,8.774789e+11,-0.4019,0.51060


In [22]:
# Persist prices + sentiment (crypto lexicon, 0.05 threshold): every column,
# with a header row and without the index.
df.to_csv('sentiment_seperate_05thresh_crypto.csv', index=False)

# normal vader with 0.25 threshold

In [23]:
# Load the scored comments for the standard-VADER runs.
# NOTE(review): nothing visible in this notebook writes this file — presumably
# it was produced by an earlier run with the unmodified lexicon; confirm.
df3 = pd.read_csv('sentiment_df.csv', parse_dates=['date'])
df3

Unnamed: 0,author,body,created_utc,score,date,Compound Sentiment Score
0,TechnoMagik,I'm not sure how you eliminate spread.. If I a...,1368332818,1.0,2013-05-12,0.9014
1,mytwobitcents,fixed thanks,1368321753,2.0,2013-05-12,0.4404
2,sex_with_a_goat,"The Spanish one is wrong, we don't use 'y' wit...",1368318717,2.0,2013-05-12,-0.4767
3,davidpbrown,"Yes, Russian Trolls are the most obvious answer.",1368298185,2.0,2013-05-11,0.4019
4,bigglejones,"A ""consultant"" asking advice on how to be a co...",1368257850,2.0,2013-05-11,0.0813
...,...,...,...,...,...,...
4553702,Cold_Goose_4242,I want 75k end of week,1612828909,2.0,2021-02-09,0.0772
4553703,damnusernamegotcutof,Have mercy oh crypto gods for I can only get s...,1612828904,2.0,2021-02-09,0.3612
4553704,larrydavid4eyedfuck,"yes thats correct, I own the ethereum wallet t...",1612828903,1.0,2021-02-09,0.7608
4553705,ChocolateMorsels,"This is insanity. Thank you Elon, very cool!",1612828897,3.0,2021-02-09,0.1742


In [24]:
# Split the comments into a clearly positive and a clearly negative set using a
# +/-0.25 cut-off on the compound score.
df_positive = df3.copy()
df_negative = df3.copy()

# Positive side: discard everything scoring at or below +0.25.
not_positive_enough = df_positive['Compound Sentiment Score'].le(0.25)
df_positive_drop = df_positive.loc[not_positive_enough].index
df_clean_pos = df_positive.drop(index=df_positive_drop)

# Negative side: discard everything scoring at or above -0.25.
not_negative_enough = df_negative['Compound Sentiment Score'].ge(-0.25)
df_negative_drop = df_negative.loc[not_negative_enough].index
df_clean_neg = df_negative.drop(index=df_negative_drop)

# Negatives first, then positives. The variable keeps its historical "_05"
# suffix even though the cut-off used in this cell is 0.25.
nonneutral_05 = pd.concat([df_clean_neg, df_clean_pos])
nonneutral_05

Unnamed: 0,author,body,created_utc,score,date,Compound Sentiment Score
2,sex_with_a_goat,"The Spanish one is wrong, we don't use 'y' wit...",1368318717,2.0,2013-05-12,-0.4767
5,davidpbrown,Websites get hacked.. news at 11.\n\nDon't kee...,1368255591,3.0,2013-05-11,-0.4019
8,sgodsdogs,it seems there's been a lot of talking heads i...,1368249064,5.0,2013-05-11,-0.5574
23,hyh123,I see. But copycats of the world don't seem to...,1368117303,1.0,2013-05-09,-0.8221
26,AltClubGirls,Sorry that setting was changed by mistake. It ...,1368080817,1.0,2013-05-09,-0.4019
...,...,...,...,...,...,...
4553700,Denaneha,Officially £1 Trillion market cap . CONGRATS C...,1612828918,5.0,2021-02-09,0.8769
4553701,ReformedPony,what coin/coins would you guys put 500-1k into...,1612828916,3.0,2021-02-09,0.4404
4553703,damnusernamegotcutof,Have mercy oh crypto gods for I can only get s...,1612828904,2.0,2021-02-09,0.3612
4553704,larrydavid4eyedfuck,"yes thats correct, I own the ethereum wallet t...",1612828903,1.0,2021-02-09,0.7608


In [25]:
# Daily median compound score for each polarity. Calendar days without any
# comment come out as NaN and are filled by linear interpolation.
test_phrase_median_25_pos = (
    df_clean_pos
    .groupby([pd.Grouper(key='date', freq='D')])['Compound Sentiment Score']
    .median()
    .interpolate(method='linear')
)
test_phrase_median_25_neg = (
    df_clean_neg
    .groupby([pd.Grouper(key='date', freq='D')])['Compound Sentiment Score']
    .median()
    .interpolate(method='linear')
)


In [26]:
# Trim both daily series so they begin on the first day of the Bitcoin price
# table (2013-04-29). Slicing by date label replaces the positional offsets
# 48 / 49, which silently misalign the series if either one starts on a
# different day than it did for the first run.
first_price_day = '2013-04-29'
test_phrase_median_25_app = test_phrase_median_25_neg.loc[first_price_day:]
test_phrase_median_25_appp = test_phrase_median_25_pos.loc[first_price_day:]
test_phrase_median_25_appp

date
2013-04-29    0.45880
2013-04-30    0.88500
2013-05-01    0.54675
2013-05-02    0.73840
2013-05-03    0.73040
               ...   
2021-02-10    0.58590
2021-02-11    0.58590
2021-02-12    0.58590
2021-02-13    0.57190
2021-02-14    0.55740
Freq: D, Name: Compound Sentiment Score, Length: 2849, dtype: float64

In [27]:
# Daily Bitcoin prices; keep the first 2849 rows so the table has the same
# length as the trimmed sentiment series, then take an independent copy.
# NOTE: this rebinds `df` to the price table.
sep_sentiment = pd.read_csv('coin_Bitcoin.csv')
sep_sentiment_full = sep_sentiment.iloc[:2849]
df = sep_sentiment_full.copy()

In [28]:
# Attach the sentiment series by position (raw arrays, index ignored).
# NOTE(review): this relies on both series having one value per price row, in
# the same date order as the price table — confirm before reuse.
df['negative sentiment'] = test_phrase_median_25_app.to_numpy()
df['positive sentiment'] = test_phrase_median_25_appp.to_numpy()

In [29]:
# Inspect the price table with the two sentiment columns attached.
df

Unnamed: 0,SNo,Name,Symbol,Date,High,Low,Open,Close,Volume,Marketcap,negative sentiment,positive sentiment
0,1,Bitcoin,BTC,2013-04-29 23:59:59,147.488007,134.000000,134.444000,144.539993,0.000000e+00,1.603769e+09,-0.7165,0.45880
1,2,Bitcoin,BTC,2013-04-30 23:59:59,146.929993,134.050003,144.000000,139.000000,0.000000e+00,1.542813e+09,-0.7149,0.88500
2,3,Bitcoin,BTC,2013-05-01 23:59:59,139.889999,107.720001,139.000000,116.989998,0.000000e+00,1.298955e+09,-0.5242,0.54675
3,4,Bitcoin,BTC,2013-05-02 23:59:59,125.599998,92.281898,116.379997,105.209999,0.000000e+00,1.168517e+09,-0.4732,0.73840
4,5,Bitcoin,BTC,2013-05-03 23:59:59,108.127998,79.099998,106.250000,97.750000,0.000000e+00,1.085995e+09,-0.4222,0.73040
...,...,...,...,...,...,...,...,...,...,...,...,...
2844,2845,Bitcoin,BTC,2021-02-10 23:59:59,47145.568199,43881.152599,46469.761202,44918.184492,8.730109e+10,8.366169e+11,-0.4919,0.58590
2845,2846,Bitcoin,BTC,2021-02-11 23:59:59,48463.467125,44187.762351,44898.711611,47909.331195,8.138891e+10,8.923649e+11,-0.4767,0.58590
2846,2847,Bitcoin,BTC,2021-02-12 23:59:59,48745.733800,46424.977818,47877.034375,47504.851183,7.655504e+10,8.848741e+11,-0.4767,0.58590
2847,2848,Bitcoin,BTC,2021-02-13 23:59:59,48047.744591,46392.282333,47491.202555,47105.517473,7.025046e+10,8.774789e+11,-0.4939,0.57190


In [30]:
# Persist prices + sentiment (standard VADER, 0.25 threshold): every column,
# with a header row and without the index.
df.to_csv('sentiment_seperate_25thresh_vader.csv', index=False)

# normal vader with 0.05 threshold

In [31]:
# Load the scored comments for the standard-VADER, 0.05-threshold run.
# This reads the same file ('sentiment_df.csv') that `df3` was loaded from.
df4 = pd.read_csv('sentiment_df.csv', parse_dates=['date'])
df4

Unnamed: 0,author,body,created_utc,score,date,Compound Sentiment Score
0,TechnoMagik,I'm not sure how you eliminate spread.. If I a...,1368332818,1.0,2013-05-12,0.9014
1,mytwobitcents,fixed thanks,1368321753,2.0,2013-05-12,0.4404
2,sex_with_a_goat,"The Spanish one is wrong, we don't use 'y' wit...",1368318717,2.0,2013-05-12,-0.4767
3,davidpbrown,"Yes, Russian Trolls are the most obvious answer.",1368298185,2.0,2013-05-11,0.4019
4,bigglejones,"A ""consultant"" asking advice on how to be a co...",1368257850,2.0,2013-05-11,0.0813
...,...,...,...,...,...,...
4553702,Cold_Goose_4242,I want 75k end of week,1612828909,2.0,2021-02-09,0.0772
4553703,damnusernamegotcutof,Have mercy oh crypto gods for I can only get s...,1612828904,2.0,2021-02-09,0.3612
4553704,larrydavid4eyedfuck,"yes thats correct, I own the ethereum wallet t...",1612828903,1.0,2021-02-09,0.7608
4553705,ChocolateMorsels,"This is insanity. Thank you Elon, very cool!",1612828897,3.0,2021-02-09,0.1742


In [32]:
# Split the comments into a positive and a negative set using a +/-0.05
# cut-off on the compound score.
# Start from `df4`, the frame loaded for this section. The cell used to read
# `df3` from the previous section, which only worked while that variable was
# still alive in the kernel.
df_positive = df4.copy(deep=True)
df_negative = df4.copy(deep=True)

# Positive side: discard everything scoring at or below +0.05.
not_positive_enough = df_positive['Compound Sentiment Score'] <= 0.05
df_positive_drop = df_positive[not_positive_enough].index
df_clean_pos = df_positive.drop(index=df_positive_drop)

# Negative side: discard everything scoring at or above -0.05.
not_negative_enough = df_negative['Compound Sentiment Score'] >= -0.05
df_negative_drop = df_negative[not_negative_enough].index
df_clean_neg = df_negative.drop(index=df_negative_drop)

# Negatives first, then positives.
nonneutral_05 = pd.concat((df_clean_neg, df_clean_pos))
nonneutral_05

Unnamed: 0,author,body,created_utc,score,date,Compound Sentiment Score
2,sex_with_a_goat,"The Spanish one is wrong, we don't use 'y' wit...",1368318717,2.0,2013-05-12,-0.4767
5,davidpbrown,Websites get hacked.. news at 11.\n\nDon't kee...,1368255591,3.0,2013-05-11,-0.4019
6,davidpbrown,20% not in cold storage = ?a sizeable amount o...,1368255514,1.0,2013-05-11,-0.1779
8,sgodsdogs,it seems there's been a lot of talking heads i...,1368249064,5.0,2013-05-11,-0.5574
11,Tencoin1,"I don't understand, what chance?. The end of t...",1368235203,1.0,2013-05-11,-0.1877
...,...,...,...,...,...,...
4553702,Cold_Goose_4242,I want 75k end of week,1612828909,2.0,2021-02-09,0.0772
4553703,damnusernamegotcutof,Have mercy oh crypto gods for I can only get s...,1612828904,2.0,2021-02-09,0.3612
4553704,larrydavid4eyedfuck,"yes thats correct, I own the ethereum wallet t...",1612828903,1.0,2021-02-09,0.7608
4553705,ChocolateMorsels,"This is insanity. Thank you Elon, very cool!",1612828897,3.0,2021-02-09,0.1742


In [33]:
# Daily median compound score for each polarity. Calendar days without any
# comment come out as NaN and are filled by linear interpolation.
# The "_25" in the variable names is historical; the inputs in this section
# were split at 0.05.
test_phrase_median_25_pos = (
    df_clean_pos
    .groupby([pd.Grouper(key='date', freq='D')])['Compound Sentiment Score']
    .median()
    .interpolate(method='linear')
)
test_phrase_median_25_neg = (
    df_clean_neg
    .groupby([pd.Grouper(key='date', freq='D')])['Compound Sentiment Score']
    .median()
    .interpolate(method='linear')
)


In [34]:
# Trim both daily series so they begin on the first day of the Bitcoin price
# table (2013-04-29). Slicing by date label replaces the positional offsets
# 48 / 49, which silently misalign the series if either one starts on a
# different day than it did for the first run.
first_price_day = '2013-04-29'
test_phrase_median_25_app = test_phrase_median_25_neg.loc[first_price_day:]
test_phrase_median_25_appp = test_phrase_median_25_pos.loc[first_price_day:]
test_phrase_median_25_appp

date
2013-04-29    0.45880
2013-04-30    0.88500
2013-05-01    0.52180
2013-05-02    0.69350
2013-05-03    0.73040
               ...   
2021-02-10    0.52670
2021-02-11    0.52670
2021-02-12    0.51825
2021-02-13    0.50400
2021-02-14    0.50230
Freq: D, Name: Compound Sentiment Score, Length: 2849, dtype: float64

In [35]:
# Daily Bitcoin prices; keep the first 2849 rows so the table has the same
# length as the trimmed sentiment series, then take an independent copy.
# NOTE: this rebinds `df` to the price table.
sep_sentiment = pd.read_csv('coin_Bitcoin.csv')
sep_sentiment_full = sep_sentiment.iloc[:2849]
df = sep_sentiment_full.copy()

In [36]:
# Attach the sentiment series by position (raw arrays, index ignored).
# NOTE(review): this relies on both series having one value per price row, in
# the same date order as the price table — confirm before reuse.
df['negative sentiment'] = test_phrase_median_25_app.to_numpy()
df['positive sentiment'] = test_phrase_median_25_appp.to_numpy()
df

Unnamed: 0,SNo,Name,Symbol,Date,High,Low,Open,Close,Volume,Marketcap,negative sentiment,positive sentiment
0,1,Bitcoin,BTC,2013-04-29 23:59:59,147.488007,134.000000,134.444000,144.539993,0.000000e+00,1.603769e+09,-0.14055,0.45880
1,2,Bitcoin,BTC,2013-04-30 23:59:59,146.929993,134.050003,144.000000,139.000000,0.000000e+00,1.542813e+09,-0.12800,0.88500
2,3,Bitcoin,BTC,2013-05-01 23:59:59,139.889999,107.720001,139.000000,116.989998,0.000000e+00,1.298955e+09,-0.38120,0.52180
3,4,Bitcoin,BTC,2013-05-02 23:59:59,125.599998,92.281898,116.379997,105.209999,0.000000e+00,1.168517e+09,-0.40170,0.69350
4,5,Bitcoin,BTC,2013-05-03 23:59:59,108.127998,79.099998,106.250000,97.750000,0.000000e+00,1.085995e+09,-0.42220,0.73040
...,...,...,...,...,...,...,...,...,...,...,...,...
2844,2845,Bitcoin,BTC,2021-02-10 23:59:59,47145.568199,43881.152599,46469.761202,44918.184492,8.730109e+10,8.366169e+11,-0.40190,0.52670
2845,2846,Bitcoin,BTC,2021-02-11 23:59:59,48463.467125,44187.762351,44898.711611,47909.331195,8.138891e+10,8.923649e+11,-0.40190,0.52670
2846,2847,Bitcoin,BTC,2021-02-12 23:59:59,48745.733800,46424.977818,47877.034375,47504.851183,7.655504e+10,8.848741e+11,-0.40190,0.51825
2847,2848,Bitcoin,BTC,2021-02-13 23:59:59,48047.744591,46392.282333,47491.202555,47105.517473,7.025046e+10,8.774789e+11,-0.40190,0.50400


In [37]:
# Persist prices + sentiment (standard VADER, 0.05 threshold): every column,
# with a header row and without the index.
df.to_csv('sentiment_seperate_05thresh_vader.csv', index=False)