In [9]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as tkr
import seaborn as sns
import scipy as stats
import regex as re

import nltk # sentiment library
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
nltk.download('vader_lexicon') # download vader lexicon
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.sentiment import SentimentIntensityAnalyzer as SIA

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ckard\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ckard\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    stop_words = stopwords.words("english")
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    lemmatizer = WordNetLemmatizer()
    df = pd.read_csv('bitcoin.csv',dtype={'conversation_id':str,
                                          'timezone':str,
                                          'photos':str,
                                          'user_id':str,
                                          'replies_count':str,
                                          'retweets_count':str,
                                          'likes_count':str,
                                          'video':str,
                                          'near':str,
                                          'geo':str,
                                          'source':str,
                                          'user_rt_id':str,
                                          'user_rt':str,
                                          'retweet_id':str,
                                          'reply_to':str,
                                          'trans_src':str,
                                          'trans_dest':str,
                                          'retweet_date':str,
                                          'translate':str})

In [6]:
    df=df.drop(['name','place','quote_url','near','geo','user_rt','retweet_id','retweet_date','translate','trans_src','trans_dest','source','thumbnail','user_rt_id'],axis=1)
    index_names = df[df['language'] != 'en'].index
    df.drop(index_names, inplace=True)
    df = df.drop(['conversation_id','timezone','language','mentions','urls','photos','replies_count','retweets_count','likes_count','retweet','video','reply_to'], axis=1)
    print(df.head())

                    id                                  created_at  \
0  1241514861353406468  2020-03-21 17:59:43 Mountain Daylight Time   
1  1241514829120040960  2020-03-21 17:59:36 Mountain Daylight Time   
2  1241514705904115712  2020-03-21 17:59:06 Mountain Daylight Time   
3  1241514692880683008  2020-03-21 17:59:03 Mountain Daylight Time   
4  1241514689634467845  2020-03-21 17:59:02 Mountain Daylight Time   

         date      time              user_id         username  \
0  2020-03-21  17:59:43  1016637745102577665         vlada682   
1  2020-03-21  17:59:36           4863588554       jesamine09   
2  2020-03-21  17:59:06   897577030476324864  thebitcoinpizza   
3  2020-03-21  17:59:03   932046802680938496     bitcoinchute   
4  2020-03-21  17:59:02   880916204378042368     emmamackay30   

                                               tweet  \
0  32k $icx ans 1m $vet is it enough to become ri...   
1  BIT FUN. Play games, have fun, earn bitcoin!. ...   
2  The #BitcoinPizza

In [10]:
def regex_tweet(tweet):
    tweet = ' '.join(re.sub("(\$[A-Za-z0-9]+)", " ", tweet).split())
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", tweet).split())
    tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
    tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())
    tweet = ' '.join(re.sub("[\[\]]", " ", tweet).split())
    return tweet

In [12]:
df['regex_tweet'] = df.apply(lambda row: regex_tweet(row['tweet']), axis=1)

In [16]:
print(df.head())

                    id                                  created_at  \
0  1241514861353406468  2020-03-21 17:59:43 Mountain Daylight Time   
1  1241514829120040960  2020-03-21 17:59:36 Mountain Daylight Time   
2  1241514705904115712  2020-03-21 17:59:06 Mountain Daylight Time   
3  1241514692880683008  2020-03-21 17:59:03 Mountain Daylight Time   
4  1241514689634467845  2020-03-21 17:59:02 Mountain Daylight Time   

         date      time              user_id         username  \
0  2020-03-21  17:59:43  1016637745102577665         vlada682   
1  2020-03-21  17:59:36           4863588554       jesamine09   
2  2020-03-21  17:59:06   897577030476324864  thebitcoinpizza   
3  2020-03-21  17:59:03   932046802680938496     bitcoinchute   
4  2020-03-21  17:59:02   880916204378042368     emmamackay30   

                                               tweet  \
0  32k $icx ans 1m $vet is it enough to become ri...   
1  BIT FUN. Play games, have fun, earn bitcoin!. ...   
2  The #BitcoinPizza

**README**
- emot: library used to process emoji and emoticons

- first see polarity adjusted score with TextBlob 

- then compare adjusted polarity scores using NLTK's sentiment analyzer afterwards

**processing emojis via emot library:**
----

In [15]:
# 1. Via pip:
# $ pip install emot --upgrade

#import sys
#!{sys.executable} -m pip install emot --upgrade

# 2. From master branch: 
# $ git clone https://github.com/NeelShah18/emot.git
# $ cd emot
# $ python setup.py install

import emot

def clean_mean(val):
    return val.replace('_', ' ').replace('-', ' ').replace(':', ' ')

def convert_emojicon(text, isPrint=True):
    for emoti in emot.emo_unicode.EMOTICONS:
        if emoti in text:
            text = text.replace(emoti, clean_mean(emot.emo_unicode.EMOTICONS.get(emoti, '')))
            if isPrint==True:
                print(emoti)
            
    for emoti in emot.emo_unicode.UNICODE_EMO:
        if emoti in text:
            text = text.replace(emoti, clean_mean(emot.emo_unicode.UNICODE_EMO.get(emoti, '')))
            if isPrint==True:
                print(emoti)
            
    for emoti in emot.emo_unicode.EMOTICONS_EMO:
        if emoti in text:
            text = text.replace(emoti, clean_mean(emot.emo_unicode.EMOTICONS_EMO.get(emoti, '')))
            if isPrint==True:
                print(emoti)
    return text

In [1]:
#df['emojied_tweets'] = df.apply(lambda row: convert_emojicon(row['regex_tweet']), axis=1)

**via TextBlob Sentiment Analyser:**
----


In [19]:
from textblob import TextBlob

def stem_tokenize(text):
    stop_words = stopwords.words("english")
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    lemmatizer = WordNetLemmatizer()
    tokenzr = RegexpTokenizer('\s+', gaps = True)
    
    tokenized_text = tokenzr.tokenize(text.lower())
    words = [lemmatizer.lemmatize(w) for w in tokenized_text if w not in stop_words]
    stem_text = " ".join([stemmer.stem(i) for i in words])

    return stem_text #converted emoji to unicode description of emoji is return as string text

In [21]:
def compareAdjustedScores(text):
    adjusted_text = convert_emojicon(text, isPrint==False)
    adjusted_stemmed = stem_tokenize(adjusted_text)
    
    print("pre-adjustments: {}; post-adjustments: {}".format(TextBlob(text).sentiment.polarity, TextBlob(adjusted_stemmed).sentiment.polarity))

In [22]:
isPrint = True
#
sample_sad = "😞"
print(returnedText:=convert_emojicon(sample_sad, isPrint==True)) #converted emoji to unicode description of emoji is return as string text
compareAdjustedScores(sample_sad)
print("\n---\n")
#
sample_sad2 = "🙁"
print(returnedText:=convert_emojicon(sample_sad2, isPrint==True))
compareAdjustedScores(sample_sad2)
print("\n---\n")
#
sample_happy = "😀"
print(returnedText:=convert_emojicon(sample_happy, isPrint==True))
compareAdjustedScores(sample_happy)
print("\n---\n")
#
sample_happy2= "🙂"
print(returnedText:=convert_emojicon(sample_happy2, isPrint==True))
compareAdjustedScores(sample_happy2)

😞
 disappointed face 
pre-adjustments: 0.0; post-adjustments: 0.0

---

🙁
 slightly frowning face 
pre-adjustments: 0.0; post-adjustments: -0.16666666666666666

---

😀
 grinning face 
pre-adjustments: 0.0; post-adjustments: 0.0

---

🙂
 slightly smiling face 
pre-adjustments: 0.0; post-adjustments: 0.06666666666666667


**via NLTK Sentiment Analyser:**
---

In [28]:
sia = SIA()
# initialize object first, as SIA (from NLTK) is an object class

###
sample_text_sad1a = "😞"
score_sad1a, processedScore_sad1a = sia.polarity_scores(sample_text_sad1a), sia.polarity_scores(score_adjustedStemmed:=stem_tokenize(convert_emojicon(sample_text_sad1a, isPrint==False)))
print("\nCase 1a:  disappointed face\n pre-emoji processing: {}\n post-emoji processing: {}\n".format(score_sad1a, processedScore_sad1a))
print("before: {}; after: {}".format(score_sad1a['compound'], processedScore_sad1a['compound'],'\n'))

print(score_adjustedStemmed:=stem_tokenize(convert_emojicon(sample_text_sad1a, isPrint==True)))
####

sample_text_sad1b = "🙁"
score_sad1b, processedScore_sad1b = sia.polarity_scores(sample_text_sad1b), sia.polarity_scores(score_adjustedStemmed:=stem_tokenize(convert_emojicon(sample_text_sad1b, isPrint==False)))
print("\nCase 1b:  slightly frowning face \n pre-emoji processing: {}\n post-emoji processing: {}\n".format(score_sad1b, processedScore_sad1b))
print("before: {}; after: {}".format(score_sad1b['compound'], processedScore_sad1b['compound'],'\n'))

print(convert_emojicon(sample_text_sad1b, isPrint==True))
###

sample_text_happy2a = "😀"
score_happy2a, processedScore_happy2a = sia.polarity_scores(sample_text_happy2a), sia.polarity_scores(score_adjustedStemmed:=stem_tokenize(convert_emojicon(sample_text_happy2a, isPrint==False)))
print("\nCase 2a: grinning face \npre-emoji processing: {}\npost-emoji processing: {}\n".format(score_happy2a,processedScore_happy2a))
print("before: {}; after: {}".format(score_happy2a['compound'], processedScore_happy2a['compound'],'\n'))

print(convert_emojicon(sample_text_happy2a, isPrint==True))
###

sample_text_happy2b = "🙂"
score_happy2b, processedScore_happy2b = sia.polarity_scores(sample_text_happy2b), sia.polarity_scores(convert_emojicon(sample_text_happy2b, isPrint==False))
print("\nCase 2b: slightly smiling face \n pre-emoji processing: {}\n post-emoji processing: {}\n".format(score_happy2b, processedScore_happy2b))
print("before: {}; after: {}".format(score_happy2b['compound'], processedScore_happy2b['compound'],'\n'))

print(convert_emojicon(sample_text_happy2b, isPrint==True))
print(df.head())


Case 1a:  disappointed face
 pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
 post-emoji processing: {'neg': 0.73, 'neu': 0.27, 'pos': 0.0, 'compound': -0.4019}

before: 0.0; after: -0.4019
😞
disappoint face

Case 1b:  slightly frowning face 
 pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
 post-emoji processing: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

before: 0.0; after: 0.0
🙁
 slightly frowning face 

Case 2a: grinning face 
pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
post-emoji processing: {'neg': 0.0, 'neu': 0.244, 'pos': 0.756, 'compound': 0.4767}

before: 0.0; after: 0.4767
😀
 grinning face 

Case 2b: slightly smiling face 
 pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
 post-emoji processing: {'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.4033}

before: 0.0; after: 0.4033
🙂
 slightly smiling face 
                    id                   

In [37]:
print(sia.polarity_scores(scoreAdjusted:=convert_emojicon(sample_text_sad1b, isPrint==False)))
scoreAdjusted

{'neg': 0.513, 'neu': 0.487, 'pos': 0.0, 'compound': -0.2748}


' slightly frowning face '

In [24]:
df.drop()

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


'slight frown face'

In [32]:
df['sentiment'] = df.apply(lambda row: sia.polarity_scores(row['emojied_tweets']), axis=1)

In [31]:
print(df.head())

                    id                                  created_at  \
0  1241514861353406468  2020-03-21 17:59:43 Mountain Daylight Time   
1  1241514829120040960  2020-03-21 17:59:36 Mountain Daylight Time   
2  1241514705904115712  2020-03-21 17:59:06 Mountain Daylight Time   
3  1241514692880683008  2020-03-21 17:59:03 Mountain Daylight Time   
4  1241514689634467845  2020-03-21 17:59:02 Mountain Daylight Time   

         date      time              user_id         username  \
0  2020-03-21  17:59:43  1016637745102577665         vlada682   
1  2020-03-21  17:59:36           4863588554       jesamine09   
2  2020-03-21  17:59:06   897577030476324864  thebitcoinpizza   
3  2020-03-21  17:59:03   932046802680938496     bitcoinchute   
4  2020-03-21  17:59:02   880916204378042368     emmamackay30   

                                               tweet  \
0  32k $icx ans 1m $vet is it enough to become ri...   
1  BIT FUN. Play games, have fun, earn bitcoin!. ...   
2  The #BitcoinPizza

----
- adjusting score, without tokenization

- adjusting score, then tokenization 

(eval how tokenization/stemmatization influences adjusted polarity scores for emojis)


In [10]:
### tokenize than convert
toBe_Adjusted = "😀"
sample_adjusted=convert_emojicon(toBe_Adjusted, isPrint==False)
print("Case 3: no tokenization, grinning face \npre-emoji processing: {}\npost-emoji processing: {}\n".format(sia.polarity_scores(toBe_Adjusted), sia.polarity_scores(sample_adjusted)))

print("adjust score without tokenization: ",sia.polarity_scores(toBe_Adjusted)['compound'], sia.polarity_scores(sample_adjusted)['compound'],'\n')


### tokenize after converting
toBe_adjusted_stemmed = "😀"
sample_adjusted=convert_emojicon(toBe_adjusted_stemmed, isPrint==False)
sample_adjustedStemmed=stem_tokenize(sample_adjusted)
convert_emojicon(toBe_adjusted_stemmed, isPrint==False)
print("Case 3: +tokenization, grinning face \npre-emoji processing: {}\npost-emoji processing: {}\n".format(sia.polarity_scores(toBe_adjusted_stemmed), sia.polarity_scores(sample_adjustedStemmed)))

print("adjust score WITH tokenization: ", sia.polarity_scores(toBe_adjusted_stemmed)['compound'], sia.polarity_scores(sample_adjustedStemmed)['compound'],'\n')


Case 3: no tokenization, grinning face 
pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
post-emoji processing: {'neg': 0.0, 'neu': 0.286, 'pos': 0.714, 'compound': 0.3612}

adjust score without tokenization:  0.0 0.3612 

Case 3: +tokenization, grinning face 
pre-emoji processing: {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
post-emoji processing: {'neg': 0.0, 'neu': 0.244, 'pos': 0.756, 'compound': 0.4767}

adjust score WITH tokenization:  0.0 0.4767 



In [35]:
def get_sent(collection):
    return collection['compound']
    

In [36]:
df['sentiment_compound'] = df.apply(lambda row:get_sent(row['sentiment']), axis=1)

In [42]:
def regex_final(tweet):
    tweet = ' '.join(re.sub("[^a-zA-Z0-9 -]", " ", tweet).split())
    return tweet

In [43]:
df['pure_tweets'] = df.apply(lambda row:regex_final(row['emojied_tweets']), axis=1)

In [39]:
compression_opts = dict(method='zip',
                            archive_name='btc_analyzed.csv')

In [40]:
df.to_csv('btc.zip',index=False, compression=compression_opts)

In [45]:

df = df.drop(['user_id','regex_tweet','emojied_tweets','tweet'],axis=1)

KeyError: "['user_id' 'regex_tweet' 'emojied_tweets'] not found in axis"

In [46]:
df = df.drop(['tweet'],axis=1)

In [47]:
print(df.head())

                    id                                  created_at  \
0  1241514861353406468  2020-03-21 17:59:43 Mountain Daylight Time   
1  1241514829120040960  2020-03-21 17:59:36 Mountain Daylight Time   
2  1241514705904115712  2020-03-21 17:59:06 Mountain Daylight Time   
3  1241514692880683008  2020-03-21 17:59:03 Mountain Daylight Time   
4  1241514689634467845  2020-03-21 17:59:02 Mountain Daylight Time   

         date      time         username                     hashtags  \
0  2020-03-21  17:59:43         vlada682                           []   
1  2020-03-21  17:59:36       jesamine09        ['bitcoin', 'faucet']   
2  2020-03-21  17:59:06  thebitcoinpizza  ['bitcoinpizza', 'bitcoin']   
3  2020-03-21  17:59:03     bitcoinchute                           []   
4  2020-03-21  17:59:02     emmamackay30  ['bitcoin', 'cryptonation']   

                                  cashtags  \
0  ['icx', 'vet', 'btc', 'life', 'crypto']   
1                                       []   
2 

In [48]:
df=df.rename(columns={'sentiment_compound':'score'})
print(df.head())

                    id                                  created_at  \
0  1241514861353406468  2020-03-21 17:59:43 Mountain Daylight Time   
1  1241514829120040960  2020-03-21 17:59:36 Mountain Daylight Time   
2  1241514705904115712  2020-03-21 17:59:06 Mountain Daylight Time   
3  1241514692880683008  2020-03-21 17:59:03 Mountain Daylight Time   
4  1241514689634467845  2020-03-21 17:59:02 Mountain Daylight Time   

         date      time         username                     hashtags  \
0  2020-03-21  17:59:43         vlada682                           []   
1  2020-03-21  17:59:36       jesamine09        ['bitcoin', 'faucet']   
2  2020-03-21  17:59:06  thebitcoinpizza  ['bitcoinpizza', 'bitcoin']   
3  2020-03-21  17:59:03     bitcoinchute                           []   
4  2020-03-21  17:59:02     emmamackay30  ['bitcoin', 'cryptonation']   

                                  cashtags  \
0  ['icx', 'vet', 'btc', 'life', 'crypto']   
1                                       []   
2 

In [49]:
df=df.rename(columns={'pure_tweets':'tweets'})
print(df.head())

                    id                                  created_at  \
0  1241514861353406468  2020-03-21 17:59:43 Mountain Daylight Time   
1  1241514829120040960  2020-03-21 17:59:36 Mountain Daylight Time   
2  1241514705904115712  2020-03-21 17:59:06 Mountain Daylight Time   
3  1241514692880683008  2020-03-21 17:59:03 Mountain Daylight Time   
4  1241514689634467845  2020-03-21 17:59:02 Mountain Daylight Time   

         date      time         username                     hashtags  \
0  2020-03-21  17:59:43         vlada682                           []   
1  2020-03-21  17:59:36       jesamine09        ['bitcoin', 'faucet']   
2  2020-03-21  17:59:06  thebitcoinpizza  ['bitcoinpizza', 'bitcoin']   
3  2020-03-21  17:59:03     bitcoinchute                           []   
4  2020-03-21  17:59:02     emmamackay30  ['bitcoin', 'cryptonation']   

                                  cashtags  \
0  ['icx', 'vet', 'btc', 'life', 'crypto']   
1                                       []   
2 

In [50]:
df.to_csv('btc.zip',index=False, compression=compression_opts)

In [51]:
df = df.drop(['created_at','sentiment'],axis=1)

In [53]:
df=df.rename(columns={'score':'sentiment'})

In [54]:
print(df.head())

                    id        date      time         username  \
0  1241514861353406468  2020-03-21  17:59:43         vlada682   
1  1241514829120040960  2020-03-21  17:59:36       jesamine09   
2  1241514705904115712  2020-03-21  17:59:06  thebitcoinpizza   
3  1241514692880683008  2020-03-21  17:59:03     bitcoinchute   
4  1241514689634467845  2020-03-21  17:59:02     emmamackay30   

                      hashtags                                 cashtags  \
0                           []  ['icx', 'vet', 'btc', 'life', 'crypto']   
1        ['bitcoin', 'faucet']                                       []   
2  ['bitcoinpizza', 'bitcoin']                                       []   
3                           []                                  ['btc']   
4  ['bitcoin', 'cryptonation']                                       []   

                                                link  sentiment  \
0  https://twitter.com/Vlada682/status/1241514861...     0.5574   
1  https://twitter.com/j

In [71]:
df.to_csv('btc.zip',index=False, compression=compression_opts, date_format='%Y-%m-%d')

In [59]:
from datetime import datetime
def reformat_date(date):
    datetimeobject = datetime.strptime(date,'%m/%d/%Y')
    newFormat=datetimeobject.strftime('%Y-%m-%d')
    return newFormat
    

        
        
            
    
    

In [65]:
df['date'] = pd.to_datetime(df['date'])

In [68]:
#convert_dict = {'date':str}
#df = df.astype(convert_dict)
print(df.dtypes)
print(df.head())

id                   object
date         datetime64[ns]
time                 object
username             object
hashtags             object
cashtags             object
link                 object
sentiment           float64
tweets               object
dtype: object
                    id       date      time         username  \
0  1241514861353406468 2020-03-21  17:59:43         vlada682   
1  1241514829120040960 2020-03-21  17:59:36       jesamine09   
2  1241514705904115712 2020-03-21  17:59:06  thebitcoinpizza   
3  1241514692880683008 2020-03-21  17:59:03     bitcoinchute   
4  1241514689634467845 2020-03-21  17:59:02     emmamackay30   

                      hashtags                                 cashtags  \
0                           []  ['icx', 'vet', 'btc', 'life', 'crypto']   
1        ['bitcoin', 'faucet']                                       []   
2  ['bitcoinpizza', 'bitcoin']                                       []   
3                           []                   