In [1]:
import pandas as pd
import numpy as np

# misc
import datetime as dt
from pprint import pprint
from itertools import chain



# sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (10, 8) # default plot size
import seaborn as sns
sns.set(style='whitegrid', palette='Dark2')
from wordcloud import WordCloud

In [2]:
# Fetch the NLTK resources the later cells rely on.
nltk.download('vader_lexicon') # lexicon data for the VADER sentiment analyzer
nltk.download('punkt') # models required by the word tokenizer
nltk.download('stopwords') # stop-word corpus read through nltk.corpus.stopwords

[nltk_data] Downloading package vader_lexicon to C:\Users\Su
[nltk_data]     Yunhua\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Su
[nltk_data]     Yunhua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Su
[nltk_data]     Yunhua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# NOTE(review): `bitcoin` is not defined in any visible cell (In [3] is missing).
# It is presumably a list of Reddit post objects rather than plain titles, since
# later cells read `.title`, `.score` and `.created` from its elements — confirm
# and restore the cell that loads it so the notebook runs on a fresh kernel.
print(len(bitcoin))

991


In [6]:
bitcoin0 = bitcoin[0]

# pprint(vars(bitcoin0)) # uncomment to dump every attribute of the first post
print(bitcoin0.title) # headline
print(bitcoin0.score) # upvotes
print(bitcoin0.created) # UNIX timestamp (seconds since the epoch)
print(dt.datetime.fromtimestamp(bitcoin0.created)) # same timestamp as date and time; no tz is passed, so it is rendered in the machine's local timezone
print(bitcoin0.num_comments) # no. of comments
print(bitcoin0.upvote_ratio) # upvote / total votes
print(bitcoin0.total_awards_received) # no. of awards given

It's official! 1 Bitcoin = $10,000 USD
48607
1511918724.0
2017-11-28 17:25:24
4436
0.81
27


In [7]:
# headline of every post, in the order the posts appear in `bitcoin`
title = [post.title for post in bitcoin]

In [24]:
# Build a (Date, Title) table from the raw posts.
# NOTE: this cell must run while `bitcoin` is still the list of post objects, i.e.
# before the cell that rebinds `bitcoin` to a DataFrame. Iterating a DataFrame yields
# its column names (strings), which is what produces
# "'str' object has no attribute 'created_utc'".
data = [
    # `created_utc` is presumably an epoch in seconds (TODO confirm); convert it to a
    # timezone-aware UTC datetime so the dates do not depend on the machine's timezone
    [dt.datetime.fromtimestamp(post.created_utc, tz=dt.timezone.utc), post.title]
    for post in bitcoin
]

df = pd.DataFrame(data, columns=['Date', 'Title'])

# show a preview instead of printing the whole frame
df.head()

AttributeError: 'str' object has no attribute 'created_utc'

In [19]:
# raw `created` timestamp of every post, in the order the posts appear in `bitcoin`
date = [post.created for post in bitcoin]

AttributeError: 'str' object has no attribute 'created'

In [8]:
# NOTE: from this cell on, `bitcoin` is a one-column DataFrame of titles and no longer
# the list of post objects; cells that read post attributes must run before this one.
bitcoin = pd.DataFrame({"title": title})
bitcoin.head()

Unnamed: 0,title
0,"It's official! 1 Bitcoin = $10,000 USD"
1,The last 3 months in 47 seconds.
2,Tesla buys $1.5b in Bitcoin and is looking to ...
3,It's over 9000!!!
4,Everyone who's trading BTC right now


In [9]:
sid = SentimentIntensityAnalyzer()

pos_text = "Vader is awesome"
cap_pos_text = "Vader is AWESOME!" # capitalization and "!" increase the effect
neg_text = "Vader is bad"

# print the score dict of each sample sentence, one per line
for sample in (pos_text, cap_pos_text, neg_text):
  print(sid.polarity_scores(sample))

{'neg': 0.0, 'neu': 0.328, 'pos': 0.672, 'compound': 0.6249}
{'neg': 0.0, 'neu': 0.281, 'pos': 0.719, 'compound': 0.729}
{'neg': 0.636, 'neu': 0.364, 'pos': 0.0, 'compound': -0.5423}


In [11]:
# score every title; the result is a list with one polarity dict per title
res = bitcoin['title'].apply(sid.polarity_scores).tolist()
pprint(res[:3])

[{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0},
 {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0},
 {'compound': 0.3818, 'neg': 0.0, 'neu': 0.88, 'pos': 0.12}]


In [12]:
# one row per title, one column per key of the polarity dicts in `res`
sentiment_df = pd.DataFrame.from_records(res)

# Drop score columns left behind by an earlier run of this cell before attaching the
# fresh ones. Without this, re-running the cell appends a second set of identically
# named columns and `bitcoin['compound']` stops being a single Series.
bitcoin = pd.concat(
    [bitcoin.drop(columns=sentiment_df.columns, errors='ignore'), sentiment_df],
    axis=1,
    join='inner',
)
bitcoin.head()

Unnamed: 0,title,neg,neu,pos,compound
0,"It's official! 1 Bitcoin = $10,000 USD",0.0,1.0,0.0,0.0
1,The last 3 months in 47 seconds.,0.0,1.0,0.0,0.0
2,Tesla buys $1.5b in Bitcoin and is looking to ...,0.0,0.88,0.12,0.3818
3,It's over 9000!!!,0.0,1.0,0.0,0.0
4,Everyone who's trading BTC right now,0.0,1.0,0.0,0.0


In [15]:
# compound scores at or beyond +/-THRESHOLD are labelled pos / neg, everything in between neu
THRESHOLD = 0.2

conditions = [
    (bitcoin['compound'] <= -THRESHOLD),
    (bitcoin['compound'] > -THRESHOLD) & (bitcoin['compound'] < THRESHOLD),
    (bitcoin['compound'] >= THRESHOLD),
    ]

values = ["neg", "neu", "pos"]
# Pass an explicit string default: np.select otherwise falls back to the integer 0,
# which has no common dtype with the string choices (TypeError on recent NumPy) and
# would label any row matching no condition (e.g. a NaN score) as '0' on older ones.
bitcoin['label'] = np.select(conditions, values, default="neu")

bitcoin.head()

Unnamed: 0,title,neg,neu,pos,compound,label
0,"It's official! 1 Bitcoin = $10,000 USD",0.0,1.0,0.0,0.0,neu
1,The last 3 months in 47 seconds.,0.0,1.0,0.0,0.0,neu
2,Tesla buys $1.5b in Bitcoin and is looking to ...,0.0,0.88,0.12,0.3818,pos
3,It's over 9000!!!,0.0,1.0,0.0,0.0,neu
4,Everyone who's trading BTC right now,0.0,1.0,0.0,0.0,neu


In [None]:
# Break the first title down word by word, then report the sentence-level scores.
# Uses the `bitcoin` frame built above (`news` is not defined in this notebook).
sentence0 = bitcoin['title'].iloc[0]
print(sentence0)
words0 = sentence0.split()
print(words0)

pos_list, neg_list, neu_list = [], [], []

for word in words0:
  # score each word once and reuse the value for both comparisons
  word_compound = sid.polarity_scores(word)['compound']
  if word_compound >= THRESHOLD:
    pos_list.append(word)
  elif word_compound <= -THRESHOLD:
    neg_list.append(word)
  else:
    neu_list.append(word)

print('\nPositive:', pos_list)
print('Neutral:', neu_list)
print('Negative:', neg_list)
score = sid.polarity_scores(sentence0)

print(f"\nThis sentence is {round(score['neg'] * 100, 2)}% negative")
print(f"This sentence is {round(score['neu'] * 100, 2)}% neutral")
print(f"This sentence is {round(score['pos'] * 100, 2)}% positive")

# derive the verdict from the actual compound score instead of hard-coding NEGATIVE
compound = score['compound']
if compound >= THRESHOLD:
  verdict, relation = "POSITIVE", f">= {THRESHOLD}"
elif compound <= -THRESHOLD:
  verdict, relation = "NEGATIVE", f"<= {-THRESHOLD}"
else:
  verdict, relation = "NEUTRAL", f"is between {-THRESHOLD} and {THRESHOLD}"

print(f"The compound value : {compound} {relation}")
print(f"\nThis sentence is {verdict}")

# source https://stackoverflow.com/a/51515048/11386747

In [None]:
# number of titles per sentiment label (`news` is not defined in this notebook; the labelled frame is `bitcoin`)
bitcoin['label'].value_counts()

In [None]:
# bar-style histogram of the sentiment labels (`news` is not defined in this notebook; the labelled frame is `bitcoin`)
sns.histplot(bitcoin['label']);

In [None]:
def news_title_output(df, label):
  """Print a rule of 20 '=' characters, then every title in `df` whose
  'label' column equals `label`, one title per line."""
  matching_titles = df.loc[df['label'] == label, 'title'].values
  print("=" * 20)
  print("\n".join(matching_titles))

# Randomly sample 5 titles per label with a fixed seed so the selection is repeatable.
# The labelled frame in this notebook is `bitcoin` (`news` is never defined).
# NOTE(review): sampling without replacement presumably fails if any label has fewer
# than 5 titles — confirm against the label counts.
news_sub = bitcoin.groupby('label').sample(n = 5, random_state = 7)

# one heading + title listing per label, in the order pos, neu, neg
for heading, label in (
    ("Positive news", "pos"),
    ("\nNeutral news", "neu"),
    ("\nNegative news", "neg"),
):
  print(heading)
  news_title_output(news_sub, label)

In [None]:
text = "Let's see how the NLTK tokenizer works!"

# NLTK's default word tokenizer
print(word_tokenize(text))

# regexp tokenizer where the pattern describes the gaps: split on runs of whitespace
whitespace_tk = RegexpTokenizer(r'\s+', gaps=True)
print(whitespace_tk.tokenize(text))

# regexp tokenizer where the pattern describes the tokens: keeps word characters, drops punctuation
tk = RegexpTokenizer(r'\w+')
print(tk.tokenize(text))

In [None]:
# English stop-word list from the NLTK corpus; show its size and the first ten entries
stop_words = stopwords.words('english')
print(len(stop_words), stop_words[:10], sep="\n")

In [None]:
def custom_tokenize(text):
  """Return the lower-cased word tokens of `text` that are not in `stop_words`.

  Single quotes and dashes are deleted before tokenizing, so the pieces on either
  side of them are joined into one token.
  """
  # NOTE(review): because apostrophes are removed before the stop-word check, a
  # contraction such as "don't" becomes "dont" and is only dropped if that stripped
  # form is itself in `stop_words` — confirm this is intended.
  cleaned = text
  for ch in ("'", "-"):
    cleaned = cleaned.replace(ch, "")
  cleaned = cleaned.lower()

  # keep runs of word characters only
  word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

  return [tok for tok in word_tokenizer.tokenize(cleaned) if tok not in stop_words]

print(custom_tokenize(text))

In [None]:
def tokens_2_words(df, label):
  """Return one flat list with the `custom_tokenize` tokens of every title in
  `df` whose 'label' column equals `label`, in row order."""
  matching_titles = df.loc[df['label'] == label, 'title']
  # tokenize title by title and flatten as we go
  return [word for title in matching_titles for word in custom_tokenize(title)]

# token lists for the positively and negatively labelled titles
# (the labelled frame in this notebook is `bitcoin`; `news` is never defined)
pos_words = tokens_2_words(bitcoin, 'pos')
neg_words = tokens_2_words(bitcoin, 'neg')

In [None]:
# frequency distribution over the tokens collected from the positively labelled titles
pos_freq = nltk.FreqDist(pos_words)
# the 20 most frequent tokens with their counts
pos_freq.most_common(20)

In [None]:
# NOTE(review): neither `extract_sentence_from_word` nor `news` is defined in any
# visible cell, so this call presumably raises NameError on a fresh kernel — confirm
# where the helper is defined and whether `bitcoin` is the intended frame.
extract_sentence_from_word(news, "facebook", "neg")