## Initial setup

In [None]:
import dateutil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import simplejson as json

In [3]:
with open('secrets.txt', 'r') as f:
    env = json.load(f)
print('Loaded env vars')

Loaded env vars


## Quick tokenization and sentiment

In [None]:
from nltk.tokenize import TweetTokenizer
# Tokenizer instance used to split raw tweet text into tokens.
tweet_tokenizer = TweetTokenizer()

# Each entry becomes a (token_list, label) pair.
# NOTE(review): `train_tweets` is only assigned in a later cell (the train/test
# split), so on a fresh kernel this cell raises NameError unless that cell has
# been run first.
tokenized_train_tweets = [(tweet_tokenizer.tokenize(t),l) for t,l in train_tweets]

# Sanity check: how many tweets were tokenized, and what the first pair looks like.
print(len(tokenized_train_tweets))
print(tokenized_train_tweets[0])

In [None]:
import nltk
nltk.download('twitter_samples')

In [None]:
from nltk.corpus import twitter_samples

In [None]:
pos_tweets = [(t, 'pos') for t in twitter_samples.strings('positive_tweets.json')]
neg_tweets = [(t, 'neg') for t in twitter_samples.strings('negative_tweets.json')]
print(len(pos_tweets))
print(len(neg_tweets))

In [None]:
import random

# Seed the RNG so the shuffles -- and therefore the train/test split built from
# these lists -- are reproducible across kernel restarts.
random.seed(42)
random.shuffle(pos_tweets)
random.shuffle(neg_tweets)

In [None]:
train_pos_tweets = pos_tweets[:4500]
test_pos_tweets = pos_tweets[4500:]

train_neg_tweets = neg_tweets[:4500]
test_neg_tweets = neg_tweets[4500:]

train_tweets = train_pos_tweets + train_neg_tweets
test_tweets = test_pos_tweets + test_neg_tweets

print(len(train_tweets))
print(len(test_tweets))

In [None]:
from nltk.sentiment import SentimentAnalyzer

sentim_analyzer = SentimentAnalyzer()

In [None]:
all_words = sentim_analyzer.all_words(tokenized_train_tweets)
print(len(all_words))
print(all_words[0:10])

In [None]:
from nltk.sentiment.util import extract_unigram_feats

unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
print(len(unigram_feats))
print(unigram_feats[0:20])
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [None]:
# apply_features expects (token_list, label) pairs: the unigram extractor tests
# membership in set(document), so a raw string would be compared character by
# character instead of word by word.
tokenized_test_tweets = [(tweet_tokenizer.tokenize(text), label) for text, label in test_tweets]

training_set = sentim_analyzer.apply_features(tokenized_train_tweets)
test_set = sentim_analyzer.apply_features(tokenized_test_tweets)

In [None]:
from nltk.classify import NaiveBayesClassifier

trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))

In [None]:
# t = 'RT @RedditBTC: Russia plans $10 Billion Bitcoin investment... so it begins. https://t.co/vSSeeZWG9s'
t = "i don't get bitcoin"
# Tokenize first: the feature extractors work on a list of tokens, not a raw string.
classifier.classify(sentim_analyzer.extract_features(tweet_tokenizer.tokenize(t)))

In [None]:
# import twython
import nltk.sentiment.util
from nltk.classify import NaiveBayesClassifier

trainer = NaiveBayesClassifier.train

# ans = nltk.sentiment.util.demo_tweets(trainer, output='output.txt')

## Retrieving tweets with tweepy

In [None]:
import tweepy
from tweepy import Cursor

# Credentials are looked up by string key in the dict loaded from secrets.txt;
# the bare names used before were undefined and raised NameError.
# NOTE(review): assumes secrets.txt uses exactly these key names -- confirm.
auth = tweepy.OAuthHandler(env['tw_oauth_key'], env['tw_oauth_secret'])
auth.set_access_token(env['tw_token_key'], env['tw_token_secret'])

api = tweepy.API(auth)

In [None]:
bitcoin_tweets = api.search('bitcoin', lang='en', count=100)
print(len(bitcoin_tweets))
print([t.text for t in bitcoin_tweets[0:5]])

In [None]:
len(bitcoin_tweets)

In [None]:
cursor = Cursor(api.search, q='bitcoin', lang='en', count=100, tweet_mode='extended')

In [None]:
tlist = []

for page in cursor.pages(15):
    tlist.extend(page)

print(len(tlist))

In [None]:
tweets = [t.full_text for t in tlist]
print(len(tweets))

## Using textblob for quick polarity

In [None]:
from textblob import TextBlob

In [None]:
for t in tweets[0:50]:
    print('Tweet:\n' + t)
    blob = TextBlob(t)
    sents = blob.sentences
    if len(sents) >= 1:
        sent = sents[0]
        print(sent.sentiment)
    print('')

## Retrieving tweets from ElasticSearch

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl import Document, Date, Text, Integer, Float
from elasticsearch_dsl.connections import connections


In [None]:
# Create a model for tweets stored in the elasticsearch instance:

from elasticsearch_dsl import Document, Date, Text, Integer, Float

class ESTweet(Document):
    """Document model for tweets stored in the 'tweets' Elasticsearch index.

    Each class attribute declares one field of the document and its field type.
    """
    # Timestamps; presumably tweet creation time and the time the record was
    # stored -- verify against the code that writes the index.
    created_at = Date()
    stored_at = Date()
    # Tweet body, analyzed with the 'snowball' analyzer.
    full_text = Text(analyzer='snowball')
    # Presumably precomputed sentiment scores -- TODO confirm how they are produced.
    subjectivity = Float()
    polarity = Float()
    # NOTE(review): Integer is a 32-bit field type; confirm author ids always fit,
    # otherwise a wider type is needed.
    author_id = Integer()
    author_followers = Integer()

    class Index:
        # Name of the Elasticsearch index this document type is bound to.
        name = 'tweets'


In [None]:
# env vars are loaded at the beginning of the file.
client = Elasticsearch(hosts=[env['es_endpoint']])

In [None]:
# Get all tweets from Elasticsearch instance (dangerous without a limit!):

MAX_TWEETS = 1000

search = Search(index='tweets').using(client)\
            .query("match_all")\
            .sort({'created_at': {'order': 'desc'}})[:MAX_TWEETS]
results = search.execute() # results not used at the moment

tweet_hits = []
for hit in search:
    tweet_hits.append(hit)
len(tweet_hits)

In [None]:
tweet_hits[:10]

In [None]:
# Not currently used:
# tweet_dicts = [t.to_dict() for t in tweets]

In [None]:
def hitlist_to_dataframe(hit_list):
    """
    Transform tweet hitlist (from Elasticsearch) to a dataframe.

    Parameters
    ----------
    hit_list : iterable
        Search hits. Each hit must expose ``hit.meta.id``, support ``in`` for
        field-presence checks and ``hit[field]`` for field access.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_str`` (document id as str), ``created_at`` (parsed datetime)
        and ``full_text``; one row per hit that has both 'created_at' and
        'full_text'. Hits missing either field are skipped and the number of
        skipped hits is printed.
    """
    # `import dateutil` alone does not guarantee that the `parser` submodule is
    # loaded, so import it explicitly instead of relying on another package
    # having imported it as a side effect.
    from dateutil import parser as date_parser

    id_strs = []
    created_ats = []
    full_texts = []
    ndropped = 0
    for hit in hit_list:
        if 'created_at' not in hit or 'full_text' not in hit:
            ndropped += 1
            continue
        id_strs.append(str(hit.meta.id))
        created_ats.append(date_parser.parse(hit['created_at']))
        full_texts.append(hit['full_text'])

    df = pd.DataFrame({'id_str': id_strs, 'created_at': created_ats, 'full_text': full_texts})
    print('ndropped = {}'.format(ndropped))
    return df


In [None]:
df_tweets = hitlist_to_dataframe(tweet_hits)
tweet_hits = None # ready for garbage collection

df_tweets.head(10)

In [None]:
# set_index returns a new frame, so the result has to be assigned. Passing the
# Series (rather than the column name) keeps 'created_at' as a regular column as
# well, which the polarity time-series cell further down still reads.
# errors='ignore' keeps the cell re-runnable after 'id_str' is already gone.
df_tweets = df_tweets.set_index(df_tweets['created_at']).drop(columns=['id_str'], errors='ignore')
df_tweets.head()

## Show a wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
# Compile the full corpus for a word cloud:

# Join with spaces both within and between tweets; the previous `+=` glued the
# last token of one tweet to the first token of the next.
# NOTE(review): `tweet_tokens` is not defined anywhere in the visible notebook, so
# the TweetTokenizer instance created in the tokenization section is used instead.
corpus = ' '.join(
    ' '.join(tweet_tokenizer.tokenize(text)) for text in df_tweets['full_text']
)
print(len(corpus))

In [None]:
# Named `tweet_cloud` so the result does not take over the `wordcloud` package name.
tweet_cloud = WordCloud(width=800, height=450, random_state=21, max_font_size=110).generate(corpus)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(tweet_cloud, interpolation="bilinear")
ax.axis('off')
plt.show()

## NLP on tweets using Sentiment Analysis Dataset (SAD)

### Step 1: Load the data

In [None]:
import matplotlib.pyplot as plt
import nltk.tokenize
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.pipeline
from sklearn import base

In [None]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the old behaviour: malformed rows are skipped and
# a warning is emitted for each of them.
df_sad = pd.read_csv('Sentiment Analysis Dataset.csv', on_bad_lines='warn')

In [None]:
df_sad.shape

In [None]:
df_sad.head()

In [None]:
labels_sad = df_sad['Sentiment']
labels_sad.head()

In [None]:
df_sad = df_sad.drop(columns=['ItemID', 'Sentiment', 'SentimentSource'])
df_sad.head()

### Step 2: A first pass at cleaning

In [None]:
# Custom nltk transformer for cleaning and tokenizing tweets:

class TwitterPreprocessor(base.BaseEstimator, base.TransformerMixin):
    """Stateless transformer that cleans the 'SentimentText' column of a frame.

    Tweets are tokenized with an NLTK TweetTokenizer configured to lower-case
    text, strip @handles and shorten elongated character runs. Tokens that are
    a single character long or start with 'http' are discarded, and the
    remaining tokens are re-joined with single spaces.

    NOTE(review): `transform` returns a DataFrame, not a sequence of strings;
    a downstream text vectorizer needs the 'SentimentText' column selected
    first -- confirm how this is wired into pipelines.
    """

    def __init__(self):
        self.tw_tokenizer = nltk.tokenize.TweetTokenizer(
            preserve_case=False, reduce_len=True, strip_handles=True)

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of X whose 'SentimentText' values have been cleaned."""
        cleaned = X.copy()
        cleaned_text = cleaned['SentimentText'].apply(self._prepare_tweet)
        cleaned['SentimentText'] = cleaned_text
        return cleaned

    def _prepare_tweet(self, tweet):
        """Tokenize one tweet and rebuild it from the tokens worth keeping."""
        kept = []
        for token in self.tw_tokenizer.tokenize(tweet):
            # Skip one-character tokens and anything that looks like a URL.
            if len(token) <= 1 or token.startswith('http'):
                continue
            kept.append(token)
        return ' '.join(kept)


In [None]:
tw_preprocessor = TwitterPreprocessor()
df2 = tw_preprocessor.transform(df_sad)
df2.head()

In [None]:
# min_df is chosen by off-hand estimate. we should search over it.
# try bigrams ASAP
tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    ngram_range=(1,1), min_df=100
)

In [None]:
raise Exception('Caution: this is slow! Comment this exception if you are sure you want to continue')
df3 = tfidf_vectorizer.fit_transform(df2['SentimentText'])
print(df3.shape)

In [None]:
type(df3)

In [None]:
tfidf_vectorizer.vocabulary_

In [None]:
# Reduce the serialized file size.
# Only `stop_words_` is cleared: it exists for introspection only. `vocabulary_`
# must stay in place, otherwise the fitted vectorizer (which is dumped and reused
# in the pipeline below) can no longer transform new text.
tfidf_vectorizer.stop_words_ = None

In [None]:
np.linspace(0,10,11)

In [None]:
mnb_estimator = sklearn.naive_bayes.MultinomialNB()

# TODO:
# param_grid = {
#     'tfidf_vectorizer__min_df': linspace(0,)
# }

grid_search_cv = sklearn.model_selection.GridSearchCV(
    mnb_estimator, param_grid={'alpha': [0.1, 0.5, 1.0, 10.0]}, cv=5
)

In [None]:
grid_search_cv.fit(df3, labels_sad)

In [None]:
best_estimator = grid_search_cv.best_estimator_

In [None]:
best_estimator

In [None]:
import dill

# Write each artifact inside a `with` block so the file is flushed and closed
# as soon as the dump finishes (the bare open() calls leaked the handles).
for artifact, path in [
    (tw_preprocessor, 'tw_preprocessor.dill'),
    (tfidf_vectorizer, 'tfidf_vectorizer.dill'),
    (best_estimator, 'best_mnb_estimator.dill'),
]:
    with open(path, 'wb') as out_file:
        dill.dump(artifact, out_file)

In [None]:
mnb_estimator = sklearn.naive_bayes.MultinomialNB()

grid_search_cv = sklearn.model_selection.GridSearchCV(
    mnb_estimator, param_grid={'alpha': [0.1, 0.5, 1.0, 10.0]}, cv=5
)

pipe = sklearn.pipeline.Pipeline([
    ('tw_preprocessor', tw_preprocessor),
    ('tfidf_vectorizer', tfidf_vectorizer),
    ('grid_search_cv', grid_search_cv)
])
pipe

In [None]:
# Score the fitted estimator selected by the grid search. `mnb_estimator` itself
# is never fitted (GridSearchCV fits clones), so scoring it would fail.
# Note this is accuracy on the training data, not a held-out estimate.
best_estimator.score(df3, labels_sad)

In [None]:
import operator

import sklearn.preprocessing

# `tw_preprocessor` returns a DataFrame, and iterating a DataFrame yields its
# column labels, so the vectorizer must be fed the text column itself. The
# 'select_text' step extracts it. `estimator` was never defined; the fitted
# grid-search winner is the intended final step.
pipe = sklearn.pipeline.Pipeline([
    ('tw_preprocessor', tw_preprocessor),
    ('select_text', sklearn.preprocessing.FunctionTransformer(
        operator.itemgetter('SentimentText'), validate=False)),
    ('tfidf_vectorizer', tfidf_vectorizer),
    ('estimator', best_estimator)
])
pipe

In [None]:
def build_word_cloud(df, column_name='full_text'):
    """Render a word cloud built from one text column of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to visualise.
    column_name : str, default 'full_text'
        Column whose string values are concatenated into the corpus.

    Returns
    -------
    None
        The figure is displayed with matplotlib.
    """
    # Single join instead of repeated string concatenation in a loop.
    corpus = ' '.join(df[column_name])

    # Only the class is imported in this notebook (`from wordcloud import
    # WordCloud`); the package name `wordcloud` is never bound to the module, so
    # the class has to be referenced directly.
    cloud = WordCloud(
        width=800, height=450, random_state=21, max_font_size=110, background_color='white'
    ).generate(corpus)

    plt.figure(figsize=(8, 5))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()


In [None]:
# df2 keeps its text in 'SentimentText'; the function's default column
# ('full_text') belongs to the Elasticsearch tweet frame.
build_word_cloud(df2, column_name='SentimentText')

In [None]:
# `estimator` is never defined in this notebook; use the fitted grid-search winner.
# The unused `tab = pd.DataFrame()` placeholder is dropped so the predictions
# are the cell's displayed output (`tab` is built from scratch further down).
best_estimator.predict(df3[0:100])

In [None]:
labels_sad[:100]

In [None]:
# Work on an explicit copy so adding columns does not write into a slice of df2
# (avoids SettingWithCopyWarning), and predict with the fitted grid-search
# winner -- `estimator` is never defined in this notebook.
tab = df2[:100].copy()
tab['labels_sad'] = labels_sad[:100]
tab['predicted'] = best_estimator.predict(df3[:100])
tab

In [None]:
df2.head()

In [None]:
example = pd.DataFrame({'SentimentText': ['bitcoin is awesome and it will grow this year']})
print(example)

In [None]:
pipe.predict(example)

## Bokeh

In [None]:
import bokeh
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

In [None]:
output_notebook()

In [None]:
df_tweets.head()

In [None]:
def get_tb_polarity(tweet):
    """Return the TextBlob polarity of the first sentence of a tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    float
        Polarity of the first sentence as reported by TextBlob, or NaN when
        the input is not a non-blank string or yields no sentences.
    """
    # Missing or blank text has no sentence to score.
    if not isinstance(tweet, str) or not tweet.strip():
        return np.nan

    # Only the class is imported in this notebook (`from textblob import
    # TextBlob`); the module name `textblob` is never bound.
    sents = TextBlob(tweet).sentences
    if not sents:
        return np.nan
    return float(sents[0].sentiment.polarity)


In [None]:
df_tweets['tb_polarity'] = df_tweets['full_text'].apply(get_tb_polarity)
df_tweets.head()

In [None]:
df_tweets.dtypes

In [None]:
df_tweets['tb_polarity'].describe()

In [None]:
s_polarity = pd.Series(data=df_tweets['tb_polarity'].values, index=df_tweets['created_at'].values)
s_polarity = s_polarity.sort_index()
len(s_polarity)

In [None]:
s_polarity = s_polarity.resample('H').mean()
print(len(s_polarity))
s_polarity.head()

In [None]:
import bokeh.models

# Time-series figure of the resampled polarity with a datetime x axis and no
# default toolbar tools.
# NOTE(review): `plot_width`/`plot_height` were renamed to `width`/`height` in
# newer Bokeh releases -- confirm against the installed version.
p = figure(
    plot_width=800, plot_height=450,
    tools="",
    x_axis_label='time', y_axis_label='polarity',
    x_axis_type='datetime'
)

# Format background colors:
# red shading below zero, green shading above zero.
low_box = bokeh.models.BoxAnnotation(top=0, fill_alpha=0.1, fill_color='red')
high_box = bokeh.models.BoxAnnotation(bottom=0, fill_alpha=0.1, fill_color='green')
p.add_layout(low_box)
p.add_layout(high_box)

# Format gridlines:
# hide the vertical grid lines and soften the horizontal ones.
p.xgrid[0].grid_line_color=None
p.ygrid[0].grid_line_alpha=0.5

# Format view range:
# fixed y range of [-1, 1].
p.y_range = bokeh.models.Range1d(-1.0, 1.0)

# Prepare data:
# one shared source so the line, the markers and the hover tool read the same columns.
cd_source = bokeh.models.ColumnDataSource({'timestamp': s_polarity.index, 'polarity': s_polarity})

p.line('timestamp', 'polarity', source=cd_source, line_width=2)
p.circle('timestamp', 'polarity', source=cd_source, fill_color="white", size=2)

# Hover tool showing the timestamp and the signed polarity value.
# NOTE(review): newer Bokeh versions expect formatter keys written with the
# leading '@' (e.g. '@timestamp') -- confirm against the installed version.
p.add_tools(bokeh.models.HoverTool(
    tooltips=[
        ('time', '@timestamp{%F %H:%M}'),
        ("polarity", "@polarity{+0.00}")
    ],
    formatters={'timestamp': 'datetime', 'polarity': 'numeral'},
    mode='vline'
))


In [None]:
show(p)