In [1]:
from datetime import datetime
import json
import re
from urllib.parse import parse_qsl

import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pymongo
import twitter

In [2]:
# Read the private credentials file and pull out the two settings the rest
# of the notebook uses: the Twitter OAuth keys and the MongoDB host string.
with open("/Users/dclark171/projects/myKeys.json", 'r') as f:
    data = json.load(f)

_keys = data['keys']
twitter_keys = _keys['twitter']
host = _keys['mongodb']['projects']['info440']['host']

In [3]:
# etfs.json is parsed into `etfs`; its keys are the ticker symbols that get
# turned into search queries further down.
with open("etfs.json", 'r') as etf_file:
    etfs = json.load(etf_file)

In [4]:
def oauth_login():
    """
    Build a Twitter API client authenticated with OAuth.

    The four credentials are read from the module-level ``twitter_keys``
    mapping under the keys ``CONSUMER_KEY``, ``CONSUMER_SECRET``,
    ``OAUTH_TOKEN`` and ``OAUTH_TOKEN_SECRET``.
    See https://developer.twitter.com/en/docs/basics/authentication/overview/oauth
    for more information on Twitter's OAuth implementation.

    Returns
    -------
        twitter_api: A ``twitter.Twitter`` object created with those credentials.
    """
    # Look the credentials up in one pass (same order as they are named here).
    consumer_key, consumer_secret, token, token_secret = (
        twitter_keys[name]
        for name in ('CONSUMER_KEY', 'CONSUMER_SECRET',
                     'OAUTH_TOKEN', 'OAUTH_TOKEN_SECRET')
    )

    # OAuth() takes the token pair first, then the consumer pair.
    credentials = twitter.oauth.OAuth(token, token_secret,
                                      consumer_key, consumer_secret)
    return twitter.Twitter(auth=credentials)

def twitter_search(twitter_api, q, max_results=300, **kw):
    """
    Run a Twitter search and page through the results.

    See https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
    and https://developer.twitter.com/en/docs/tweets/search/guides/standard-operators
    for details on advanced search criteria that may be useful as
    keyword arguments.

    Parameters
    ----------
        twitter_api : Client object exposing ``search.tweets(**params)``.
        q : The search query string.
        max_results : Upper bound on the number of statuses returned.
            Values above 1000 are capped at 1000.
        **kw : Extra parameters forwarded to the first search request.

    Returns
    -------
        statuses: A list of at most ``max_results`` statuses. Fewer are
            returned when the search runs out of pages first.
    """
    # Enforce a reasonable limit (and never a negative one, which would turn
    # the final slice into "drop items from the end").
    max_results = max(0, min(1000, max_results))

    # See https://dev.twitter.com/docs/api/1.1/get/search/tweets
    search_results = twitter_api.search.tweets(q=q, count=100, **kw)

    # Copy so that extending the list does not modify the response object.
    statuses = list(search_results['statuses'])

    # Iterate through batches of results by following the cursor until we
    # reach the desired number of results, keeping in mind that OAuth users
    # can "only" make 180 search queries per 15-minute interval. See
    # https://developer.twitter.com/en/docs/basics/rate-limits
    # for details. A reasonable number of results is ~1000, although
    # that number of results may not exist for all queries.
    for _ in range(10):  # 10*100 = 1000
        # Check before fetching so no request is spent once we have enough.
        if len(statuses) >= max_results:
            break

        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError:  # No more results when next_results doesn't exist
            break

        # next_results is a URL query string of the form
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        # parse_qsl percent-decodes the values (e.g. %24SPY -> $SPY), so the
        # query sent for later pages matches the one sent for the first page.
        # It also copes with '=' inside a value, which a plain split('=')
        # does not.
        kwargs = dict(parse_qsl(next_results.lstrip('?')))

        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']

    # The last page may overshoot the limit; trim to what was asked for.
    return statuses[:max_results]

In [5]:
def preprocess(message):
    """
    Normalise a message and split it into lemmatized tokens.

    This function takes a string as input, then performs these operations:
        - lowercase
        - remove URLs
        - remove ticker symbols ($ followed by a letter)
        - remove usernames (@ followed by word characters)
        - replace punctuation and underscores with spaces
        - tokenize by splitting the string on whitespace
        - remove any single character tokens
        - lemmatize the remaining tokens with WordNetLemmatizer

    Parameters
    ----------
        message : The text message to be preprocessed.

    Returns
    -------
        tokens: The preprocessed text as a list of tokens.
    """
    # Lowercase the message
    text = message.lower()

    # Replace URLs with a space. \S+ stops at the first whitespace, so any
    # words that follow the URL on the same line are kept.
    text = re.sub(r'https?://\S+', ' ', text)

    # Replace ticker symbols with a space. The ticker symbols are any stock symbol that starts with $.
    text = re.sub(r'[$][A-Za-z][\S]*', ' ', text)

    # Replace usernames with a space. The usernames are any word that starts with @.
    text = re.sub(r'@\w+', ' ', text)

    # Replace every run of non-word characters and underscores with a space
    # (letters and digits are kept).
    text = re.sub(r'[\W_]+', ' ', text)

    # Tokenize by splitting the string on whitespace into a list of words
    tokens = text.split()

    # Lemmatize words using the WordNetLemmatizer, skipping any token that is
    # not longer than one character.
    wnl = nltk.stem.WordNetLemmatizer()
    return [wnl.lemmatize(token) for token in tokens if len(token) > 1]

In [6]:
def create_collection(db, coll):
    """
    Return the collection of ``db`` named after a search query.

    Parameters
    ----------
        db : Object indexable by collection name (``db[name]``).
        coll : The query string, e.g. ``"$SPY"``.

    Returns
    -------
        The result of ``db[name]``, where ``name`` is ``coll`` with any
        leading ``$`` characters removed.
    """
    # Only a leading "$" is stripped; a "$" elsewhere in the name must not
    # cost the name its (unrelated) first character.
    return db[coll.lstrip("$")]

def get_polarity_score(tweets_lst):
    """
    Score each tweet's text with the module-level ``analyzer``.

    Parameters
    ----------
        tweets_lst : Sequence of mappings, each with a ``'text'`` entry.

    Returns
    -------
        polarity_arr: Float array with one ``'compound'`` score per tweet,
            in input order.
    """
    polarity_arr = np.zeros(len(tweets_lst))
    # For every tweet keep only the normalized, weighted composite score
    # from the analyzer's result.
    polarity_arr[:] = [
        analyzer.polarity_scores(tweet['text'])['compound']
        for tweet in tweets_lst
    ]
    return polarity_arr

In [7]:
# Open the MongoDB connection and select the search_results database
client = pymongo.MongoClient(host)
db = client["search_results"]

# Authenticated Twitter client, plus the sentiment analyzer that
# get_polarity_score reads from module scope
twitter_api = oauth_login()
analyzer = SentimentIntensityAnalyzer()

In [8]:
# Gather twitter response, insert into DB.
# Every key of `etfs` becomes a "$"-prefixed query (e.g. "$SPY").
# Note: this cell appends on every run; nothing here de-duplicates against
# documents stored by an earlier run.
queries = ['$' + ticker for ticker in etfs.keys()]
for q in queries:
    print(f"Searching twitter for {q}")
    # Collection is titled after the search query
    coll = create_collection(db, q)
    # Get search results
    response = twitter_search(twitter_api, q)
    # One bulk write per query instead of one round trip per document.
    # insert_many rejects an empty list, so skip queries with no results.
    if response:
        coll.insert_many(response)

Searching twitter for $SPY
Searching twitter for $QQQ
Searching twitter for $IWM
Searching twitter for $DIA
Searching twitter for $VTI
Searching twitter for $MDY
Searching twitter for $DBC
Searching twitter for $FEZ
Searching twitter for $OEF
Searching twitter for $IWF
Searching twitter for $IWD
Searching twitter for $PFF
Searching twitter for $VOO
Searching twitter for $IJH
Searching twitter for $IWO
Searching twitter for $IWN
Searching twitter for $ACWI
Searching twitter for $IEMG


In [9]:
# Load the stored SPY tweets (text and timestamp only), then bucket them by
# the calendar date they were posted.
spy_tweets = list(
    client['search_results']['SPY'].find({}, {"_id": 0, "text": 1, "created_at": 1})
)

spy_d1, spy_d2 = [], []
spy_by_day = {'04/27/20': spy_d1, '04/28/20': spy_d2}
for tweet in spy_tweets:
    # Rewrite created_at in place as an mm/dd/yy string
    posted = datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')
    tweet['created_at'] = posted.strftime("%m/%d/%y")
    # Tweets from any other date are left out of both buckets
    bucket = spy_by_day.get(tweet['created_at'])
    if bucket is not None:
        bucket.append(tweet)

In [10]:
# Score both SPY day buckets; each result is an array of compound scores
spy_scores_d1, spy_scores_d2 = (
    get_polarity_score(day_tweets) for day_tweets in (spy_d1, spy_d2)
)

In [11]:
# Report the average sentiment score for each day next to that day's SPY
# price move. The day-change percentages are literals typed into the
# strings, not values computed in this notebook.
print(f"SPY Sentiment Average April 27th: {np.average(spy_scores_d1):.2f}\nSPY Day Change April 27th: +1.44%")
print(f"\nSPY Sentiment Average April 28th: {np.average(spy_scores_d2):.2f}\nSPY Day Change April 28th: -0.47%")

SPY Sentiment Average April 27th: 0.12
SPY Day Change April 27th: +1.44%

SPY Sentiment Average April 28th: 0.08
SPY Day Change April 28th: -0.47%


In [12]:
# Load the stored DIA tweets (text and timestamp only), then bucket them by
# the calendar date they were posted.
dia_tweets = list(
    client['search_results']['DIA'].find({}, {"_id": 0, "text": 1, "created_at": 1})
)

dia_d1, dia_d2 = [], []
dia_by_day = {'04/27/20': dia_d1, '04/28/20': dia_d2}
for tweet in dia_tweets:
    # Rewrite created_at in place as an mm/dd/yy string
    posted = datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')
    tweet['created_at'] = posted.strftime("%m/%d/%y")
    # Tweets from any other date are left out of both buckets
    bucket = dia_by_day.get(tweet['created_at'])
    if bucket is not None:
        bucket.append(tweet)

In [13]:
# Score both DIA day buckets; each result is an array of compound scores
dia_scores_d1, dia_scores_d2 = (
    get_polarity_score(day_tweets) for day_tweets in (dia_d1, dia_d2)
)

In [14]:
# Report the average sentiment score for each day next to that day's DIA
# price move. The day-change percentages are literals typed into the
# strings, not values computed in this notebook.
print(f"DIA Sentiment Average April 27th: {np.average(dia_scores_d1):.2f}\nDIA Day Change April 27th: +1.47%")
print(f"\nDIA Sentiment Average April 28th: {np.average(dia_scores_d2):.2f}\nDIA Day Change April 28th: -0.12%")

DIA Sentiment Average April 27th: 0.14
DIA Day Change April 27th: +1.47%

DIA Sentiment Average April 28th: 0.11
DIA Day Change April 28th: -0.12%
