In [1]:
from datetime import datetime
import json
import os
import re
from urllib.parse import parse_qsl

import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pymongo
import twitter

In [2]:
# Location of the JSON file holding the API credentials and the MongoDB host.
# The MYKEYS_JSON environment variable overrides it so the notebook is not tied
# to one machine; when unset, the original hard-coded path is used.
keys_path = os.environ.get("MYKEYS_JSON", "/Users/dclark171/projects/myKeys.json")
with open(keys_path, 'r') as f:
    data = json.load(f)
# Twitter OAuth credentials (consumed by oauth_login)
twitter_keys = data['keys']['twitter']
# MongoDB connection string/host for the info440 project
host = data['keys']['mongodb']['projects']['info440']['host']

In [3]:
# Read the ETF definitions; the keys of this mapping are later turned into
# "$"-prefixed search queries.
with open("etfs.json") as etf_file:
    etfs = json.load(etf_file)

In [4]:
def oauth_login():
    """
    Build an authenticated Twitter API client.

    The four OAuth credentials are read from the module-level ``twitter_keys``
    mapping (keys ``CONSUMER_KEY``, ``CONSUMER_SECRET``, ``OAUTH_TOKEN`` and
    ``OAUTH_TOKEN_SECRET``). See
    https://developer.twitter.com/en/docs/basics/authentication/overview/oauth
    for more information on Twitter's OAuth implementation.

    Returns
    -------
        twitter_api : A ``twitter.Twitter`` instance using those credentials.
    """
    consumer_key, consumer_secret, token, token_secret = (
        twitter_keys[name]
        for name in ('CONSUMER_KEY', 'CONSUMER_SECRET',
                     'OAUTH_TOKEN', 'OAUTH_TOKEN_SECRET')
    )

    # OAuth takes the token pair first, then the consumer pair
    credentials = twitter.oauth.OAuth(token, token_secret,
                                      consumer_key, consumer_secret)
    return twitter.Twitter(auth=credentials)

def twitter_search(twitter_api, q, max_results=300, **kw):
    """
    Search tweets matching a query, following result pages until enough
    statuses have been collected.

    See https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
    and https://developer.twitter.com/en/docs/tweets/search/guides/standard-operators
    for details on advanced search criteria that may be useful as keyword
    arguments.

    Parameters
    ----------
        twitter_api : Object exposing ``search.tweets(**params)`` whose
            responses are dict-like with a ``'statuses'`` list and, when more
            pages exist, ``['search_metadata']['next_results']``.
        q : The search query string.
        max_results : Upper bound on the number of statuses returned. Values
            above 1000 are clamped to 1000.
        **kw : Extra keyword arguments forwarded to the first search request.

    Returns
    -------
        statuses : A list with at most ``max_results`` statuses.
    """
    # Enforce a reasonable limit (and never a negative one)
    max_results = max(0, min(1000, max_results))

    # Each request asks for a page of up to 100 statuses
    search_results = twitter_api.search.tweets(q=q, count=100, **kw)
    statuses = list(search_results['statuses'])

    # Follow the cursor for at most 10 further pages, keeping in mind that
    # OAuth users can "only" make 180 search queries per 15-minute interval.
    # See https://developer.twitter.com/en/docs/basics/rate-limits
    for _ in range(10):
        # Stop before issuing another request once the quota is already met
        if len(statuses) >= max_results:
            break

        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError:  # No more results when next_results doesn't exist
            break

        # next_results is a URL-encoded query string of the form
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        # Decode it properly so values such as "%24SPY" are sent back as
        # "$SPY" instead of being percent-encoded a second time.
        kwargs = dict(parse_qsl(next_results.lstrip('?'),
                                keep_blank_values=True))

        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']

    # The last page may overshoot the requested amount
    return statuses[:max_results]

In [5]:
def preprocess(message):
    """
    This function takes a string as input, then performs these operations:
        - lowercase
        - remove URLs
        - remove ticker symbols ($ followed by a letter)
        - remove @username mentions
        - replace punctuation and underscores with spaces (digits are kept)
        - tokenize by splitting the string on whitespace
        - drop single character tokens and lemmatize the remaining ones

    Parameters
    ----------
        message : The text message to be preprocessed.

    Returns
    -------
        tokens: The preprocessed text as a list of lemmatized tokens.
    """
    # Lowercase the message
    text = message.lower()

    # Replace each URL with a space. Match only up to the next whitespace so
    # that the words following a URL are not thrown away with it.
    text = re.sub(r'https?://\S+', ' ', text)

    # Replace ticker symbols with a space. The ticker symbols are any stock symbol that starts with $.
    text = re.sub(r'[$][A-Za-z][\S]*', ' ', text)

    # Replace usernames with a space. The usernames are any word that starts with @.
    text = re.sub(r'@\w+', ' ', text)

    # Replace every run of non-word characters and underscores with a space
    text = re.sub(r'[\W_]+', ' ', text)

    # Tokenize by splitting the string on whitespace into a list of words
    tokens = text.split()

    # Lemmatize words using the WordNetLemmatizer, ignoring any word that is
    # not longer than one character.
    wnl = nltk.stem.WordNetLemmatizer()
    return [wnl.lemmatize(token) for token in tokens if len(token) > 1]

In [6]:
def create_collection(db, coll):
    """
    Return the collection of ``db`` whose name is ``coll`` without "$" signs.

    Every "$" is removed from the name (e.g. "$SPY" -> "SPY") because
    MongoDB does not allow "$" in collection names. The previous
    implementation dropped the first character whenever a "$" appeared
    anywhere in the name.

    Parameters
    ----------
        db : Object supporting ``db[name]`` lookup (e.g. a pymongo database).
        coll : Requested collection name, possibly containing "$".

    Returns
    -------
        The result of ``db[name]`` for the cleaned name.
    """
    return db[coll.replace("$", "")]

def get_polarity_score(tweets_lst, sentiment_analyzer=None):
    """
    Compute the compound polarity score of each tweet.

    Parameters
    ----------
        tweets_lst : Sequence of mappings, each with a ``'text'`` entry.
        sentiment_analyzer : Optional object exposing ``polarity_scores(text)``
            that returns a mapping with a ``'compound'`` entry. When omitted,
            the module-level ``analyzer`` is used, so that name must already
            be defined at call time.

    Returns
    -------
        polarity_arr : 1-D float array with one compound score per tweet
            (empty when ``tweets_lst`` is empty).
    """
    if sentiment_analyzer is None:
        # Fall back to the analyzer created in the notebook's setup cell
        sentiment_analyzer = analyzer

    polarity_arr = np.zeros(len(tweets_lst))
    for i, tweet in enumerate(tweets_lst):
        # Measure the polarity of the text portion of the tweet
        polarity = sentiment_analyzer.polarity_scores(tweet['text'])
        # Store the normalized, weighted composite score
        polarity_arr[i] = polarity['compound']
    return polarity_arr

In [7]:
# Open the MongoDB connection and select the search_results database
client = pymongo.MongoClient(host)
db = client['search_results']
# Connect to Twitter
twitter_api = oauth_login()
# Sentiment scorer used by get_polarity_score
analyzer = SentimentIntensityAnalyzer()

In [8]:
# Run one Twitter search per "$"-prefixed key of etfs and store every result
queries = ['$' + symbol for symbol in etfs.keys()]
for q in queries:
    # Collection that will hold this query's results
    coll = create_collection(db, q)
    # Fetch the matching statuses
    response = twitter_search(twitter_api, q)
    for doc in response:
        # Insert the status and echo the id assigned to it
        inserted = coll.insert_one(doc)
        print(inserted.inserted_id)

5ea8c86b6153ced90536e606
5ea8c86b6153ced90536e607
5ea8c86b6153ced90536e608
5ea8c86b6153ced90536e609
5ea8c86b6153ced90536e60a
5ea8c86b6153ced90536e60b
5ea8c86b6153ced90536e60c
5ea8c86b6153ced90536e60d
5ea8c86b6153ced90536e60e
5ea8c86b6153ced90536e60f
5ea8c86b6153ced90536e610
5ea8c86b6153ced90536e611
5ea8c86b6153ced90536e612
5ea8c86b6153ced90536e613
5ea8c86b6153ced90536e614
5ea8c86b6153ced90536e615
5ea8c86b6153ced90536e616
5ea8c86b6153ced90536e617
5ea8c86b6153ced90536e618
5ea8c86b6153ced90536e619
5ea8c86b6153ced90536e61a
5ea8c86b6153ced90536e61b
5ea8c86b6153ced90536e61c
5ea8c86b6153ced90536e61d
5ea8c86b6153ced90536e61e
5ea8c86b6153ced90536e61f
5ea8c86b6153ced90536e620
5ea8c86b6153ced90536e621
5ea8c86b6153ced90536e622
5ea8c86b6153ced90536e623
5ea8c86b6153ced90536e624
5ea8c86b6153ced90536e625
5ea8c86b6153ced90536e626
5ea8c86c6153ced90536e627
5ea8c86c6153ced90536e628
5ea8c86c6153ced90536e629
5ea8c86c6153ced90536e62a
5ea8c86c6153ced90536e62b
5ea8c86c6153ced90536e62c
5ea8c86c6153ced90536e62d


5ea8c8746153ced90536e74f
5ea8c8746153ced90536e750
5ea8c8746153ced90536e751
5ea8c8746153ced90536e752
5ea8c8746153ced90536e753
5ea8c8746153ced90536e754
5ea8c8746153ced90536e755
5ea8c8746153ced90536e756
5ea8c8746153ced90536e757
5ea8c8746153ced90536e758
5ea8c8746153ced90536e759
5ea8c8746153ced90536e75a
5ea8c8746153ced90536e75b
5ea8c8746153ced90536e75c
5ea8c8746153ced90536e75d
5ea8c8746153ced90536e75e
5ea8c8746153ced90536e75f
5ea8c8746153ced90536e760
5ea8c8746153ced90536e761
5ea8c8746153ced90536e762
5ea8c8746153ced90536e763
5ea8c8746153ced90536e764
5ea8c8746153ced90536e765
5ea8c8746153ced90536e766
5ea8c8746153ced90536e767
5ea8c8746153ced90536e768
5ea8c8746153ced90536e769
5ea8c8746153ced90536e76a
5ea8c8746153ced90536e76b
5ea8c8746153ced90536e76c
5ea8c8746153ced90536e76d
5ea8c8746153ced90536e76e
5ea8c8746153ced90536e76f
5ea8c8746153ced90536e770
5ea8c8746153ced90536e771
5ea8c8746153ced90536e772
5ea8c8746153ced90536e773
5ea8c8746153ced90536e774
5ea8c8746153ced90536e775
5ea8c8746153ced90536e776


5ea8c87e6153ced90536e89e
5ea8c87e6153ced90536e89f
5ea8c87e6153ced90536e8a0
5ea8c87e6153ced90536e8a1
5ea8c87e6153ced90536e8a2
5ea8c87e6153ced90536e8a3
5ea8c87e6153ced90536e8a4
5ea8c87e6153ced90536e8a5
5ea8c87e6153ced90536e8a6
5ea8c87e6153ced90536e8a7
5ea8c87e6153ced90536e8a8
5ea8c87e6153ced90536e8a9
5ea8c87e6153ced90536e8aa
5ea8c87e6153ced90536e8ab
5ea8c87e6153ced90536e8ac
5ea8c87e6153ced90536e8ad
5ea8c87e6153ced90536e8ae
5ea8c87e6153ced90536e8af
5ea8c87e6153ced90536e8b0
5ea8c87e6153ced90536e8b1
5ea8c87e6153ced90536e8b2
5ea8c87e6153ced90536e8b3
5ea8c87e6153ced90536e8b4
5ea8c87e6153ced90536e8b5
5ea8c87e6153ced90536e8b6
5ea8c87e6153ced90536e8b7
5ea8c87e6153ced90536e8b8
5ea8c87e6153ced90536e8b9
5ea8c87e6153ced90536e8ba
5ea8c87e6153ced90536e8bb
5ea8c87e6153ced90536e8bc
5ea8c87e6153ced90536e8bd
5ea8c87e6153ced90536e8be
5ea8c87e6153ced90536e8bf
5ea8c87f6153ced90536e8c0
5ea8c87f6153ced90536e8c1
5ea8c87f6153ced90536e8c2
5ea8c87f6153ced90536e8c3
5ea8c87f6153ced90536e8c4
5ea8c87f6153ced90536e8c5


5ea8c8876153ced90536e9ed
5ea8c8876153ced90536e9ee
5ea8c8876153ced90536e9ef
5ea8c8876153ced90536e9f0
5ea8c8876153ced90536e9f1
5ea8c8876153ced90536e9f2
5ea8c8886153ced90536e9f3
5ea8c8886153ced90536e9f4
5ea8c8886153ced90536e9f5
5ea8c8886153ced90536e9f6
5ea8c8886153ced90536e9f7
5ea8c8886153ced90536e9f8
5ea8c8886153ced90536e9f9
5ea8c8886153ced90536e9fa
5ea8c8886153ced90536e9fb
5ea8c8886153ced90536e9fc
5ea8c8886153ced90536e9fd
5ea8c8886153ced90536e9fe
5ea8c8886153ced90536e9ff
5ea8c8886153ced90536ea00
5ea8c8886153ced90536ea01
5ea8c8886153ced90536ea02
5ea8c8886153ced90536ea03
5ea8c8886153ced90536ea04
5ea8c8886153ced90536ea05
5ea8c8886153ced90536ea06
5ea8c8886153ced90536ea07
5ea8c8886153ced90536ea08
5ea8c8886153ced90536ea09
5ea8c8886153ced90536ea0a
5ea8c8886153ced90536ea0b
5ea8c8886153ced90536ea0c
5ea8c8886153ced90536ea0d
5ea8c8886153ced90536ea0e
5ea8c8886153ced90536ea0f
5ea8c8886153ced90536ea10
5ea8c8886153ced90536ea11
5ea8c8886153ced90536ea12
5ea8c8886153ced90536ea13
5ea8c8886153ced90536ea14


5ea8c8926153ced90536eb36
5ea8c8926153ced90536eb37
5ea8c8926153ced90536eb38
5ea8c8926153ced90536eb39
5ea8c8926153ced90536eb3a
5ea8c8926153ced90536eb3b
5ea8c8936153ced90536eb3c
5ea8c8936153ced90536eb3d
5ea8c8936153ced90536eb3e
5ea8c8936153ced90536eb3f
5ea8c8936153ced90536eb40
5ea8c8936153ced90536eb41
5ea8c8936153ced90536eb42
5ea8c8936153ced90536eb43
5ea8c8936153ced90536eb44
5ea8c8936153ced90536eb45
5ea8c8936153ced90536eb46
5ea8c8936153ced90536eb47
5ea8c8936153ced90536eb48
5ea8c8936153ced90536eb49
5ea8c8936153ced90536eb4a
5ea8c8936153ced90536eb4b
5ea8c8936153ced90536eb4c
5ea8c8936153ced90536eb4d
5ea8c8936153ced90536eb4e
5ea8c8936153ced90536eb4f
5ea8c8936153ced90536eb50
5ea8c8936153ced90536eb51
5ea8c8936153ced90536eb52
5ea8c8936153ced90536eb53
5ea8c8936153ced90536eb54
5ea8c8946153ced90536eb55
5ea8c8946153ced90536eb56
5ea8c8946153ced90536eb57
5ea8c8946153ced90536eb58
5ea8c8946153ced90536eb59
5ea8c8946153ced90536eb5a
5ea8c8946153ced90536eb5b
5ea8c8946153ced90536eb5c
5ea8c8946153ced90536eb5d


In [9]:
# Load the text and timestamp of every stored SPY tweet, then split by posting date
spy_tweets = list(client['search_results']['SPY'].find({}, {"_id": 0, "text": 1, "created_at": 1}))

spy_d1, spy_d2 = [], []
for tweet in spy_tweets:
    # Collapse the full timestamp to an mm/dd/yy string, stored back on the tweet
    posted_at = datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')
    tweet['created_at'] = posted_at.strftime("%m/%d/%y")
    # d1 collects the 04/27/20 tweets, d2 the 04/28/20 ones; other days are skipped
    if tweet['created_at'] == '04/27/20':
        spy_d1.append(tweet)
    elif tweet['created_at'] == '04/28/20':
        spy_d2.append(tweet)

In [10]:
# Compound polarity score per tweet, one array for each of the two days
spy_scores_d1, spy_scores_d2 = (
    get_polarity_score(day_tweets) for day_tweets in (spy_d1, spy_d2)
)

In [11]:
# Show the mean tweet sentiment of each day next to that day's (hard-coded) price change
print(
    f"SPY Sentiment Average April 27th: {np.average(spy_scores_d1):.2f}\nSPY Day Change April 27th: +1.44%",
    f"\nSPY Sentiment Average April 28th: {np.average(spy_scores_d2):.2f}\nSPY Day Change April 28th: -0.47%",
    sep="\n",
)

SPY Sentiment Average April 27th: 0.12
SPY Day Change April 27th: +1.44%

SPY Sentiment Average April 28th: 0.08
SPY Day Change April 28th: -0.47%


In [24]:
# Load the text and timestamp of every stored DIA tweet, then split by posting date
dia_tweets = list(client['search_results']['DIA'].find({}, {"_id": 0, "text": 1, "created_at": 1}))

dia_d1, dia_d2 = [], []
for tweet in dia_tweets:
    # Collapse the full timestamp to an mm/dd/yy string, stored back on the tweet
    posted_at = datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')
    tweet['created_at'] = posted_at.strftime("%m/%d/%y")
    # d1 collects the 04/27/20 tweets, d2 the 04/28/20 ones; other days are skipped
    if tweet['created_at'] == '04/27/20':
        dia_d1.append(tweet)
    elif tweet['created_at'] == '04/28/20':
        dia_d2.append(tweet)

In [25]:
# Compound polarity score per tweet, one array for each of the two days
dia_scores_d1, dia_scores_d2 = (
    get_polarity_score(day_tweets) for day_tweets in (dia_d1, dia_d2)
)

In [26]:
# Show the mean tweet sentiment of each day next to that day's (hard-coded) price change
print(
    f"DIA Sentiment Average April 27th: {np.average(dia_scores_d1):.2f}\nDIA Day Change April 27th: +1.47%",
    f"\nDIA Sentiment Average April 28th: {np.average(dia_scores_d2):.2f}\nDIA Day Change April 28th: -0.12%",
    sep="\n",
)

DIA Sentiment Average April 27th: 0.14
DIA Day Change April 27th: +1.47%

DIA Sentiment Average April 28th: 0.12
DIA Day Change April 28th: -0.12%
