In [1]:
import tweepy as tw
import json
import os
import requests
import sys
import warnings
import time
import random
import logging
warnings.filterwarnings("ignore")

In [2]:
# Data Science modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
plt.style.use('ggplot')

In [3]:
# returns python object representation of JSON in response
def get_response(symbol, older_than, retries=5, timeout=10.0):
    """Fetch one page of StockTwits messages for ``symbol``.

    Parameters
    ----------
    symbol : str
        Ticker symbol inserted into the stream URL.
    older_than : int
        Message id bound; the request is sent with ``max=older_than - 1``.
    retries : int
        Maximum number of HTTP attempts before giving up.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the notebook indefinitely.

    Returns
    -------
    The decoded JSON body on HTTP 200, otherwise ``None`` (HTTP 429, or
    every attempt failed).
    """
    url = 'https://api.stocktwits.com/api/2/streams/symbol/%s.json?max=%d' % (symbol, older_than-1)
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors / timeouts are handled like any other failed
            # attempt instead of aborting the caller's whole download loop.
            print('Request failed: %s' % exc)
            response = None

        if response is not None:
            if response.status_code == 200:
                return json.loads(response.content)
            if response.status_code == 429:
                # Rate limited: retrying immediately will not help, so show
                # the server's message and give up.
                print(response.content)
                return None

        # Pause between attempts, but not after the final one.
        if attempt < retries - 1:
            time.sleep(1.0)
    # couldn't get response
    return None

In [7]:
# extends the current dataset for a given symbol with more twits
def get_older_tweets(symbol, num_queries):
    """Download older messages for ``symbol`` and append them to its JSON file.

    The file ``Data/StockTwitsData/<symbol>.json`` is read if it exists, up to
    ``num_queries`` further pages are requested through ``get_response`` (each
    one older than the last stored message id), and the combined list is
    written back as ``{"data": [...]}``.

    Parameters
    ----------
    symbol : str
        Ticker symbol; used for both the API query and the file name.
    num_queries : int
        Maximum number of API pages to request.
    """
    path = 'Data/StockTwitsData/%s.json' % symbol
    newest_bound = 1000000000000  # any huge number: "older than everything"

    data = []
    if os.path.exists(path):
        # extending an existing json file
        with open(path, 'r') as f:
            stored = json.load(f)
        # This function saves the list wrapped as {"data": [...]}, so unwrap
        # it on the way back in; a bare list is accepted as well.
        if isinstance(stored, dict):
            stored = stored.get('data', [])
        data = list(stored)

    older_than = data[-1]['id'] if data else newest_bound

    for i in range(num_queries):
        content = get_response(symbol, older_than)
        if content is None:
            print('Error, an API query timed out')
            break
        messages = content.get('messages') or []
        if not messages:
            # An empty page means there is nothing older to fetch; asking
            # again would only repeat the same query.
            print('No older messages available, stopping')
            break
        data.extend(messages)
        older_than = data[-1]['id']
        sys.stdout.write('\rSuccessfully made query %d' % (i+1))
        sys.stdout.flush()
        # sleep to make sure we don't get throttled
        time.sleep(0.5)

    full_data = {"data": data}

    # write the new data to the JSON file (creating the folder on first use)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        json.dump(full_data, f)
    print()  # terminate the '\r' progress line
    print('...Done')

In [9]:
symbols = ['SPY', 'XLC', 'XLY', 'XLP', 'XLE', 'XLF','XLV', 'XLI','XLB',
           'XLRE', 'XLK','XLU']
tweets_per_symbol = 10000
# Divisor used to turn "tweets still needed" into a number of API queries.
# NOTE(review): presumably the number of messages one query returns - confirm.
tweets_per_query = 30

for symbol in symbols:
    # Must be the same file get_older_tweets writes to, otherwise previously
    # downloaded tweets are never counted and everything is fetched again.
    path = 'Data/StockTwitsData/%s.json' % symbol
    if os.path.exists(path):
        with open(path, 'r') as f:
            stored = json.load(f)
        # The saved file wraps the list as {"data": [...]}; count the list,
        # not the wrapper dict.
        if isinstance(stored, dict):
            stored = stored.get('data', [])
        num_tweets = len(stored)
    else:
        num_tweets = 0

    # Ceiling division on the remaining tweets; zero once the target is met.
    remaining = tweets_per_symbol - num_tweets
    num_queries = -(-remaining // tweets_per_query) if remaining > 0 else 0

    if num_queries > 0:
        print('Getting tweets for symbol %s' % symbol)
        get_older_tweets(symbol, num_queries)

Getting tweets for symbol SPY
Successfully made query 334...Done
Getting tweets for symbol XLC
Successfully made query 334...Done
Getting tweets for symbol XLY
Successfully made query 334...Done
Getting tweets for symbol XLP
Successfully made query 334...Done
Getting tweets for symbol XLE
Successfully made query 334...Done
Getting tweets for symbol XLF
Successfully made query 334...Done
Getting tweets for symbol XLV
Successfully made query 334...Done
Getting tweets for symbol XLI
Successfully made query 334...Done
Getting tweets for symbol XLB
Successfully made query 334...Done
Getting tweets for symbol XLRE
Successfully made query 334...Done
Getting tweets for symbol XLK
Successfully made query 334...Done
Getting tweets for symbol XLU
Successfully made query 334...Done
