In [5]:
import os

import pandas as pd
import requests
import sklearn
from sklearn.metrics import cohen_kappa_score

import data_reader

In [8]:
# Destinations of the raw scrape results: DATA_3_FILEPATH is written by the
# Data 3 scraping cell, DATA_4_FILEPATH by the cell that merges the per-symbol files.
DATA_DIR = "./data"
DATA_3_FILEPATH = DATA_DIR + "/data3/data3.csv"
DATA_4_FILEPATH = DATA_DIR + "/data4/stocktwits_data_ALL.csv"

## Data 3
~1200 tweets

In [None]:
# Symbols whose StockTwits streams are scraped.
LIST_OF_SYMBOLS = [
    'SPY', 'DJIA', 'QQQ', 'VIX',
    'BTC.X', 'ETH.X', 'XRP.X', 'ADA.X', 'SOL.X',
]

# Both request templates share one endpoint; only the query string differs.
# URL_START fetches the first page, URL fetches a page bounded by a message id (`max`).
_STREAM_ENDPOINT = "https://api.stocktwits.com/api/2/streams/symbol/{}.json"
URL_START = _STREAM_ENDPOINT + "?filter=top"
URL = _STREAM_ENDPOINT + "?max={}&filter=top"
    
def scrape_data(symbol, target_count):
    """Scrape sentiment-tagged StockTwits messages for one symbol.

    Requests the first page via ``URL_START`` and then pages backwards via
    ``URL`` (using the id of the last message seen as ``max``) until at least
    ``target_count`` sentiment-tagged messages are collected, the API returns
    no more messages, the cursor stops advancing, or a request/parse error occurs.

    Args:
        symbol: Ticker inserted into the stream URL and stored in the
            ``Symbol`` column.
        target_count: Minimum number of sentiment-tagged messages to collect.
            At least one page is always requested.

    Returns:
        A DataFrame with columns Sentiment, User_id, Message, Date, Time,
        Symbol. It may hold more than ``target_count`` rows (whole pages are
        kept) or fewer (scraping stopped early; the reason is printed). On a
        failure of the very first request an empty frame is returned instead
        of raising, so one bad symbol does not abort a multi-symbol run.
    """
    columns = ['Sentiment', "User_id", "Message", "Date", "Time", "Symbol"]
    all_messages = []  # list of dicts, one per kept message
    last_id = None     # pagination cursor; None until the first page is processed

    while last_id is None or len(all_messages) < target_count:
        if last_id is None:
            url = URL_START.format(symbol)
        else:
            url = URL.format(symbol, last_id)

        try:
            # A timeout keeps a stalled connection from hanging the notebook forever.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            messages = response.json()['messages']
            if not messages:
                break  # no older messages available
            new_last_id = process_messages(symbol, messages, all_messages)
        except (requests.RequestException, ValueError, KeyError, TypeError, IndexError) as err:
            # Network/HTTP errors, invalid JSON, or an unexpected payload shape:
            # keep what was collected so far, but say why we stopped.
            print("{} - Stopping early: {!r}".format(symbol, err))
            break

        print("{} - Scraped count: {}".format(symbol, len(all_messages)))

        # If the cursor did not move, the next request would return the same
        # page again and the loop would never terminate.
        if new_last_id == last_id:
            break
        last_id = new_last_id

    # 'User_id' holds the message id (see process_messages), so dropping
    # duplicates on it removes messages that appear on two overlapping pages.
    # NOTE(review): `max` appears to be inclusive in the StockTwits API - confirm.
    return pd.DataFrame(all_messages, columns=columns).drop_duplicates(subset="User_id")
    
def process_messages(symbol, messages, all_messages):
    """Append the sentiment-tagged messages of one API page to ``all_messages``.

    Args:
        symbol: Ticker stored in each record's ``Symbol`` field.
        messages: Non-empty list of message dicts from the API response. Each
            kept message must provide ``id``, ``body`` and ``created_at``.
        all_messages: List that is extended in place with one dict per kept
            message (keys: Sentiment, User_id, Message, Date, Time, Symbol).

    Returns:
        The ``id`` of the last message on the page, as a string, for use as
        the pagination cursor.

    Raises:
        IndexError: If ``messages`` is empty.
    """
    for message in messages:
        # Only keep messages with a sentiment label. A null `sentiment` raises
        # TypeError; a missing `entities`/`sentiment`/`basic` key raises
        # KeyError. Both simply mean "no label", so the message is skipped
        # rather than aborting the whole page.
        try:
            sentiment = message['entities']['sentiment']['basic']
        except (TypeError, KeyError):
            continue

        # `created_at` is split once on 'T' into its date and time parts.
        created_parts = message['created_at'].split('T')

        all_messages.append({
            'Sentiment': sentiment,
            # NOTE(review): this stores the *message* id under 'User_id';
            # confirm whether the author's id was intended instead.
            'User_id': message['id'],
            'Message': message['body'],
            'Date': created_parts[0],
            'Time': created_parts[1],
            'Symbol': symbol,
        })

    return str(messages[-1]['id'])


# Scrape at least 200 sentiment-tagged messages per symbol and store them in one CSV.

# Make sure the destination folder exists before spending time on the scrape.
os.makedirs(os.path.dirname(DATA_3_FILEPATH), exist_ok=True)

# Collect the per-symbol frames first and concatenate once: growing a frame
# with concat inside the loop re-copies all rows each iteration, and
# concatenating onto an empty frame is deprecated in recent pandas.
symbol_frames = [scrape_data(symbol, 200) for symbol in LIST_OF_SYMBOLS]
stocktwit_df = pd.concat(symbol_frames)

stocktwit_df.to_csv(DATA_3_FILEPATH, index = False)

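After manual annotation, measure the agreement (Cohen's kappa) between the StockTwits sentiment labels and the `Annotator1` labels.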

In [4]:
DATA_3_FILEPATH_CLEANED = "./data/data3/data3_final.csv"

data3 = pd.read_csv(DATA_3_FILEPATH_CLEANED)

# Encode the scraped label as 1 for "Bullish" and 0 for anything else, then
# compare it with the first annotator's labels.
# NOTE(review): presumably `Annotator1` uses the same 1/0 coding - confirm.
stocktwits_labels = (data3["Sentiment"] == "Bullish").astype(int)
cohen_kappa_score(stocktwits_labels, data3["Annotator1"])

0.8554340112983937

## Data 4

In [None]:
# Symbols whose StockTwits streams are scraped.
LIST_OF_SYMBOLS = [
    'SPY', 'DJIA', 'QQQ', 'VIX',
    'BTC.X', 'ETH.X', 'XRP.X', 'ADA.X', 'SOL.X',
]

# Both request templates share one endpoint; only the query string differs.
# URL_START fetches the first page, URL fetches a page bounded by a message id (`max`).
_STREAM_ENDPOINT = "https://api.stocktwits.com/api/2/streams/symbol/{}.json"
URL_START = _STREAM_ENDPOINT + "?filter=top"
URL = _STREAM_ENDPOINT + "?max={}&filter=top"

def save_to_csv(df, symbol):
    """Write one symbol's scraped messages to ./scrape_data/stocktwits_data_<symbol>.csv.

    Args:
        df: DataFrame to write; its index is not written.
        symbol: Ticker used to build the file name.
    """
    filename_csv = "./scrape_data/stocktwits_data_"+symbol+".csv"
    # Create the output folder on demand: a missing directory would otherwise
    # make to_csv fail and discard the result of a long scrape.
    os.makedirs(os.path.dirname(filename_csv), exist_ok=True)
    df.to_csv(filename_csv, index = False)
    print("File saved as: ", filename_csv)
    
def scrape_data(symbol, target_count):
    """Scrape sentiment-tagged StockTwits messages for one symbol and save them.

    Requests the first page via ``URL_START`` and then pages backwards via
    ``URL`` (using the id of the last message seen as ``max``) until at least
    ``target_count`` sentiment-tagged messages are collected, the API returns
    no more messages, the cursor stops advancing, or a request/parse error
    occurs. Whatever was collected is then handed to ``save_to_csv``.

    Args:
        symbol: Ticker inserted into the stream URL, stored in the ``Symbol``
            column and passed to ``save_to_csv``.
        target_count: Minimum number of sentiment-tagged messages to collect.
            At least one page is always requested.

    Returns:
        None. The result is written through ``save_to_csv`` even when scraping
        stopped early (the reason is printed) or the first request failed.
    """
    columns = ['Sentiment', "User_id", "Message", "Date", "Time", "Symbol"]
    all_messages = []  # list of dicts, one per kept message
    last_id = None     # pagination cursor; None until the first page is processed
    prev_high = 0      # highest multiple of 1000 already reported

    while last_id is None or len(all_messages) < target_count:
        if last_id is None:
            url = URL_START.format(symbol)
        else:
            url = URL.format(symbol, last_id)

        try:
            # A timeout keeps a stalled connection from hanging the notebook forever.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            messages = response.json()['messages']
            if not messages:
                break  # no older messages available
            new_last_id = process_messages(symbol, messages, all_messages)
        except (requests.RequestException, ValueError, KeyError, TypeError, IndexError) as err:
            # Network/HTTP errors, invalid JSON, or an unexpected payload shape:
            # keep what was collected so far, but say why we stopped.
            print("{} - Stopping early: {!r}".format(symbol, err))
            break

        # Report after the first page, then only each time another 1000 messages were added.
        if last_id is None or len(all_messages)//1000 > prev_high:
            print("{} - Scraped count: {}".format(symbol, len(all_messages)))
            prev_high = len(all_messages)//1000

        # If the cursor did not move, the next request would return the same
        # page again and the loop would never terminate.
        if new_last_id == last_id:
            break
        last_id = new_last_id

    print("{} - Scraped count: {}".format(symbol, len(all_messages)))
    # 'User_id' holds the message id (see process_messages), so dropping
    # duplicates on it removes messages that appear on two overlapping pages.
    # NOTE(review): `max` appears to be inclusive in the StockTwits API - confirm.
    stocktwit_df = pd.DataFrame(all_messages, columns=columns).drop_duplicates(subset="User_id")
    save_to_csv(stocktwit_df, symbol)
    
def process_messages(symbol, messages, all_messages):
    """Append the sentiment-tagged messages of one API page to ``all_messages``.

    Args:
        symbol: Ticker stored in each record's ``Symbol`` field.
        messages: Non-empty list of message dicts from the API response. Each
            kept message must provide ``id``, ``body`` and ``created_at``.
        all_messages: List that is extended in place with one dict per kept
            message (keys: Sentiment, User_id, Message, Date, Time, Symbol).

    Returns:
        The ``id`` of the last message on the page, as a string, for use as
        the pagination cursor.

    Raises:
        IndexError: If ``messages`` is empty.
    """
    for message in messages:
        # Only keep messages with a sentiment label. A null `sentiment` raises
        # TypeError; a missing `entities`/`sentiment`/`basic` key raises
        # KeyError. Both simply mean "no label", so the message is skipped
        # rather than aborting the whole page.
        try:
            sentiment = message['entities']['sentiment']['basic']
        except (TypeError, KeyError):
            continue

        # `created_at` is split once on 'T' into its date and time parts.
        created_parts = message['created_at'].split('T')

        all_messages.append({
            'Sentiment': sentiment,
            # NOTE(review): this stores the *message* id under 'User_id';
            # confirm whether the author's id was intended instead.
            'User_id': message['id'],
            'Message': message['body'],
            'Date': created_parts[0],
            'Time': created_parts[1],
            'Symbol': symbol,
        })

    return str(messages[-1]['id'])


# Start from an empty frame with the scrape schema.
schema = ['Sentiment', "User_id", "Message", "Date", "Time", "Symbol"]
stocktwit_df = pd.DataFrame(columns=schema)

# Scrape every symbol; scrape_data is called with a target of 110000
# sentiment-tagged messages per symbol.
target_per_symbol = 110000
for symbol in LIST_OF_SYMBOLS:
    scrape_data(symbol, target_per_symbol)

In [None]:
# Merge the per-symbol scrape files into one de-duplicated dataset.

# Read every file first and concatenate once: growing the frame with concat
# inside the loop re-copies all accumulated rows on each iteration.
symbol_frames = [
    pd.read_csv("./scrape_data/stocktwits_data_"+symbol+".csv")
    for symbol in LIST_OF_SYMBOLS
]

# Keep only the first occurrence of each message text (files are merged in
# LIST_OF_SYMBOLS order, so the earliest symbol in that list wins).
stocktwit_df = pd.concat(symbol_frames).drop_duplicates(subset='Message')

# Make sure the destination folder exists before writing the merged file.
os.makedirs(os.path.dirname(DATA_4_FILEPATH), exist_ok=True)
stocktwit_df.to_csv(DATA_4_FILEPATH, index = False)

Clean the data and save the result to CSV; the file is huge, so the cleaned version is stored rather than recomputed each time.

In [None]:
DATA_4_FILEPATH_CLEANED = "./data/data4/stocktwits_data_ALL_cleaned.csv"

# Load the merged scrape and run it through data_reader.preprocess_data,
# passing "Message" (presumably the text column to clean - see data_reader),
# then persist the result.
data4 = data_reader.preprocess_data(pd.read_csv(DATA_4_FILEPATH), "Message")
data4.to_csv(DATA_4_FILEPATH_CLEANED, index = False)