In [2]:
# This notebook processes the specified raw tweet datasets for a specific set of tickers.
# The draft of the preparation code was taken from this article: https://arxiv.org/abs/2103.16388

# Install, Import statements

In [3]:
# NEW 2022-11
!python -V

Python 3.8.16


In [4]:
# Note: GPU is not required for this notebook
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [5]:
# Third-party packages used further down: contractions (contractions.fix), emoji (emoji.demojize),
# ekphrasis (Segmenter) and yfinance (yfinance.download).
# NOTE(review): no versions are pinned, so a fresh run may install newer releases than the ones
# shown in the captured output below — consider pinning.
!pip install contractions
!pip install emoji
!pip install ekphrasis
!pip install yfinance --upgrade --no-cache-dir

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (110 kB)
[K     |████████████████████████████████| 110 kB 5.3 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 39.6 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.73 pyahocorasick-1.4.4 textsearch-0.0.24
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[K     |████████████████████████████████| 240 kB 4.9

In [6]:
from collections import defaultdict
import contractions
from ekphrasis.classes.segmenter import Segmenter
import emoji
from datetime import datetime, timedelta, date, timezone
import itertools
import json
import numpy as np
import os
import pandas as pd
#from pandas_datareader import data as pdr
from pprint import pprint
import random
import re
import requests 
import string
import sys
#import tensorflow as tf  # NEW 2022-11: commented
#import tensorflow_hub as hub  # NEW 2022-11: commented
#from tensorflow import keras # NEW 2022-11: commented
import torch
from tqdm.notebook import tqdm  # NEW 2022-11
import yfinance

# NLTK section
import nltk
nltk.download('punkt')
nltk.download('stopwords')
# from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer 
from nltk.corpus import stopwords as sw
from nltk.tokenize import word_tokenize 
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Mounts

In [7]:
# Required for mounting: attaches Google Drive under /content/drive (Colab-only API)
import google.colab
google.colab.drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Fail fast if the Drive mount did not expose the expected folder
assert os.path.isdir('/content/drive/MyDrive')

In [9]:
# Required for getting files by id: builds the module-level PyDrive client `drive`
# that get_compData() uses to download the raw CSV files.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
# Authenticate the PyDrive client.
auth.authenticate_user()  # interactive Colab sign-in
gauth = GoogleAuth()
# Reuse the application-default credentials obtained by the sign-in above
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Defs

In [10]:
#Function to load files of different companies
# Google Drive file id of the raw StockTwits CSV for every supported ticker.
# Each file is shared as https://drive.google.com/file/d/<id>/view?usp=sharing
STOCKTWITS_FILE_IDS = {
  'AAPL': '1diLooyj8DtyyxwiihpbTJdfY4D98irLE',
  'ADBE': '1SMyRg8aUnnQTbukVadBo814qdS6xDnP1',
  'AMZN': '1ffmFOrlcaTCCByKMq7E1LCXXK6daMkB_',
  'BAC': '1HeaKJtlSz2xiLT3sx5FLdYmzCkVDGIjW',
  'BRK.A': '1HeQhAU20YyT1tRCD-yN83vrUGY7jDDSt',
  'BRK.B': '1FRrwrIVJVyFqg025SWasKW84qOJMhe84',
  'DIA': '1riZ8IdkupLre9NQ7McBO21wDFVX_NPvg',
  'DIS': '1y5IT3gA_yIJnFLTsxy1GHdIY4SHwj84Y',
  'FB': '1x1FugJhUQx9xKWS9LYnio1MU8oi5iuTc',
  'GOOG': '1UMnPXfG_XkgLJAQ2VtLvMPBPZhN68Ezw',
  'GOOGL': '1NbCpQX0sTgXsqcW7xMbkrDPlAOscIoTL',
  'HD': '10wbCzJMcjLWAqrlEtHlWieIrq7dyolG2',
  'INTC': '1k1-NSl8qLTa2oDs1G8CFC4CJzkv9mugw',
  'JNJ': '1Qiwu9vbDYU527szR8waaFDoFCyY4i_bc',
  'NFLX': '1DdJ8MPdgt9bxF3ZagkWUp45N4RMQ8Al6',
  'PG': '1tlueLaJhlduNMgRHk8Omkog5nNI8I8Yk',
  'QQQ': '1gUsl5L4VBgsL9oqBxaUdk6tA9r4VxDjo',
  'SPY': '10s-zYQPIqlkNsUahgzDRqJGw0VAkn-R1',
  'T': '1rk3PsikhgrA7MxV28tUbzj7EOn9K5Ixu',
  'TSLA': '1on57uk2gd_CLsnj1dcRfuB_KsYydzEp2',
  'UNH': '1zguMHb3pL2tCT4TYV8XJcP-dWTcq28mY',
  'V': '1qLI1Rsyf2ebZu53I8QaFuOQSeTyZShLg',
  'VIX': '1SoIue0nfsn_GGMOroFg3tEp8re-v7PnJ',
  'VZ': '1ddISbB0qfDpM69senqEmmf6xbNWOpNUJ',
  'WMT': '14Zdh1ZCj5RxZltXknG3Qisxhwzge5rkV',
}


def get_compData(comp):
  """Download the raw StockTwits CSV of one ticker from Google Drive and load it.

  The file is fetched through the module-level PyDrive client ``drive`` and
  saved in the current working directory as ``stocktwits_<TICKER>.csv``
  (dots in the ticker are replaced by underscores, e.g. ``BRK.A`` ->
  ``stocktwits_BRK_A.csv``), then read with ``pd.read_csv``.

  Args:
    comp: ticker symbol; must be a key of ``STOCKTWITS_FILE_IDS``.

  Returns:
    pd.DataFrame with the content of the downloaded CSV.

  Raises:
    ValueError: if ``comp`` is not a supported ticker.
  """
  file_id = STOCKTWITS_FILE_IDS.get(comp)
  if file_id is None:
    # Fail loudly here: silently returning None only postpones the crash to the caller.
    raise ValueError(
        f"Unknown ticker {comp!r}; supported tickers: {sorted(STOCKTWITS_FILE_IDS)}")

  local_csv = f"stocktwits_{comp.replace('.', '_')}.csv"
  remote_file = drive.CreateFile({'id': file_id})
  remote_file.GetContentFile(local_csv)
  return pd.read_csv(local_csv)

In [11]:
def get_tweets_df_for_ticker(ticker: str, first_quote_date_str: str = None, verbose=True) -> pd.DataFrame:
    """Load the raw tweets of one ticker and add calendar helper columns.

    The frame returned by ``get_compData(ticker)`` gets:
      * ``datetime`` parsed with ``pd.to_datetime`` (raises on unparsable values),
      * ``Date``     - the calendar date of ``datetime`` as a datetime64 column,
      * ``Weekday``  - day of week of ``Date`` (Monday == 0).

    Args:
        ticker: ticker symbol understood by ``get_compData``.
        first_quote_date_str: optional ``YYYY-MM-DD`` cut-off; tweets strictly before
            midnight UTC of that day are dropped.
        verbose: print head/tail of the frame.

    Returns:
        The (optionally filtered) tweets DataFrame.
    """
    print(f"Getting data for ticker: {ticker} and first_quote_date_str: {first_quote_date_str}")
    df = get_compData(ticker)
    if verbose:
        print("\nHead:\n", df.head())
        print("\nTail:\n", df.tail())

    # Date/time split
    df['datetime'] = pd.to_datetime(df['datetime'], errors='raise')
    df['Date'] = pd.to_datetime([d.date() for d in df['datetime']], errors='raise')
    df['Weekday'] = df['Date'].dt.weekday

    if first_quote_date_str is not None:
        # Attach UTC explicitly. astimezone() on a naive datetime would first interpret it
        # in the machine's local time zone, shifting the cut-off on non-UTC hosts.
        # NOTE(review): the comparison below requires df.datetime to be tz-aware — confirm
        # against the raw CSVs.
        first_quote_date = datetime.strptime(first_quote_date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        df = df[df.datetime >= first_quote_date]
    if verbose:
        print("\nTail\n", df.tail())

    return df

In [12]:
def get_yahoo_data_for_df(df: pd.DataFrame, left_border_days=1, right_border_days=1, yf_protection_days=3, 
                          verbose=True) -> pd.DataFrame:
    """Download daily Yahoo Finance prices covering the date span of a tweets frame.

    The requested range is ``[min(Date) - left_border_days, max(Date) + right_border_days]``
    widened on both sides by ``yf_protection_days``, because the returned data does not
    always reach the exact borders (non-trading days, exclusive right border).

    Args:
        df: tweets frame with a datetime ``Date`` column and a ``symbol`` column; the symbol
            of the first row selects the ticker.
        left_border_days: days needed before the first tweet date.
        right_border_days: days needed after the last tweet date.
        yf_protection_days: extra days requested on each side.
        verbose: print head / info / describe of the downloaded frame.

    Returns:
        The downloaded frame with ``Date`` as a regular column (index reset).

    Raises:
        ValueError: if ``df`` has no rows.
        AssertionError: if the downloaded data does not cover the needed range or
            unexpectedly contains an ``Adj Close`` column.
    """
    if df.empty:
        raise ValueError("Cannot request prices: the tweets DataFrame is empty")

    # StockTwits symbol -> Yahoo Finance symbol, for the tickers whose names differ.
    # GOOG is deliberately not mapped to GOOGL.
    yahoo_symbol_aliases = {
        'BRK.A': 'BRK-A',
        'BRK.B': 'BRK-B',
        'VIX': '^VIX',
        'FB': 'META',
    }

    start_dt = df['Date'].min() + timedelta(days=-left_border_days)
    end_dt = df['Date'].max() + timedelta(days=right_border_days)
    symbol = df['symbol'].iloc[0]
    compny = yahoo_symbol_aliases.get(symbol, symbol)

    if verbose:
        print(f"Start gettings data for {compny} in range {start_dt} to {end_dt}")
    # Request a few protection days around the needed range.
    # Note: auto_adjust defaults to False, but in fact all prices are autoadjusted even if it is False (?)
    # Note: keepna=True seems not to fill weekends, etc. (?)
    prot_delta = timedelta(days=yf_protection_days)
    yahoo_data = yfinance.download(
        compny, start=start_dt - prot_delta, end=end_dt + prot_delta, keepna=True, auto_adjust=True)
    yahoo_data.reset_index(level=0, inplace=True)
    print('Columns:', yahoo_data.columns)
    print('Shape:', yahoo_data.shape)

    if verbose:
        print('\nHead:\n', yahoo_data.head())
        print('\nInfo:')
        yahoo_data.info()
        try:
            summary = yahoo_data.describe(include='all', datetime_is_numeric=True)
        except TypeError:
            # pandas >= 2.0 removed the keyword; datetimes are always summarised numerically there
            summary = yahoo_data.describe(include='all')
        print('\nDescribe:\n', summary)

    # Check dates (the protection days above exist so that holidays etc. do not trip these)
    assert yahoo_data['Date'].min() <= start_dt, \
        f"Not enough data obtained: {yahoo_data['Date'].min()} vs {start_dt} ({start_dt:%A})"
    assert yahoo_data['Date'].max() >= end_dt, \
        f"Not enough data obtained: {yahoo_data['Date'].max()} vs {end_dt} ({end_dt:%A})"

    assert 'Adj Close' not in yahoo_data.columns  # Due to auto_adjust=True param

    return yahoo_data

In [13]:
def fill_missing_dates(yf_df: pd.DataFrame, verbose=True) -> pd.DataFrame:
    """Re-index daily prices to a gap-free calendar-day index.

    Days absent from ``yf_df`` (weekends, holidays) are inserted. For those rows ``Close``
    is forward-filled from the previous available day and ``Open`` / ``High`` / ``Low``
    are set to that ``Close``. ``Volume`` is left as NaN on inserted days.

    Args:
        yf_df: frame with exactly the columns Date, Open, High, Low, Close, Volume,
            an integer index and a datetime64 ``Date`` column.
        verbose: print the head of the frame after every step.

    Returns:
        A new frame indexed by ``Date`` with daily frequency; the input is not modified.
    """
    # Check data types
    assert yf_df.columns.to_list() == ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
    assert np.issubdtype(yf_df.index.dtype, np.integer)
    assert np.issubdtype(yf_df.Date.dtype, np.datetime64)
    if verbose:
        print('\nBefore reindexing:\n', yf_df.head())

    # Set "Date" column as index; asfreq('D') inserts the missing days as all-NaN rows
    new_df = yf_df.set_index('Date').asfreq('D')
    assert isinstance(new_df.index, pd.DatetimeIndex)
    if verbose:
        print('\nAfter reindexing:\n', new_df.head())

    # Forward-fill 'Close' (ffill() replaces the deprecated fillna(method='ffill'))
    new_df['Close'] = new_df['Close'].ffill()
    if verbose:
        print('\nAfter ffill for NAs in Close column:\n', new_df.head())

    # Fill NaN values in the other price columns from the 'Close' column
    for price_col in ('Open', 'High', 'Low'):
        new_df[price_col] = new_df[price_col].fillna(new_df['Close'])
    if verbose:
        print('\nAfter filling NAs in other columns:\n', new_df.head())

    print(f'Dates count: {len(yf_df)} -> {len(new_df)}')
    return new_df

In [14]:
def get_tweets_with_price_raw_data(df_tweets: pd.DataFrame, df_prices: pd.DataFrame, 
                                   day_shifts = (-1, 0, 1, 2, 3, 4, 5, 6, 7), verbose=True) -> pd.DataFrame:
    """Attach the prices of several neighbouring days to every tweet.

    For each ``shift`` in ``day_shifts`` five columns are appended, named ``d{shift}_O``,
    ``d{shift}_H``, ``d{shift}_L``, ``d{shift}_C`` and ``d{shift}_V``. They hold the
    Open / High / Low / Close / Volume of the day ``Date + shift`` taken from ``df_prices``.

    Args:
        df_tweets: frame with exactly the columns symbol, message, datetime, user,
            message_id, Date, Weekday (``Date`` being datetime64).
        df_prices: frame with the columns Open, High, Low, Close, Volume and a unique
            datetime64 index that contains every needed day.
        day_shifts: day offsets (relative to the tweet date) to look up.
        verbose: print the head of the result.

    Returns:
        A copy of ``df_tweets`` with the price columns appended; inputs are not modified.

    Raises:
        AssertionError: on unexpected columns / dtypes, duplicate price dates, or a price
            range that does not cover the tweets.
        KeyError: if a needed date inside the range is absent from ``df_prices``.
    """
    price_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

    # Check types, etc.
    assert df_tweets.columns.to_list() == ['symbol', 'message', 'datetime', 'user', 'message_id', 'Date', 'Weekday']
    assert np.issubdtype(df_tweets.Date.dtype, np.datetime64)

    assert df_prices.columns.to_list() == price_columns
    assert np.issubdtype(df_prices.index.dtype, np.datetime64)
    # Each date must map to exactly one price row
    assert df_prices.index.is_unique, "Duplicate dates in the price index"

    # Fast check if min date for prices is sufficient
    min_tweet_date = df_tweets.Date.min()
    expected_min_price_date = min_tweet_date + timedelta(days=min(day_shifts))
    actual_min_price_date = df_prices.index.min()
    assert actual_min_price_date <= expected_min_price_date, \
        f"Not enough dates in price: {actual_min_price_date} instead of {expected_min_price_date}"

    # Fast check if max date for prices is sufficient
    max_tweet_date = df_tweets.Date.max()
    expected_max_price_date = max_tweet_date + timedelta(days=max(day_shifts))
    actual_max_price_date = df_prices.index.max()
    assert actual_max_price_date >= expected_max_price_date, \
        f"Not enough dates in price: {actual_max_price_date} instead of {expected_max_price_date}"

    # One positional lookup per shift instead of one .loc call per tweet and shift.
    # to_numpy() on the whole frame applies the same dtype up-casting as reading a row.
    price_matrix = df_prices.to_numpy()
    out_df = df_tweets.copy()
    for shift in day_shifts:
        target_dates = pd.DatetimeIndex(df_tweets['Date'] + timedelta(days=shift))
        row_positions = df_prices.index.get_indexer(target_dates)
        not_found = row_positions == -1
        if not_found.any():
            sample = target_dates[not_found].unique()[:5].tolist()
            raise KeyError(f"No price row for date(s) (first few shown): {sample}")
        shifted_prices = price_matrix[row_positions]
        for col_pos, col_name in enumerate(price_columns):
            # Column suffix is the first letter of the price column: O, H, L, C, V
            out_df[f"d{shift}_{col_name[0]}"] = shifted_prices[:, col_pos]

    if verbose:
        print('\nFinal head:\n', out_df.head())

    return out_df

In [15]:
def get_label(ch):
  """Map a change value to a three-way label.

  Returns 1 when ``ch`` is above 0.5, -1 when it is below -0.5 and 0 otherwise
  (both thresholds themselves map to 0).
  """
  if ch > 0.5:
    return 1
  return -1 if ch < -0.5 else 0

In [16]:
def remove_stopwords(msg_seg: list, stop_words: list):
    """Return the items of ``msg_seg`` that are not stop words, keeping their order.

    Args:
        msg_seg: list of tokens (anything else triggers an AssertionError).
        stop_words: collection of tokens to drop.

    Returns:
        A new list; ``msg_seg`` itself is not modified.
    """
    assert isinstance(msg_seg, list)
    # Build the lookup once: set membership is O(1), list membership is O(len(stop_words))
    stop_set = frozenset(stop_words)
    return [w for w in msg_seg if w not in stop_set]

def remove_punctuation_re(x):
    """Strip URLs, @mentions, punctuation and underscores from a message.

    Steps, each followed by collapsing whitespace runs into single spaces:
      1. drop http:// and https:// URLs;
      2. drop @mentions (at the start of the text or preceded by whitespace);
      3. replace every character that is neither a word character nor whitespace with a
         space - this removes the '#' of hashtags and the '$' of cashtags, keeping the word;
      4. replace underscores with spaces (e.g. in textual emoji names).

    Args:
        x: message text.

    Returns:
        The cleaned text without leading/trailing whitespace.
    """
    # Raw strings: "\S" / "\s" in a normal string literal are invalid escape sequences
    x = ' '.join(re.sub(r"https?://\S+", "", x).split())     # Removing URLs

    x = ' '.join(re.sub(r"^@\S+|\s@\S+", "", x).split())     # Removing Mentions

    x = ' '.join(re.sub(r'[^\w\s]', " ", x).split())         # Removing punctuation (incl. '#' and '$')

    x = ' '.join(re.sub(r'_', " ", x).split())               # Removing _ from emojis text

    return x

# replace repeating letter
def do_rpt_replace(match):
    """Replacement callback: keep exactly two copies of the repeated letter."""
    return match.group(1) * 2

# A letter followed by two or more repetitions of itself (case-insensitive).
# Only letters are matched: digits must survive untouched, otherwise amounts such as
# "1000" would be rewritten to "100".
RE_MESSAGE_RPT = re.compile(r"([^\W\d_])\1{2,}", re.IGNORECASE)

# substitute original word with replaced word, if any
def processRepeatings(data):
    """Shorten every run of three or more identical letters to two letters.

    Example: "hurrrryyyyyy" -> "hurryy". Digits, whitespace and other characters are
    left unchanged.

    Args:
        data: text to process.

    Returns:
        The text with elongated letter runs shortened.
    """
    return RE_MESSAGE_RPT.sub(do_rpt_replace, data)

In [17]:
def get_preprocessed_tweets_df(df_tweets: pd.DataFrame, to_remove_stopwords: bool, to_remove_repetitions: bool, 
                               verbose=True) -> pd.DataFrame:
    """Clean the ``message`` column and drop unusable tweets.

    Every message is lower-cased, passed through ``emoji.demojize`` and
    ``contractions.fix``, cleaned by ``remove_punctuation_re``, tokenized with
    ``TweetTokenizer`` and each token is run through the ekphrasis Twitter segmenter.
    Optionally stop words are removed and elongated letter runs are shortened.
    Rows with a missing message and rows containing the token ``rt`` (retweets) are
    dropped from the result.

    Args:
        df_tweets: frame with a ``message`` column.
        to_remove_stopwords: remove NLTK English stop words after segmentation.
        to_remove_repetitions: apply ``processRepeatings`` to the detokenized text.
        verbose: echo the original and the processed text of the first few messages.
            All messages are processed regardless of this flag.

    Returns:
        A new frame with the cleaned ``message`` column and without the dropped rows.
    """
    # How many leading messages are echoed in verbose mode
    verbose_echo_count = 11

    # Prepare processors
    stop_words = sw.words("english")
    tweet_tokenizer = TweetTokenizer()
    detokenizer = TreebankWordDetokenizer()
    # segmenter using the word statistics from Twitter
    seg_tw = Segmenter(corpus="twitter")

    out_df = df_tweets.copy()
    raw_messages = out_df['message'].tolist()

    # Cleaned text per row; None marks a row to drop (missing message or retweet)
    processed = []
    for i, raw_msg in tqdm(enumerate(raw_messages), total=len(raw_messages)):
        echo = verbose and i < verbose_echo_count
        if echo:
            print("Orig:", raw_msg)

        if pd.isna(raw_msg):
            # Test for missing values directly, so a real tweet whose text is "0" is kept
            cleaned = None
        else:
            msg = str(raw_msg).lower()

            # run emoji.demojize (the underscores of the resulting names are removed below)
            msg = emoji.demojize(msg)

            # fix contractions
            msg = contractions.fix(msg)

            # remove URLs, mentions, punctuation and underscores
            msg = remove_punctuation_re(msg)

            # tokenize
            msg_tokens = tweet_tokenizer.tokenize(msg)

            # Split hashtags / run-together words with the word segmenter
            message_seg = []
            for w in msg_tokens:
                if len(w) >= 300:
                    # NOTE(review): very long tokens are cut to 100 chars, presumably to keep
                    # the segmenter fast — confirm the 300 / 100 thresholds
                    w = w[:100]
                    print(w)
                message_seg.append(seg_tw.segment(w))

            # remove stopwords
            if to_remove_stopwords:
                msg_list = remove_stopwords(message_seg, stop_words=stop_words)
            else:
                msg_list = message_seg

            if 'rt' in msg_list:
                # retweet -> drop
                cleaned = None
            else:
                # detokenize
                cleaned = detokenizer.detokenize(msg_list)

                # shorten repeating characters like hurrrryyyyyy
                if to_remove_repetitions:
                    cleaned = processRepeatings(cleaned)

        processed.append(cleaned)
        if echo:
            print("Final:", cleaned, "\n")

    out_df['message'] = processed

    # Drop missing messages and retweets by position, so duplicate index labels cannot remove extra rows
    keep_mask = np.asarray([text is not None for text in processed], dtype=bool)
    return out_df[keep_mask]

# Settings

In [18]:
# Main settings

# Pairs of "ticker" - "first quote date".
# The date is a 'YYYY-MM-DD' string or None; tweets before it are dropped by get_tweets_df_for_ticker.
SRC_TICKERS_DATES_PAIRS = [
    ('AAPL', None), 
    ('AMZN', None), 
    ('FB', '2012-06-01'),    # Note: data from 2012 (?), in 2021 changed to META, but here the data is up to 2020-07.
    ('GOOGL', None),  # Quotes in NASDAQ since 2004
    ('GOOG', None),  # Quotes in NASDAQ since 2014
    ('NFLX', None),
    ]

MAX_SHIFT_DAYS = 7  # Day shifts will range from -1 to MAX_SHIFT_DAYS
  
TO_REMOVE_STOPWORDS = False  # sw.words("english") contains about 200 tokens including "not", "don't", etc.
TO_REMOVE_REPETITIONS = False  # Passed to get_preprocessed_tweets_df as to_remove_repetitions

OUT_FILE_ADDITIONAL_SUFFIX = f"_1y"  # Appended to the output file name just before ".csv.gz"
OUT_FLOAT_FORMAT = '%.2f'  # float_format passed to DataFrame.to_csv

# Change this to the directory where the output files should be saved (must already exist)
GD_OUT_PATH = r'/content/drive/MyDrive/_PR_ROOT/_2022/2022-11_NLP-Huawei_Final_project/stocktwits_finsentiment_analysis/data/interim/040_output__nb010_v1/'
assert os.path.isdir(GD_OUT_PATH)

In [19]:
# Additional settings
YF_PROTECTION_DAYS = 3  # For some reason, yfinance sometimes does not return the exact date borders

# Dump all tickers (local csv files will be created)

In [20]:
# All 25 tickers that get_compData() can download; only referenced by the
# commented-out bulk-download cell that follows.
tickers_25 = [
'AAPL',
'ADBE',
'AMZN',
'BAC',
'BRK.A',
'BRK.B',
'DIA',
'DIS',
'FB',
'GOOG',
'GOOGL',
'HD',
'INTC',
'JNJ',
'NFLX',
'PG',
'QQQ',
'SPY',
'T',
'TSLA',
'UNH',
'V',
'VIX',
'VZ',
'WMT',
]


In [21]:
# for t in tickers_25:
#     _ = get_compData(t)

In [23]:
# !cp *.csv drive/MyDrive/_PR_ROOT/_2022/2022-11_NLP-Huawei_Final_project/Datasets/2022-12-16__raw_csvs_25_tickers/

In [22]:
# assert False

AssertionError: ignored

# Main cycle for tickers

In [None]:
FAST_CHECK = False  # True: smoke test on VIX with only the first 1000 tweets
VERBOSE = True      # Verbosity of steps 1-4
sep = '*' * 100 

# NOTE(review): the [2:] slice skips the first two configured tickers (AAPL, AMZN) —
# looks like a leftover from resuming an interrupted run; confirm before a full run.
tickers = [('VIX', None)] if FAST_CHECK else SRC_TICKERS_DATES_PAIRS[2:]

for ticker, first_quote_date_str in tickers:
  print(f"{sep}\n* Step1: get_tweets_df_for_ticker\n{sep}")
  df_tweets = get_tweets_df_for_ticker(ticker, first_quote_date_str, verbose=VERBOSE)
  
  print(f"{sep}\n* Step2: get_yahoo_data_for_df\n{sep}")
  df_yf = get_yahoo_data_for_df(df_tweets, right_border_days=MAX_SHIFT_DAYS, 
                                yf_protection_days=YF_PROTECTION_DAYS, verbose=VERBOSE)

  print(f"{sep}\n* Step3: fill_missing_dates\n{sep}")
  df_yf2 = fill_missing_dates(df_yf, verbose=VERBOSE)

  print(f"{sep}\n* Step4: get_tweets_with_price_raw_data\n{sep}")
  if FAST_CHECK:
      df_tweets = df_tweets[:1000]
  df_tweets_with_price = get_tweets_with_price_raw_data(
      df_tweets, df_prices=df_yf2, day_shifts=list(range(-1, MAX_SHIFT_DAYS + 1)), verbose=VERBOSE)

  # Per-message echo stays off here regardless of VERBOSE
  print(f"{sep}\n* Step5: get_preprocessed_tweets_df\n{sep}")
  df_final = get_preprocessed_tweets_df(df_tweets_with_price, to_remove_stopwords=TO_REMOVE_STOPWORDS, 
                                        to_remove_repetitions=TO_REMOVE_REPETITIONS, verbose=False)

  # Save to file: <timestamp>_<ticker>_RmSW=<0|1>_RmRep=<0|1><suffix>.csv.gz
  print(f"{sep}\n* Step6: save to file\n{sep}")
  ts = datetime.now().strftime('%Y-%m-%dT%H%M%S')
  sw_tag = f"RmSW={int(TO_REMOVE_STOPWORDS)}"
  rep_tag = f"RmRep={int(TO_REMOVE_REPETITIONS)}"
  out_file_name = f"{ts}_{ticker}_{sw_tag}_{rep_tag}{OUT_FILE_ADDITIONAL_SUFFIX}.csv.gz"
  df_final.to_csv(os.path.join(GD_OUT_PATH, out_file_name), header=True, index=False, 
                  encoding='utf_8', float_format=OUT_FLOAT_FORMAT, compression='gzip')
  print("Success")

In [None]:
# Reading twitter - 1grams ...
# Reading twitter - 2grams ...
# 10/? [00:00<00:00, 9.65it/s]
# T1758: 0
# Orig: $uvxy $vxx $svxy $spy if you absolutely must look at stochastics for the $vix these are the ones and only ones to monitor. everything is left to derivate/reaction principal functions. any questions on exactly what the stochastics are/mean, ask!
# RPT1: uvxy vxx svxy spy absolutely must look stochastic s vix ones ones monitor everything left der iv ate reaction principal functions questions exactly stochastic s mean ask
# RPT2: uvxy vxx svxy spy absolutely must look stochastic s vix ones ones monitor everything left der iv ate reaction principal functions questions exactly stochastic s mean ask
# Final: uvxy vxx svxy spy absolutely must look stochastic s vix ones ones monitor everything left der iv ate reaction principal functions questions exactly stochastic s mean ask 

# T1758: 1
# Orig: $tsla  $1610 put 8/7/20  - 1 contract at $162.30.  $amzn  $2400 10/16/20 - 1 contract at $33.40.  otm for amzn because it is all i had left in my account.  parabolas have to come down sometimes.  i believe fundamentals will come into view this week (ie real earnings). $vix seems to be going higher.
# RPT1: tsla  1610 put  8  7  20  1 contract  162  30 amzn  2400  10  16  20  1 contract  33  40 otm amzn left account parabolas come sometimes believe fundamentals come view week i e real earnings vix seems going higher
# RPT2: tsla  1610 put  8  7  20  1 contract  162  30 amzn  2400  10  16  20  1 contract  33  40 otm amzn left account parabolas come sometimes believe fundamentals come view week i e real earnings vix seems going higher
# Final: tsla  1610 put  8  7  20  1 contract  162  30 amzn  2400  10  16  20  1 contract  33  40 otm amzn left account parabolas come sometimes believe fundamentals come view week i e real earnings vix seems going higher 

# T1758: 2
# Orig: $spy $aapl $vix $goog $amzn 

# can&#39;t wait to make $$$ tomorrow, too many cuck bears in disbelief after buying those 325s atm. let me know how that goes tomorrow fuckwits. it&#39;s a good thing you all suck at math otherwise i wouldn&#39;t be retired.

# https://www.youtube.com/watch?v=gvymcvrijbo
# RPT1: spy aapl vix goog amzn  39 wait make tomorrow many cuck bears disbelief buying 325s atm let know goes tomorrow fuckwits  39 good thing suck math otherwise  39 retired
# RPT2: spy aapl vix goog amzn  39 wait make tomorrow many cuck bears disbelief buying 325s atm let know goes tomorrow fuckwits  39 good thing suck math otherwise  39 retired
# Final: spy aapl vix goog amzn  39 wait make tomorrow many cuck bears disbelief buying 325s atm let know goes tomorrow fuckwits  39 good thing suck math otherwise  39 retired 

# T1758: 3
# Orig: $spy btmfd!
# $vix dip that is
# RPT1: spy bt mfd vix dip
# RPT2: spy bt mfd vix dip
# Final: spy bt mfd vix dip 

# T1758: 4
# Orig: $vix 5 red days and then rdr green. some caution
# RPT1: vix  5 red days rdr green caution
# RPT2: vix  5 red days rdr green caution
# Final: vix  5 red days rdr green caution 

# T1758: 5
# Orig: $uxvy $vix daily chart updated ... oh oh ...
# RPT1: u xv y vix daily chart updated oh oh
# RPT2: u xv y vix daily chart updated oh oh
# Final: u xv y vix daily chart updated oh oh 

# T1758: 6
# Orig: $uvxy  how come only up 1% when $vix is up 4.5%?
# RPT1: uvxy come  1 vix  4  5
# RPT2: uvxy come  1 vix  4  5
# Final: uvxy come  1 vix  4  5 

# T1758: 7
# Orig: $spy $vix the turn has wormed.
# RPT1: spy vix turn wormed
# RPT2: spy vix turn wormed
# Final: spy vix turn wormed 

# T1758: 8
# Orig: nyse advance-decline line breaking out to all-time high... 

# another major bullish evidence. 

# $spx $spy $qqq $dia $vix
# RPT1: nyse advance decline line breaking time high another major bullish evidence spx spy qqq dia vix
# RPT2: nyse advance decline line breaking time high another major bullish evidence spx spy qq dia vix
# Final: nyse advance decline line breaking time high another major bullish evidence spx spy qq dia vix 

# T1758: 9
# Orig: $spy got legs, $vix fading
# RPT1: spy got legs vix fading
# RPT2: spy got legs vix fading
# Final: spy got legs vix fading 