In [1]:
from transformers import BertTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
import pandas as pd
import numpy as np
import preprocessor

class BertClassifier(nn.Module):
    def __init__(self, freeze=False):
        super(BertClassifier, self).__init__()

        input_layer = 768
        hidden_layer = 50
        output_layer = 2

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.classifier = nn.Sequential(
            nn.Linear(input_layer, hidden_layer), 
            nn.ReLU(), 
            nn.Linear(hidden_layer, output_layer))

        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        h_cls = outputs[0][:, 0, :]
        logits = self.classifier(h_cls)

        return logits

model = BertClassifier()
model.load_state_dict(torch.load('stock_sentiment_model.pt'))

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
                      
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [8]:
for filename in os.listdir('data/data1/'):
    if filename.endswith(".xlsx") or filename.endswith(".xls"):  
        parts = filename.split('_')

        new_name = parts[2]
        print(new_name)

aal
aapl
adbe
adp
adsk
akam
alxn
amat
amgn
amzn
atvi
avgo
bbby
bidu
bmrn
ca
celg
cern
chkp
chtr
cmcsa
cost
csco
csx
ctrp
ctsh
disca
disck
dish
dltr
ea
ebay
endp
esrx
expe
fast
fb
fisv
foxa
fox
gild
googl
goog
hsic
ilmn
inct
incy
intu
isrg
jd
khc
lbtya
lbtyk
lltc
lmca
lmck
lrcx
lrcx
lvnta
mar
mat
mdlz
mnst
msft
mu
mxim
myl
nclh
nflx
ntap
ntes
nvda
nxpi
orly
payx
pcar
pcln
pypl
qcom
qvca
regn
rost
sbac
sbux
sndk
srcl
stx
swks
symc
tmus
trip
tsco
tsla
txn
ulta
viab
vod
vrsk
vrtx
wba
wdc
wfm
xlnx
yhoo


In [20]:
import os
import datetime
import pandas as pd
from transformers import BertTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
import pandas as pd
import numpy as np
import preprocessor

def preprocess_tweet(row):
    text = row['Text']
    text = preprocessor.clean(text)
    return text

# Get the list of stock data to convert
directory = "data/data1/"  # Replace with the actual directory path

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocessing_for_bert(data):
    input_ids = []
    attention_masks = []

    for text in data:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

for filename in os.listdir(directory):
    if filename.endswith(".xlsx") or filename.endswith(".xls"):  # Check if the file is an Excel file
        file_path = os.path.join(directory, filename)
        stock = pd.read_excel(file_path, sheet_name='Stream')

        # Assign the ticker name as a column
        stock['Ticker'] = filename.split('_')[0]

        # Convert string date times to datetime
        stock['Date'] = pd.to_datetime(stock['Date'])
        stock['Hour'] = stock['Hour'].apply(lambda t: pd.Timedelta(hours=int(t[:2]), minutes=int(t[3:])))
        stock['Datetime'] = stock['Date'] + stock['Hour']

        # Rename column that holds the tweets content
        stock.rename(columns={'Tweet content': 'Text'}, inplace=True)

        # Preprocess the tweet content
        stock['Text_Cleaned'] = stock.apply(preprocess_tweet, axis=1)

        # Remove excess columns
        stock = stock[['Tweet Id', 'Ticker', 'Datetime', 'Text', 'Text_Cleaned', 'Favs', 'RTs', 'Followers', 'Following', 'Is a RT']]

        # Fill NAs in Favs, RTs, Followers, and Following with 0
        stock = stock.fillna(0)

        # Encode processed tweets for the Bert NLP model
        stock_inputs, stock_masks = preprocessing_for_bert(stock['Text_Cleaned'].values)

        # Put stock data in PyTorch dataloader for processing
        stock_data = TensorDataset(stock_inputs, stock_masks)
        stock_sampler = RandomSampler(stock_data)
        stock_dataloader = DataLoader(stock_data, sampler=stock_sampler, batch_size=16)

        # Assign model to evaluate
        model.eval()

        predictions = []

        # For each batch
        for batch in stock_dataloader:
            # Get encoded inputs and masks
            batch_inputs, batch_masks = batch

            # Send variables to device (GPU if available)
            batch_inputs = batch_inputs.to(torch.device('cuda'))
            batch_masks = batch_masks.to(torch.device('cuda'))

            # Predict classes with Bert for given inputs
            with torch.no_grad():
                logits = model(batch_inputs, batch_masks)

            # Convert predictions to 0s and 1s
            preds = torch.argmax(logits, dim=1).flatten()
            predictions.append(preds)

        # Combine all batch predictions
        predictions = torch.cat(predictions).cpu().numpy()

        # Add predictions to stock dataframe
        stock['Sentiment'] = predictions
        
        parts = filename.split('_')

        # Get the desired part, which is the second element (index 1)
        new_name = parts[2]

        # Save predictions as a new CSV
        stock.to_csv('data/raw/' + new_name + '_stock_data_sentiment.csv', index=False)

        # Show stock names as they are completed
        print(new_name.split('_')[0], '- completed')

aal - completed
aapl - completed
adbe - completed
adp - completed
adsk - completed
akam - completed
alxn - completed
amat - completed
amgn - completed
amzn - completed
atvi - completed
avgo - completed
bbby - completed
bidu - completed
bmrn - completed
ca - completed
celg - completed
cern - completed
chkp - completed
chtr - completed
cmcsa - completed
cost - completed
csco - completed
csx - completed
ctrp - completed
ctsh - completed
disca - completed
disck - completed
dish - completed
dltr - completed
ea - completed
ebay - completed
endp - completed
esrx - completed
expe - completed
fast - completed
fb - completed
fisv - completed
foxa - completed
fox - completed
gild - completed
googl - completed
goog - completed
hsic - completed
ilmn - completed
inct - completed
incy - completed
intu - completed
isrg - completed
jd - completed
khc - completed
lbtya - completed
lbtyk - completed
lltc - completed
lmca - completed
lmck - completed
lrcx - completed
lrcx - completed
lvnta - completed
mar

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime as dt
import yfinance as yf

In [7]:
files = os.listdir('data/raw/')

stocks = pd.DataFrame()

directory = "data/raw/"  

for filename in os.listdir(directory):
    if filename.endswith(".csv") or filename.endswith(".csv"):  
        file_path = os.path.join(directory, filename)
        
        parts = filename.split('_')

        new_name = parts[0]
        print('data/raw/' + new_name)

data/raw/aal
data/raw/aapl
data/raw/adbe
data/raw/adp
data/raw/adsk
data/raw/akam
data/raw/alxn
data/raw/amat
data/raw/amgn
data/raw/amzn
data/raw/atvi
data/raw/avgo
data/raw/bbby
data/raw/bidu
data/raw/bmrn
data/raw/ca
data/raw/celg
data/raw/cern
data/raw/chkp
data/raw/chtr
data/raw/cmcsa
data/raw/cost
data/raw/csco
data/raw/csx
data/raw/ctrp
data/raw/ctsh
data/raw/disca
data/raw/disck
data/raw/dish
data/raw/dltr
data/raw/ea
data/raw/ebay
data/raw/endp
data/raw/esrx
data/raw/expe
data/raw/fast
data/raw/fb
data/raw/fisv
data/raw/foxa
data/raw/fox
data/raw/gild
data/raw/googl
data/raw/goog
data/raw/hsic
data/raw/ilmn
data/raw/inct
data/raw/incy
data/raw/intu
data/raw/isrg
data/raw/jd
data/raw/khc
data/raw/lbtya
data/raw/lbtyk
data/raw/lltc
data/raw/lmca
data/raw/lmck
data/raw/lrcx
data/raw/lvnta
data/raw/mar
data/raw/mat
data/raw/mdlz
data/raw/mnst
data/raw/msft
data/raw/mu
data/raw/mxim
data/raw/myl
data/raw/nclh
data/raw/nflx
data/raw/ntap
data/raw/ntes
data/raw/nvda
data/raw/nxpi
dat

In [5]:
import os
import pandas as pd
import datetime
import yfinance as yf

files = os.listdir('data/raw/')

stocks = pd.DataFrame()

for filename in files:
    if filename.endswith(".csv"):
        try:
            data = pd.read_csv('data/raw/' + filename)
        
            data['Datetime'] = pd.to_datetime(data['Datetime'])

            data.loc[data['Sentiment']==0, 'Sentiment'] = -1
        
            data['Tweets'] = 1
            data['Weight'] = 1

            data['Followers_Mean'] = data['Followers'].rolling(10000, min_periods=1).mean()
            data['Followers_Std'] = data['Followers'].rolling(10000, min_periods=1).std()
            data['Followers_Std'] = data['Followers_Std'].fillna(data['Followers_Std'].values[1])

            data.loc[ (data['Followers']>=data['Followers_Mean']) & (data['Followers'] < (data['Followers_Mean']+data['Followers_Std'])), 'Weight'] += 1
            data.loc[ (data['Followers']>=(data['Followers_Mean']+data['Followers_Std'])) & (data['Followers'] < (data['Followers_Mean']+data['Followers_Std']*2)), 'Weight'] += 2
            data.loc[data['Followers']>=(data['Followers_Mean']+data['Followers_Std']*2), 'Weight'] += 3

            data['RTs_Mean'] = data['RTs'].rolling(10000, min_periods=1).mean()
            data['RTs_Std'] = data['RTs'].rolling(10000, min_periods=1).std()
            data['RTs_Std'] = data['RTs_Std'].fillna(data['RTs_Std'].values[1])
    
            data.loc[ (data['RTs']>=data['RTs_Mean']) & (data['RTs'] < (data['RTs_Mean']+data['RTs_Std'])), 'Weight'] += 1
            data.loc[ (data['RTs']>=(data['RTs_Mean']+data['RTs_Std'])) & (data['RTs'] < (data['RTs_Mean']+data['RTs_Std']*2)), 'Weight'] += 2
            data.loc[data['RTs']>=(data['RTs_Mean']+data['RTs_Std']*2), 'Weight'] += 3

            data['Sentiment_Weighted'] = data['Sentiment']*data['Weight']
        
            data = data.groupby([data.Datetime.dt.month, data.Datetime.dt.day]).sum(numeric_only=True)
                    
            data['Ticker'] = filename.split('_')[0]
        
            data['Date'] = pd.to_datetime([ str(x)+'/'+str(y)+'/2016' for (x,y) in data.index.values ])+datetime.timedelta(days=1)
            
            data = data.reset_index(drop=True)
    
            data['Sentiment_Weighted'] /= data['Tweets']
        
            data['Sentiment_MA'] = data['Sentiment_Weighted'].rolling(3, min_periods=1).mean()
            data['Tweets_MA'] = data['Tweets'].rolling(3, min_periods=1).mean()

            start_date = data['Date'].min()
            end_date = data['Date'].max()+datetime.timedelta(days=2)

            prices = yf.download(tickers=filename.split('_')[0], start=start_date, end=end_date).reset_index()
        
            prices['Percent_Change'] = (prices['Adj Close'].pct_change()*100).shift(-1)
        
            prices['Percent_Change_Bin'] = pd.cut(prices['Percent_Change'], [-100, 0, 2, 100], labels=[0, 1, 2])

            data = data.merge(prices, on='Date', how='left')

            data = data[['Ticker', 'Date', 'Sentiment_Weighted', 'Sentiment_MA', 'Tweets', 'Tweets_MA', 'Adj Close', 'Percent_Change', 'Percent_Change_Bin']]
        
            data = data.dropna().reset_index(drop=True)
        
            data.to_csv('data/processed/' + filename.split('_')[0] + '_stock_data_inputs.csv', index=False)
        
            stocks = pd.concat([stocks, data])
        
            print(filename.split('_')[0], '- Completed')

        except Exception as e:
            print(filename.split('_')[0], '-', e)
    

stocks.to_csv('data/processed/combined_stock_inputs1.csv', index=False)

[*********************100%***********************]  1 of 1 completed
aal - Completed
[*********************100%***********************]  1 of 1 completed
aapl - Completed
[*********************100%***********************]  1 of 1 completed
adbe - Completed
[*********************100%***********************]  1 of 1 completed
adp - Completed
[*********************100%***********************]  1 of 1 completed
adsk - Completed
[*********************100%***********************]  1 of 1 completed
akam - Completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- ALXN: No timezone found, symbol may be delisted
alxn - Completed
[*********************100%***********************]  1 of 1 completed
amat - Completed
[*********************100%***********************]  1 of 1 completed
amgn - Completed
[*********************100%***********************]  1 of 1 completed
amzn - Completed
[*********************100%***********************]  1 of 1 completed
atv