In [None]:
from sentiment.modeling import *
from sentiment.data_loader import *
from sentiment.training import *
from transformers import BertTokenizer
import pandas as pd
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
from utils import *
import ast

# Companies covered by this notebook; the same names are used to build the
# per-stock workbook paths and to filter the `stock` column further down.
stock_names = ["apple", "amazon", "google", "microsoft", "tesla"]

# The models are moved to `device` as soon as they are constructed, so it must
# exist before the constructors run. Defining it here keeps
# "Restart Kernel -> Run All" working even if no star import exports `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sentiment classifier on top of "bert-base-uncased".
# NOTE(review): the first argument (2) is presumably the number of output
# classes -- confirm against SentimentClassifier.
# `map_location=device` lets a checkpoint saved on a GPU load on a CPU-only host.
# NOTE(review): the original trailing comment read "tried to load different
# model" -- confirm this is the intended checkpoint.
model_bert = SentimentClassifier(2, "bert-base-uncased", do_fine_tune=False).to(device)
model_bert.load_state_dict(
    torch.load("sentiment/saved_models/bert-base-with-dropout.bin", map_location=device)
)

# Same classifier wrapper on top of "ProsusAI/finbert", loaded the same way.
# NOTE(review): the original trailing comment read "tried to load different
# model" -- confirm this is the intended checkpoint.
model_finbert = SentimentClassifier(2, "ProsusAI/finbert", do_fine_tune=False).to(device)
model_finbert.load_state_dict(
    torch.load("sentiment/saved_models/finbert-fine-tune.bin", map_location=device)
)


def sanitize_date(date):
    """Return the part of ``date`` that comes before the first ``"T"``.

    A string that contains no ``"T"`` is returned unchanged.
    """
    day_part, _separator, _rest = date.partition("T")
    return day_part


def preprocess_data(df):
    """Return a cleaned copy of ``df`` with ``stock``, ``date_sanitized`` and ``news`` columns.

    Steps, in order:
      1. Drop every row that has a missing value in any column.
      2. ``stock``: parse ``found_labels`` with ``ast.literal_eval`` and take
         element ``[0][0]`` of the parsed value.
      3. ``date_sanitized``: the part of ``publisheddate`` before the first
         ``"T"`` (the same result ``sanitize_date`` gives for string values).
      4. Rename ``cleaned_text`` to ``news``.

    Unlike an in-place version, the caller's frame is left untouched, so the
    cell that calls this can be re-run without the raw data having been
    altered underneath it. Use the returned frame.

    Args:
        df: DataFrame with ``found_labels``, ``publisheddate`` and
            ``cleaned_text`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    complete_rows = df.dropna()
    return complete_rows.assign(
        stock=complete_rows["found_labels"].apply(lambda raw: ast.literal_eval(raw)[0][0]),
        date_sanitized=complete_rows["publisheddate"].str.split("T").str[0],
    ).rename(columns={"cleaned_text": "news"})


def get_pred(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect predictions and class probabilities.

    The model is switched to eval mode and the loop runs under
    ``torch.no_grad()``. Every batch must be a mapping with ``"input_ids"`` and
    ``"attention_mask"`` tensors; both are moved to the notebook-level
    ``device`` before the forward pass.

    Args:
        model: callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning a tensor of per-class scores (dim 1 is the
            class axis).
        data_loader: iterable of batches as described above.

    Returns:
        ``(predictions, prediction_probs)`` as CPU tensors concatenated over
        all batches in loader order: ``predictions`` is the index of the
        largest score along dim 1, ``prediction_probs`` the softmax along dim 1.

    Raises:
        RuntimeError: raised by ``torch.cat`` when ``data_loader`` yields no
            batches.
    """
    model = model.eval()
    batch_predictions = []
    batch_probabilities = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            scores = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(scores, dim=1)
            # Move each batch's results to the CPU right away so outputs do not
            # pile up in device memory, and keep one tensor per batch instead
            # of one per sample.
            batch_predictions.append(preds.cpu())
            batch_probabilities.append(torch.softmax(scores, dim=1).cpu())

    return torch.cat(batch_predictions), torch.cat(batch_probabilities)



import ast
def prepare_data(stock_name):
    """Load the workbook for one stock and return it preprocessed.

    The previous body assigned to ``df`` before ever reading one, so every call
    raised ``UnboundLocalError``, and ``stock_name`` was never used. The steps
    it attempted are the ones ``preprocess_data`` performs, so this now
    delegates to it.

    Args:
        stock_name: name used to build the workbook path, e.g. ``"apple"``.

    Returns:
        The DataFrame returned by ``preprocess_data`` for that workbook.
    """
    # NOTE(review): the path pattern mirrors the one this notebook uses when it
    # builds the combined frame ("classification/saved/<name>.xlsx") -- confirm
    # that this is the intended source for a single stock.
    raw_df = pd.read_excel("classification/saved/{}.xlsx".format(stock_name))
    return preprocess_data(raw_df)


# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
def create_data_load(df, tokenizer, max_len, batch_size):
    """Build a ``DataLoader`` over the ``news`` column of ``df`` for label-free inference.

    The texts are wrapped in a ``NewsDataset`` created with ``train=False`` and
    a list of ``None`` placeholders (one per row) as labels. Batches are
    assembled by a ``DataCollatorWithPadding`` bound to ``tokenizer``. No
    shuffle or sampler argument is passed to the loader.

    Args:
        df: DataFrame exposing a ``news`` column.
        tokenizer: tokenizer handed to both the dataset and the collator.
        max_len: forwarded to ``NewsDataset`` as ``max_len``.
        batch_size: forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    pad_batch = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
    placeholder_labels = [None for _ in range(len(df))]
    dataset = NewsDataset(
        news=df.news.to_list(),
        labels=placeholder_labels,
        tokenizer=tokenizer,
        max_len=max_len,
        train=False,
    )
    return DataLoader(dataset, batch_size=batch_size, collate_fn=pad_batch)
    

In [None]:
# One workbook path per entry in `stock_names`.
temp = list(map("classification/saved/{}.xlsx".format, stock_names))

# Merge the per-stock workbooks into a single frame.
# NOTE(review): "combined.xlsx" looks like the name of the merged workbook that
# `concat_excels` reads or writes -- confirm against its definition.
combined_df = concat_excels("combined.xlsx", *temp)

# Drop incomplete rows and derive the `stock`, `date_sanitized` and `news` columns.
combined_df = preprocess_data(combined_df)

print(combined_df.shape)


In [None]:
# One frame per company: the rows of `combined_df` whose `stock` value equals
# the company name, selected in the same order as the target names.
apple_df, amazon_df, google_df, microsoft_df, tesla_df = (
    combined_df[combined_df['stock'] == company]
    for company in ('apple', 'amazon', 'google', 'microsoft', 'tesla')
)