In [85]:
from sentiment.modeling import *
from sentiment.data_loader import *
from sentiment.training import *
from transformers import BertTokenizer
import pandas as pd
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
from utils import *
import ast



In [59]:
stock_names = ["apple", "amazon", "google", "microsoft", "tesla"]

In [138]:
model_bert = SentimentClassifier(2, "bert-base-uncased", do_fine_tune = False).to(device)
model_bert.load_state_dict(torch.load("sentiment/saved_models/bert-base-with-dropout.bin")) # tried to load different model - 

<All keys matched successfully>

In [139]:
model_finbert = SentimentClassifier(2, "ProsusAI/finbert", do_fine_tune = False).to(device)
model_finbert.load_state_dict(torch.load("sentiment/saved_models/finbert-fine-tune.bin")) # tried to load different model - 

<All keys matched successfully>

In [140]:
def sanitize_date(date):
    date = date.split("T")[0]
    return date

In [150]:
def preprocess_data(df):
    df.dropna(inplace=True)
    df['stock'] = df['found_labels'].apply(lambda x: ast.literal_eval(x)[0][0])
    df['date_sanitized'] = df['publisheddate'].apply(lambda x: sanitize_date(x))
    df.rename(columns={'cleaned_text': 'news'}, inplace=True)
    return df

    

In [64]:
def get_pred(model, data_loader):
    model = model.eval()
    predictions = []
    prediction_probs = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            _, preds = torch.max(outputs, dim=1)
            probs = F.softmax(outputs, dim =1)
            predictions.extend(preds)
            prediction_probs.extend(probs)
    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    return predictions, prediction_probs


In [143]:
import ast
def prepare_data(stock_name):
    df = df.dropna()
    df['stock'] = df['found_labels'].apply(lambda x: ast.literal_eval(x)[0][0])
    df['date_sanitized'] = df['publisheddate'].apply(lambda x: sanitize_date(x))
    df.rename(columns={'cleaned_text': 'news'}, inplace=True)
    return df



In [151]:
temp = ["classification/saved/{}.xlsx".format(i) for i in stock_names]
combined_df = concat_excels("combined.xlsx", *temp)

combined_df = preprocess_data(combined_df)
print(combined_df.shape)


(875, 9)


In [154]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def create_data_load(df, tokenizer, max_len, batch_size):
    collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
    ds = NewsDataset(
        news=df.news.to_list(),
        labels = [None]* len(df),
        tokenizer=tokenizer,
        max_len=max_len,
        train=False
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        collate_fn=collator)

## Apple

### Bert

In [155]:
apple_df = combined_df[combined_df['stock'] == 'apple']
print(combined_df.shape)
apple_df.shape



(875, 9)


(29, 9)

In [172]:
MAX_LEN = 512
BATCH_SIZE = 8

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
data_loader = create_data_load(apple_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE, )
y_pred, y_pred_probs = get_pred(model_bert, data_loader)

In [174]:
apple_df['Predicted'] = y_pred.tolist()
apple_df.to_excel('data/apple_predictions.xlsx')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_df['Predicted'] = y_pred.tolist()


### Finbert

In [158]:
MAX_LEN = 512
BATCH_SIZE = 8

tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert', do_lower_case=True)
data_loader = create_data_load(apple_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE, )
y_pred, y_pred_probs = get_pred(model_finbert, data_loader)
apple_df["Predicted_Finbert"] = y_pred
apple_df.to_excel("data/apple_predictions_finbert.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_df["Predicted_Finbert"] = y_pred


# Amazon

In [159]:
amazon_df = combined_df[combined_df['stock'] == 'amazon']
MAX_LEN = 512
BATCH_SIZE = 8

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
data_loader = create_data_load(amazon_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE, )
y_pred, y_pred_probs = get_pred(model_bert, data_loader)
amazon_df['Predicted'] = y_pred.tolist()
amazon_df.to_excel('data/amazon_predictions.xlsx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amazon_df['Predicted'] = y_pred.tolist()


### Finbert

In [166]:
MAX_LEN = 512
BATCH_SIZE = 8

tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert', do_lower_case=True)
data_loader = create_data_load(amazon_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE, )
y_pred, y_pred_probs = get_pred(model_finbert, data_loader)
amazon_df["Predicted_Finbert"] = y_pred.tolist()
amazon_df.to_excel("data/amazon_predictions_finbert.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amazon_df["Predicted_Finbert"] = y_pred.tolist()


# Tesla

In [168]:
tesla_df = combined_df[combined_df['stock'] == 'tesla']
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
data_loader = create_data_load(tesla_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE, )
y_pred, y_pred_probs = get_pred(model_bert, data_loader)
tesla_df['Predicted'] = y_pred.tolist()
tesla_df.to_excel('data/tesla_predictions.xlsx')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tesla_df['Predicted'] = y_pred.tolist()


### Finbert

In [169]:

MAX_LEN = 512
BATCH_SIZE = 8

tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert', do_lower_case=True)
data_loader = create_data_load(tesla_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE, )
y_pred, y_pred_probs = get_pred(model_finbert, data_loader)
tesla_df['Predicted_Finbert'] = y_pred.tolist()
tesla_df.to_excel('data/tesla_predictions_finbert.xlsx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tesla_df['Predicted_Finbert'] = y_pred.tolist()


# Microsoft

In [170]:
microsoft_df = combined_df[combined_df['stock'] == 'microsoft']
MAX_LEN = 512
BATCH_SIZE = 8

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
data_loader = create_data_load(microsoft_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE, )
y_pred, y_pred_probs = get_pred(model_bert, data_loader)
microsoft_df['Predicted'] = y_pred.tolist()
microsoft_df.to_excel('data/microsoft_predictions.xlsx')
print(len(y_pred))
print(len(microsoft_df))

36
36


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  microsoft_df['Predicted'] = y_pred.tolist()


In [171]:
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert', do_lower_case=True)
data_loader = create_data_load(microsoft_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE )
y_pred, y_pred_probs = get_pred(model_finbert, data_loader)
microsoft_df['Predicted_Finbert'] = y_pred.tolist()
microsoft_df.to_excel('data/microsoft_predictions_finbert.xlsx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  microsoft_df['Predicted_Finbert'] = y_pred.tolist()


# Google

In [175]:
google_df = combined_df[combined_df['stock'] == 'google']
MAX_LEN = 512
BATCH_SIZE = 8

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
data_loader = create_data_load(google_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE, )
y_pred, y_pred_probs = get_pred(model_bert, data_loader)
google_df['Predicted'] = y_pred.tolist()
print(len(y_pred.tolist()))
print(len(google_df))
#google_df.to_excel('data/google_predictions.xlsx')

77
77


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  google_df['Predicted'] = y_pred.tolist()


In [176]:
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert', do_lower_case=True)
data_loader = create_data_load(google_df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE, )
y_pred, y_pred_probs = get_pred(model_finbert, data_loader)
google_df['Predicted_Finbert'] = y_pred.tolist()
google_df.to_excel('data/google_predictions_finbert.xlsx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  google_df['Predicted_Finbert'] = y_pred.tolist()


In [177]:
MAX_LEN = 256
BATCH_SIZE = 32
df = pd.read_excel("data/dataset_majority_voting.xlsx")
df.rename(columns={"clean_text": "news"}, inplace=True)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
data = create_data_load(df,tokenizer,  MAX_LEN, batch_size=BATCH_SIZE)

y_pred, y_pred_probs = get_pred(model_bert, data)
for i in range(len(y_pred)):
    df.loc[i, "furkanB"] = "Positive" if y_pred[i] else "Negative"
    

df.to_excel("temp.xlsx", index=False)

