In [1]:
import torch
from torch.optim import AdamW
import datetime
import numpy as np
from tqdm import tqdm
import os
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import pickle as pkl
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
con_discre = pd.read_csv(f"C:/Users/zacha/Projects/fall_2023/capstone/final_project/text_data/consumer discretionary/consumer discretionary")
con_goods = pd.read_csv(f"C:/Users/zacha/Projects/fall_2023/capstone/final_project/text_data/consumer goods/consumer goods.csv")
financial = pd.read_csv(f"C:/Users/zacha/Projects/fall_2023/capstone/final_project/text_data/financial/financial.csv")
healthcare = pd.read_csv(f"C:/Users/zacha/Projects/fall_2023/capstone/final_project/text_data/healthcare/healthcare.csv")
semiconductors = pd.read_csv(f"C:/Users/zacha/Projects/fall_2023/capstone/final_project/text_data/semiconductors/semiconductors")
telecommunications = pd.read_csv(f"C:/Users/zacha/Projects/fall_2023/capstone/final_project/text_data/telecommunications/telecommunications")
utilities = pd.read_csv(f"C:/Users/zacha/Projects/fall_2023/capstone/final_project/text_data/utilities/utilities")

df_list = [con_discre, con_goods, financial, healthcare, semiconductors, telecommunications, utilities]


In [3]:
df = pd.concat(df_list, ignore_index=True)
print(len(df))

# drop unnecessary columns
df = df.drop(['_id', 'web_url', 'document_type', 'type_of_material', 'word_count', 'keywords', 'query'], axis=1)
df.head()

44487


Unnamed: 0,pub_date,text
0,2020-07-16T20:47:33+0000,"Retail sales increased 7.5 percent, but there ..."
1,2018-11-22T10:00:13+0000,Consumer confidence points to open pocketbooks...
2,2019-01-03T17:01:18+0000,"HONG KONG — For years, no matter what was happ..."
3,2018-01-01T00:42:15+0000,\n\n By KARL RUSSELL and \n \nLANDON T...
4,2019-08-12T19:56:58+0000,"Stocks fell sharply on Wall Street on Monday, ..."


In [4]:
con_discre = None
con_goods = None
financial = None
healthcare = None
semiconductors = None
telecommunications = None
utilities = None

In [5]:
df.rename(columns={'pub_date': 'DATE'}, inplace=True)

# format publication date to a way that can be 
df.DATE = df.DATE.apply(lambda x: x[5:7] + "/" + x[8:10] + "/" + x[0:4])

df.DATE = pd.to_datetime(df.DATE, format='%m/%d/%Y')
df.DATE = pd.to_datetime(df['DATE'])

# Sort the DataFrame by the datetime column
df = df.sort_values(by='DATE')

# Define the date threshold
st_threshold_date = pd.to_datetime('1/2/2018', format='%m/%d/%Y')
end_threshold_date = pd.to_datetime('3/31/2023', format='%m/%d/%Y')

# Filter rows based on the threshold date
df = df[df.DATE >= st_threshold_date]
df = df[df.DATE <= end_threshold_date]

In [6]:
# filter rows with strings longer than 125,000 to prevent the program from crashing
df = df[df["text"].str.len() <= 125000]

In [7]:
df = df.drop_duplicates(subset="DATE", keep='first')
len(df)

1914

In [8]:
df.reset_index(drop=True, inplace=True)

In [9]:
# to clean data
def remove_substring(text, substring, window_size):
    while substring in text:
        index = text.find(substring)
        if index != -1:
            text = text[:index] + text[index + len(substring):]

    for i in range(len(text) - window_size + 1):
        current_window = text[i:i + window_size]
        if substring in current_window:
            return False
    return text

In [10]:
# adjust to what you want to remove
substring_to_avoid = ["\n", "•", "\t"]
window_size = 2
for substr in substring_to_avoid:
    df.text = df.text.apply(lambda x: remove_substring(str(x), substr, window_size))

In [13]:
def compute_sentiment(text, chunksize=512):

    tokens = tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')

    input_id_chunks = tokens['input_ids'][0].split(510)
    mask_chunks = tokens['attention_mask'][0].split(510)

    input_id_chunks = list(input_id_chunks)
    mask_chunks = list(mask_chunks)

    for t in range(len(input_id_chunks)):
        input_id_chunks[t] = torch.cat([torch.Tensor([101]), input_id_chunks[t], torch.Tensor([102])])
        mask_chunks[t] = torch.cat([torch.Tensor([1]), mask_chunks[t], torch.Tensor([1])])

        pad_length = chunksize - input_id_chunks[t].shape[0]
        if pad_length > 0:
            input_id_chunks[t] = torch.cat([input_id_chunks[t], torch.Tensor([0] * pad_length)])
            mask_chunks[t] = torch.cat([mask_chunks[t], torch.Tensor([0] * pad_length)])

    input_ids = torch.stack(input_id_chunks)
    attention_mask = torch.stack(mask_chunks)

    input_ids = input_ids
    attention_mask = attention_mask

    input_dictionary = {'input_ids': input_ids.long(), 'attention_mask': attention_mask.int()}


    outputs = model(**input_dictionary)

    probs = torch.nn.functional.softmax(outputs[0], dim=-1)
    mean = probs.mean(dim=0)

    agg_prob = torch.nn.functional.softmax(outputs[0], dim=-1)
    agg_mean = probs.mean(dim=0)
    arching_sent = torch.argmax(agg_mean).item()
    agg_mean = torch.Tensor.tolist(agg_mean)
    
    if arching_sent == 0:
        arching_sent = 1
        return [agg_mean, arching_sent]
    elif arching_sent == 1:
        arching_sent = -1
        return [agg_mean, arching_sent]
    elif arching_sent == 2:
        arching_sent = 0
        return [agg_mean, arching_sent]
    

In [14]:
# Specify the model name
model_name = "ProsusAI/finbert"

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained(model_name, use_fast=False)
model = BertForSequenceClassification.from_pretrained(model_name)

In [15]:
df["all_sent_finbert"] = np.nan
df["all_sent_finbert"] = df["all_sent_finbert"].astype('object')
df["dom_sent_finbert"] = np.nan
df["all_sent_news"] = np.nan
df["all_sent_news"] = df["all_sent_news"].astype('object')
df["dom_sent_news"] = np.nan
df.head()

Unnamed: 0,DATE,text,all_sent_finbert,dom_sent_finbert,all_sent_news,dom_sent_news
0,2018-01-02,"Senator Orrin G. Hatch of Utah, the longest-se...",,,,
1,2018-01-03,Good Wednesday. Here’s what we’re watching: T...,,,,
2,2018-01-04,Tesla’s first mass-market offering keeps encou...,,,,
3,2018-01-05,Afghan officials have pleaded with three Ameri...,,,,
4,2018-01-06,Humira is the best-selling prescription drug i...,,,,


In [16]:
# computational issues using apply. Used this as a work around
length_df = len(df)
print(f"length of the dataframe is {length_df}")
count = 0

for index, row in df.iterrows():
    text = row['text']
    results = compute_sentiment(text)
    df.at[index, "all_sent_finbert"] = results[0]
    df.at[index, "dom_sent_finbert"] = results[1]
    count += 1
    print(f"done row {count} of {length_df}")

length of the dataframe is 1914


Token indices sequence length is longer than the specified maximum sequence length for this model (4015 > 512). Running this sequence through the model will result in indexing errors


done row 1 of 1914
done row 2 of 1914
done row 3 of 1914
done row 4 of 1914
done row 5 of 1914
done row 6 of 1914
done row 7 of 1914
done row 8 of 1914
done row 9 of 1914
done row 10 of 1914
done row 11 of 1914
done row 12 of 1914
done row 13 of 1914
done row 14 of 1914
done row 15 of 1914
done row 16 of 1914
done row 17 of 1914
done row 18 of 1914
done row 19 of 1914
done row 20 of 1914
done row 21 of 1914
done row 22 of 1914
done row 23 of 1914
done row 24 of 1914
done row 25 of 1914
done row 26 of 1914
done row 27 of 1914
done row 28 of 1914
done row 29 of 1914
done row 30 of 1914
done row 31 of 1914
done row 32 of 1914
done row 33 of 1914
done row 34 of 1914
done row 35 of 1914
done row 36 of 1914
done row 37 of 1914
done row 38 of 1914
done row 39 of 1914
done row 40 of 1914
done row 41 of 1914
done row 42 of 1914
done row 43 of 1914
done row 44 of 1914
done row 45 of 1914
done row 46 of 1914
done row 47 of 1914
done row 48 of 1914
done row 49 of 1914
done row 50 of 1914
done row 

done row 397 of 1914
done row 398 of 1914
done row 399 of 1914
done row 400 of 1914
done row 401 of 1914
done row 402 of 1914
done row 403 of 1914
done row 404 of 1914
done row 405 of 1914
done row 406 of 1914
done row 407 of 1914
done row 408 of 1914
done row 409 of 1914
done row 410 of 1914
done row 411 of 1914
done row 412 of 1914
done row 413 of 1914
done row 414 of 1914
done row 415 of 1914
done row 416 of 1914
done row 417 of 1914
done row 418 of 1914
done row 419 of 1914
done row 420 of 1914
done row 421 of 1914
done row 422 of 1914
done row 423 of 1914
done row 424 of 1914
done row 425 of 1914
done row 426 of 1914
done row 427 of 1914
done row 428 of 1914
done row 429 of 1914
done row 430 of 1914
done row 431 of 1914
done row 432 of 1914
done row 433 of 1914
done row 434 of 1914
done row 435 of 1914
done row 436 of 1914
done row 437 of 1914
done row 438 of 1914
done row 439 of 1914
done row 440 of 1914
done row 441 of 1914
done row 442 of 1914
done row 443 of 1914
done row 444 

done row 788 of 1914
done row 789 of 1914
done row 790 of 1914
done row 791 of 1914
done row 792 of 1914
done row 793 of 1914
done row 794 of 1914
done row 795 of 1914
done row 796 of 1914
done row 797 of 1914
done row 798 of 1914
done row 799 of 1914
done row 800 of 1914
done row 801 of 1914
done row 802 of 1914
done row 803 of 1914
done row 804 of 1914
done row 805 of 1914
done row 806 of 1914
done row 807 of 1914
done row 808 of 1914
done row 809 of 1914
done row 810 of 1914
done row 811 of 1914
done row 812 of 1914
done row 813 of 1914
done row 814 of 1914
done row 815 of 1914
done row 816 of 1914
done row 817 of 1914
done row 818 of 1914
done row 819 of 1914
done row 820 of 1914
done row 821 of 1914
done row 822 of 1914
done row 823 of 1914
done row 824 of 1914
done row 825 of 1914
done row 826 of 1914
done row 827 of 1914
done row 828 of 1914
done row 829 of 1914
done row 830 of 1914
done row 831 of 1914
done row 832 of 1914
done row 833 of 1914
done row 834 of 1914
done row 835 

done row 1171 of 1914
done row 1172 of 1914
done row 1173 of 1914
done row 1174 of 1914
done row 1175 of 1914
done row 1176 of 1914
done row 1177 of 1914
done row 1178 of 1914
done row 1179 of 1914
done row 1180 of 1914
done row 1181 of 1914
done row 1182 of 1914
done row 1183 of 1914
done row 1184 of 1914
done row 1185 of 1914
done row 1186 of 1914
done row 1187 of 1914
done row 1188 of 1914
done row 1189 of 1914
done row 1190 of 1914
done row 1191 of 1914
done row 1192 of 1914
done row 1193 of 1914
done row 1194 of 1914
done row 1195 of 1914
done row 1196 of 1914
done row 1197 of 1914
done row 1198 of 1914
done row 1199 of 1914
done row 1200 of 1914
done row 1201 of 1914
done row 1202 of 1914
done row 1203 of 1914
done row 1204 of 1914
done row 1205 of 1914
done row 1206 of 1914
done row 1207 of 1914
done row 1208 of 1914
done row 1209 of 1914
done row 1210 of 1914
done row 1211 of 1914
done row 1212 of 1914
done row 1213 of 1914
done row 1214 of 1914
done row 1215 of 1914
done row 1

done row 1544 of 1914
done row 1545 of 1914
done row 1546 of 1914
done row 1547 of 1914
done row 1548 of 1914
done row 1549 of 1914
done row 1550 of 1914
done row 1551 of 1914
done row 1552 of 1914
done row 1553 of 1914
done row 1554 of 1914
done row 1555 of 1914
done row 1556 of 1914
done row 1557 of 1914
done row 1558 of 1914
done row 1559 of 1914
done row 1560 of 1914
done row 1561 of 1914
done row 1562 of 1914
done row 1563 of 1914
done row 1564 of 1914
done row 1565 of 1914
done row 1566 of 1914
done row 1567 of 1914
done row 1568 of 1914
done row 1569 of 1914
done row 1570 of 1914
done row 1571 of 1914
done row 1572 of 1914
done row 1573 of 1914
done row 1574 of 1914
done row 1575 of 1914
done row 1576 of 1914
done row 1577 of 1914
done row 1578 of 1914
done row 1579 of 1914
done row 1580 of 1914
done row 1581 of 1914
done row 1582 of 1914
done row 1583 of 1914
done row 1584 of 1914
done row 1585 of 1914
done row 1586 of 1914
done row 1587 of 1914
done row 1588 of 1914
done row 1

In [17]:
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

In [18]:
# computational issues using apply. Used this as a work around
length_df = len(df)
print(f"length of the dataframe is {length_df}")
count = 0

for index, row in df.iterrows():
    text = row['text']
    results = compute_sentiment(text)
    df.at[index, "all_sent_news"] = results[0]
    df.at[index, "dom_sent_news"] = results[1]
    count += 1
    print(f"done row {count} of {length_df}")

length of the dataframe is 1914


Token indices sequence length is longer than the specified maximum sequence length for this model (4028 > 512). Running this sequence through the model will result in indexing errors


done row 1 of 1914
done row 2 of 1914
done row 3 of 1914
done row 4 of 1914
done row 5 of 1914
done row 6 of 1914
done row 7 of 1914
done row 8 of 1914
done row 9 of 1914
done row 10 of 1914
done row 11 of 1914
done row 12 of 1914
done row 13 of 1914
done row 14 of 1914
done row 15 of 1914
done row 16 of 1914
done row 17 of 1914
done row 18 of 1914
done row 19 of 1914
done row 20 of 1914
done row 21 of 1914
done row 22 of 1914
done row 23 of 1914
done row 24 of 1914
done row 25 of 1914
done row 26 of 1914
done row 27 of 1914
done row 28 of 1914
done row 29 of 1914
done row 30 of 1914
done row 31 of 1914
done row 32 of 1914
done row 33 of 1914
done row 34 of 1914
done row 35 of 1914
done row 36 of 1914
done row 37 of 1914
done row 38 of 1914
done row 39 of 1914
done row 40 of 1914
done row 41 of 1914
done row 42 of 1914
done row 43 of 1914
done row 44 of 1914
done row 45 of 1914
done row 46 of 1914
done row 47 of 1914
done row 48 of 1914
done row 49 of 1914
done row 50 of 1914
done row 

done row 397 of 1914
done row 398 of 1914
done row 399 of 1914
done row 400 of 1914
done row 401 of 1914
done row 402 of 1914
done row 403 of 1914
done row 404 of 1914
done row 405 of 1914
done row 406 of 1914
done row 407 of 1914
done row 408 of 1914
done row 409 of 1914
done row 410 of 1914
done row 411 of 1914
done row 412 of 1914
done row 413 of 1914
done row 414 of 1914
done row 415 of 1914
done row 416 of 1914
done row 417 of 1914
done row 418 of 1914
done row 419 of 1914
done row 420 of 1914
done row 421 of 1914
done row 422 of 1914
done row 423 of 1914
done row 424 of 1914
done row 425 of 1914
done row 426 of 1914
done row 427 of 1914
done row 428 of 1914
done row 429 of 1914
done row 430 of 1914
done row 431 of 1914
done row 432 of 1914
done row 433 of 1914
done row 434 of 1914
done row 435 of 1914
done row 436 of 1914
done row 437 of 1914
done row 438 of 1914
done row 439 of 1914
done row 440 of 1914
done row 441 of 1914
done row 442 of 1914
done row 443 of 1914
done row 444 

done row 788 of 1914
done row 789 of 1914
done row 790 of 1914
done row 791 of 1914
done row 792 of 1914
done row 793 of 1914
done row 794 of 1914
done row 795 of 1914
done row 796 of 1914
done row 797 of 1914
done row 798 of 1914
done row 799 of 1914
done row 800 of 1914
done row 801 of 1914
done row 802 of 1914
done row 803 of 1914
done row 804 of 1914
done row 805 of 1914
done row 806 of 1914
done row 807 of 1914
done row 808 of 1914
done row 809 of 1914
done row 810 of 1914
done row 811 of 1914
done row 812 of 1914
done row 813 of 1914
done row 814 of 1914
done row 815 of 1914
done row 816 of 1914
done row 817 of 1914
done row 818 of 1914
done row 819 of 1914
done row 820 of 1914
done row 821 of 1914
done row 822 of 1914
done row 823 of 1914
done row 824 of 1914
done row 825 of 1914
done row 826 of 1914
done row 827 of 1914
done row 828 of 1914
done row 829 of 1914
done row 830 of 1914
done row 831 of 1914
done row 832 of 1914
done row 833 of 1914
done row 834 of 1914
done row 835 

done row 1171 of 1914
done row 1172 of 1914
done row 1173 of 1914
done row 1174 of 1914
done row 1175 of 1914
done row 1176 of 1914
done row 1177 of 1914
done row 1178 of 1914
done row 1179 of 1914
done row 1180 of 1914
done row 1181 of 1914
done row 1182 of 1914
done row 1183 of 1914
done row 1184 of 1914
done row 1185 of 1914
done row 1186 of 1914
done row 1187 of 1914
done row 1188 of 1914
done row 1189 of 1914
done row 1190 of 1914
done row 1191 of 1914
done row 1192 of 1914
done row 1193 of 1914
done row 1194 of 1914
done row 1195 of 1914
done row 1196 of 1914
done row 1197 of 1914
done row 1198 of 1914
done row 1199 of 1914
done row 1200 of 1914
done row 1201 of 1914
done row 1202 of 1914
done row 1203 of 1914
done row 1204 of 1914
done row 1205 of 1914
done row 1206 of 1914
done row 1207 of 1914
done row 1208 of 1914
done row 1209 of 1914
done row 1210 of 1914
done row 1211 of 1914
done row 1212 of 1914
done row 1213 of 1914
done row 1214 of 1914
done row 1215 of 1914
done row 1

done row 1544 of 1914
done row 1545 of 1914
done row 1546 of 1914
done row 1547 of 1914
done row 1548 of 1914
done row 1549 of 1914
done row 1550 of 1914
done row 1551 of 1914
done row 1552 of 1914
done row 1553 of 1914
done row 1554 of 1914
done row 1555 of 1914
done row 1556 of 1914
done row 1557 of 1914
done row 1558 of 1914
done row 1559 of 1914
done row 1560 of 1914
done row 1561 of 1914
done row 1562 of 1914
done row 1563 of 1914
done row 1564 of 1914
done row 1565 of 1914
done row 1566 of 1914
done row 1567 of 1914
done row 1568 of 1914
done row 1569 of 1914
done row 1570 of 1914
done row 1571 of 1914
done row 1572 of 1914
done row 1573 of 1914
done row 1574 of 1914
done row 1575 of 1914
done row 1576 of 1914
done row 1577 of 1914
done row 1578 of 1914
done row 1579 of 1914
done row 1580 of 1914
done row 1581 of 1914
done row 1582 of 1914
done row 1583 of 1914
done row 1584 of 1914
done row 1585 of 1914
done row 1586 of 1914
done row 1587 of 1914
done row 1588 of 1914
done row 1

In [19]:
df.head()

Unnamed: 0,DATE,text,all_sent_finbert,dom_sent_finbert,all_sent_news,dom_sent_news
0,2018-01-02,"Senator Orrin G. Hatch of Utah, the longest-se...","[0.026327209547162056, 0.8391094207763672, 0.1...",-1.0,"[0.9886341094970703, 0.0021902157459408045, 0....",1.0
1,2018-01-03,Good Wednesday. Here’s what we’re watching: T...,"[0.09220370650291443, 0.5681420564651489, 0.33...",-1.0,"[0.3796953856945038, 0.12226345390081406, 0.49...",0.0
2,2018-01-04,Tesla’s first mass-market offering keeps encou...,"[0.169036403298378, 0.6013268232345581, 0.2296...",-1.0,"[0.553765058517456, 0.00035269063664600253, 0....",1.0
3,2018-01-05,Afghan officials have pleaded with three Ameri...,"[0.019375961273908615, 0.8045820593833923, 0.1...",-1.0,"[0.9749003648757935, 0.018721289932727814, 0.0...",1.0
4,2018-01-06,Humira is the best-selling prescription drug i...,"[0.8553577661514282, 0.013487294316291809, 0.1...",1.0,"[0.00017539261898491532, 0.0001776907447492703...",0.0


In [25]:
df = df.drop("text", axis=1)
df.head()

Unnamed: 0,DATE,all_sent_finbert,dom_sent_finbert,all_sent_news,dom_sent_news
0,2018-01-02,"[0.026327209547162056, 0.8391094207763672, 0.1...",-1.0,"[0.9886341094970703, 0.0021902157459408045, 0....",1.0
1,2018-01-03,"[0.09220370650291443, 0.5681420564651489, 0.33...",-1.0,"[0.3796953856945038, 0.12226345390081406, 0.49...",0.0
2,2018-01-04,"[0.169036403298378, 0.6013268232345581, 0.2296...",-1.0,"[0.553765058517456, 0.00035269063664600253, 0....",1.0
3,2018-01-05,"[0.019375961273908615, 0.8045820593833923, 0.1...",-1.0,"[0.9749003648757935, 0.018721289932727814, 0.0...",1.0
4,2018-01-06,"[0.8553577661514282, 0.013487294316291809, 0.1...",1.0,"[0.00017539261898491532, 0.0001776907447492703...",0.0


In [26]:
df.to_csv("C:/Users/zacha/Projects/fall_2023/capstone/final_project/text_data/final.csv", index=False)