In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
     ---------------------------------------- 5.3/5.3 MB 8.2 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
     -------------------------------------- 163.5/163.5 KB 4.9 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.8.0-py3-none-any.whl (10 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp310-cp310-win_amd64.whl (3.3 MB)
     ---------------------------------------- 3.3/3.3 MB 12.3 MB/s eta 0:00:00
Installing collected packages: tokenizers, filelock, huggingface-hub, transformers
Successfully installed filelock-3.8.0 huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


You should consider upgrading via the 'C:\Users\65831\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [3]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

import string
import re
import json

import nltk
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\65831\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\65831\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\65831\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [5]:
df_world= pd.read_csv('../Data/News/Global/world_news.csv')
df_politics= pd.read_csv('../Data/News/Global/politics_news.csv')
df_coronavirus= pd.read_csv('../Data/News/Global/coronavirus_news.csv')
df_aapl= pd.read_csv('../Data/News/Stock/aapl_news.csv')
df_meta= pd.read_csv('../Data/News/Stock/meta_news.csv')
df_tsla= pd.read_csv('../Data/News/Stock/tsla_news.csv')

In [6]:
# will transfer this to another .py file so we can import to both finbert and vader
def remove_irrelevant_content(text):
    headline_only_string = "This headline-only article is meant to show you why a stock is moving, the most difficult aspect of stock trading"

    if headline_only_string in text:
        return ""
    else:
        return text

def remove_punctuation(text):
    text = re.sub(r'[^\w\s]', '', text)
    return text

def lowercase(text):
    return text.lower()

def remove_stopwords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text

def remove_special_character(text):
    text = text.replace('\n', ' ') 
    return text

def lemmatize(text):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in text]
    return corpus

def preprocess_text(text):
    text = remove_irrelevant_content(text)
    text = remove_punctuation(text)
    text = lowercase(text)
    # text = remove_stopwords(text)
    text = remove_special_character(text)
    # text = lemmatize(text)
    return text

In [12]:
def finbert_chunk_process(df) : 
    print("Processing chunk")
    df['Date'] = pd.to_datetime(df['Date'])
    df['Processed Title'] = df['Title'].apply(lambda x: preprocess_text(x))
    df['Processed Text'] = df['Text'].apply(lambda x: preprocess_text(x))
   
    # title
    news_title = df['Processed Title'].to_list()
    inputs = tokenizer(news_title, padding = True, truncation = True, return_tensors='pt')
    outputs = model(**inputs)

    predictions_title = torch.nn.functional.softmax(outputs.logits, dim=-1)

    model.config.id2label

    df['Positive_Title'] = predictions_title[:, 0].tolist()
    df['Negative_Title'] = predictions_title[:, 1].tolist()
    df['Neutral_Title'] = predictions_title[:, 2].tolist()

    # text
    news_text = df['Processed Text'].to_list()
    inputs = tokenizer(news_text, padding = True, truncation = True, return_tensors='pt')
    outputs = model(**inputs)

    predictions_text = torch.nn.functional.softmax(outputs.logits, dim=-1)

    model.config.id2label

    df['Positive_Text'] = predictions_text[:, 0].tolist()
    df['Negative_Text'] = predictions_text[:, 1].tolist()
    df['Neutral_Text'] = predictions_text[:, 2].tolist()


    return df

def finbert_process(df, chunk_size=50):
    print("Breaking data into smaller chunks")
    chunks = [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)] 

    lst = []
    for chunk in chunks:
        lst.append(finbert_chunk_process(chunk))
    
    output = pd.concat(lst)
    
    return output
    

In [37]:
def sentiment_aggregator(df, title = True, type = "abs_max"):
    """
    Aggregates sentiments on a per day basis.

    Parameters
    ----------
    df: DataFrame
        Dataset generated after sentiment analysis.
    title: boolean
        To indicate if the news title or news body text is used to generate the aggregated sentiment. 
        Default is True (ie. News title is used for aggregated sentiment)
    type: Str {"mean", "abs_max"}
        To indicate method of calculation.
        "mean": Group by Date and takes mean of "Compound"
        "abs_max": Calculates the absolute max of "Positive" and "Negative" column. Then group by Date and takes mean of this new column

    Returns
    -------
    Output : Series
        Contains aggregated sentiment for each day
    """
    if title:
        positive = "Positive_Title"
        negative = "Negative_Title"
    else:
        positive = "Positive_Text"
        negative = "Negative_Text"

    if type == "mean":
        df["Compound"] = df[positive] - df[negative]
        return df.groupby('Date')['Compound'].aggregate('mean')

    elif type == "abs_max":
        df['Negative'] = -df[negative]
        df['Positive'] = df[positive]
        df['Sentiment'] = df.apply(lambda x: max(x["Negative"], x["Positive"], key=abs), axis=1)

        return df.groupby('Date')['Sentiment'].aggregate('mean')


In [19]:
def finbert_all():
    df_world= pd.read_csv('../Data/News/Global/world_news.csv')
    df_politics= pd.read_csv('../Data/News/Global/politics_news.csv')
    df_coronavirus= pd.read_csv('../Data/News/Global/coronavirus_news.csv')
    df_aapl= pd.read_csv('../Data/News/Stock/aapl_news.csv')
    df_meta= pd.read_csv('../Data/News/Stock/meta_news.csv')
    df_tsla= pd.read_csv('../Data/News/Stock/tsla_news.csv')
    
    df_aapl = finbert_process(df_aapl)
    df_meta = finbert_process(df_meta)
    df_tsla = finbert_process(df_tsla)
    df_world = finbert_process(df_world)
    df_politics = finbert_process(df_politics)
    df_coronavirus = finbert_process(df_coronavirus)

    df_aapl.to_csv('../Data-Processed/News/Stock/aapl_finbert.csv',index=False)
    df_meta.to_csv('../Data-Processed/News/Stock/meta_finbert.csv',index=False)
    df_tsla.to_csv('../Data-Processed/News/Stock/tsla_finbert.csv',index=False)
    df_world.to_csv('../Data-Processed/News/Global/world_finbert.csv',index=False)
    df_politics.to_csv('../Data-Processed/News/Global/politics_finbert.csv',index=False)
    df_coronavirus.to_csv('../Data-Processed/News/Global/coronavirus_finbert.csv',index=False)

def aggregate_sentiment_all(title, type):
    df_world_finbert= pd.read_csv('../Data-Processed/News/Global/world_finbert.csv')
    df_politics_finbert= pd.read_csv('../Data-Processed/News/Global/politics_finbert.csv')
    df_coronavirus_finbert= pd.read_csv('../Data-Processed/News/Global/coronavirus_finbert.csv')
    df_aapl_finbert= pd.read_csv('../Data-Processed/News/Stock/aapl_finbert.csv')
    df_meta_finbert= pd.read_csv('../Data-Processed/News/Stock/meta_finbert.csv')
    df_tsla_finbert= pd.read_csv('../Data-Processed/News/Stock/tsla_finbert.csv')

    aggregated_sentiment_aapl = sentiment_aggregator(df_aapl_finbert, title=title, type=type)
    aggregated_sentiment_meta = sentiment_aggregator(df_meta_finbert, title=title, type=type)
    aggregated_sentiment_tsla = sentiment_aggregator(df_tsla_finbert, title=title, type=type)
    aggregated_sentiment_world = sentiment_aggregator(df_world_finbert, title=title, type=type)
    aggregated_sentiment_politics = sentiment_aggregator(df_politics_finbert, title=title, type=type)
    aggregated_sentiment_coronavirus = sentiment_aggregator(df_coronavirus_finbert, title=title, type=type)

    lst = [aggregated_sentiment_aapl, aggregated_sentiment_meta, aggregated_sentiment_tsla, aggregated_sentiment_world, aggregated_sentiment_politics, aggregated_sentiment_coronavirus]
    keys = ["AAPL", "META", "TSLA", "World", "Politics", "Coronavirus"]
    
    return pd.concat(lst, keys=keys, axis=1)

In [38]:
#finbert_all() #Takes quite long to run
aggregate_sentiment_all(True, "abs_max")

Unnamed: 0_level_0,AAPL,META,TSLA,World,Politics,Coronavirus
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-31,0.331527,0.379263,,,0.379263,
2018-02-01,0.042964,0.379263,0.042964,,,
2018-02-02,0.042964,0.042964,,,,
2018-02-04,0.042964,,,0.042964,,
2018-02-05,0.042964,0.379263,,,,
...,...,...,...,...,...,...
2021-08-01,,,,,0.715562,
2021-09-05,,,,,0.042964,
2022-03-26,,,,,0.042964,
2020-04-11,,,,,,0.042964
