In [1]:
%pip install flair

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

from flair.models import TextClassifier
from flair.data import Sentence

import nltk
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
from nltk.stem.wordnet import WordNetLemmatizer
from datetime import datetime
import string
import re
import json

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# Raw headline tables, loaded into notebook-level names for ad-hoc inspection.
# flair_all() reads the same files again into its own local variables, so
# nothing in the visible cells depends on these globals.
df_world, df_politics, df_coronavirus = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/News/Global/world_news.csv',
        '../Data/News/Global/politics_news.csv',
        '../Data/News/Global/coronavirus_news.csv',
    )
)
df_aapl, df_meta, df_tsla = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/News/Stock/aapl_news.csv',
        '../Data/News/Stock/meta_news.csv',
        '../Data/News/Stock/tsla_news.csv',
    )
)

In [4]:
def remove_irrelevant_content(text):
    """Blank out boilerplate "headline-only" articles.

    Returns an empty string when ``text`` contains the fixed disclaimer
    sentence below; otherwise returns ``text`` unchanged.
    """
    boilerplate_marker = "This headline-only article is meant to show you why a stock is moving, the most difficult aspect of stock trading"
    return "" if boilerplate_marker in text else text

def remove_punctuation(text):
    """Strip every character that is neither a word character nor whitespace.

    Word characters include letters, digits and the underscore, so those
    are kept; whitespace (including newlines) is kept as well.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_stopwords(text):
    """Drop English stop words from a whitespace-delimited string.

    The text is split on whitespace, tokens found in NLTK's English
    stop-word list are discarded, and the remaining tokens are re-joined
    with single spaces. Matching is case-sensitive, so callers should
    lowercase the text first if mixed-case stop words must be removed.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        ``text`` without stop words; runs of whitespace collapse to one space.
    """
    # A frozenset gives constant-time membership tests; the original checked
    # every token against a list, which is a linear scan per word.
    english_stopwords = frozenset(nltk.corpus.stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in english_stopwords)

def remove_special_character(text):
    """Replace every newline character in ``text`` with a single space."""
    line_fragments = text.split('\n')
    return ' '.join(line_fragments)

def lemmatize(text):
    """Lemmatize tokens as verbs with NLTK's WordNetLemmatizer.

    Two input shapes are supported:

    * ``str`` - the string is split on whitespace, each token is
      lemmatized, and the tokens are re-joined with single spaces. This is
      the shape the other text helpers in this notebook use. The previous
      implementation iterated a string character by character and returned
      a list of single-character lists.
    * iterable of token iterables (e.g. a list of token lists) - each inner
      token is lemmatized and a list of lists is returned, as before.

    Parameters
    ----------
    text : str or iterable of iterables of str
        Raw text, or an already tokenized corpus.

    Returns
    -------
    str or list of list of str
        Same shape as the input, with every token lemmatized (``pos='v'``).
    """
    lem = WordNetLemmatizer()

    if isinstance(text, str):
        return ' '.join(lem.lemmatize(token, pos='v') for token in text.split())

    # Tokenized corpus: keep the original list-of-lists contract.
    return [[lem.lemmatize(token, pos='v') for token in document] for document in text]

def preprocess_text(text):
    """Run the active cleaning steps over ``text`` and return the result.

    Steps, in order: blank out headline-only boilerplate, strip
    punctuation, lowercase, and replace newlines with spaces.
    Stop-word removal and lemmatization are defined in this notebook but
    are intentionally not part of the pipeline.
    """
    active_steps = (
        remove_irrelevant_content,
        remove_punctuation,
        lowercase,
        # remove_stopwords is disabled (it would run here)
        remove_special_character,
        # lemmatize is disabled (it would run here)
    )
    for step in active_steps:
        text = step(text)
    return text

In [5]:
# Load flair's 'en-sentiment' text classifier once at notebook level;
# flair_process() below reads this global when scoring headlines.
# On first use the model file is downloaded and cached (see the log output).
classifier = TextClassifier.load('en-sentiment')

2022-10-20 23:33:59,276 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /var/folders/yv/9y1rb7mx4szd2ll5qlqxss040000gn/T/tmpdq45bnny


100%|██████████| 265512723/265512723 [01:10<00:00, 3740441.30B/s]

2022-10-20 23:35:11,192 copying /var/folders/yv/9y1rb7mx4szd2ll5qlqxss040000gn/T/tmpdq45bnny to cache at /Users/eltontay/.flair/models/sentiment-en-mix-distillbert_4.pt





2022-10-20 23:35:11,271 removing temp file /var/folders/yv/9y1rb7mx4szd2ll5qlqxss040000gn/T/tmpdq45bnny
2022-10-20 23:35:11,289 loading file /Users/eltontay/.flair/models/sentiment-en-mix-distillbert_4.pt


Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 9.89kB/s]
Downloading: 100%|██████████| 483/483 [00:00<00:00, 126kB/s]
Downloading: 100%|██████████| 232k/232k [00:01<00:00, 231kB/s]  
Downloading: 100%|██████████| 466k/466k [00:02<00:00, 192kB/s]  


In [6]:
def flair_process(df, mini_batch_size=32):
    """Score the sentiment of every headline in ``df`` with the flair classifier.

    The frame is modified in place and also returned. Only titles are
    scored; article bodies are not processed.

    Columns written:

    * ``Date`` - parsed with ``pd.to_datetime``.
    * ``Processed Title`` - ``Title`` after ``preprocess_text``.
    * ``Sentence Title`` - flair ``Sentence`` built from the processed title.
    * ``Label Title`` - value of the first predicted label, or ``None`` when
      the sentence carries no label.
    * ``Score Title`` - score of the first label, negated for ``NEGATIVE``
      labels, or NaN when the sentence carries no label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date`` and ``Title`` columns.
    mini_batch_size : int, optional
        Batch size forwarded to ``classifier.predict``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns above added or overwritten.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    df['Processed Title'] = df['Title'].apply(preprocess_text)
    df['Sentence Title'] = df['Processed Title'].apply(Sentence)

    # Predict in one batched call instead of one model call per row; labels
    # are attached to the Sentence objects already stored in the column.
    sentences = df['Sentence Title'].tolist()
    if sentences:
        classifier.predict(sentences, mini_batch_size=mini_batch_size)

    # preprocess_text can return "" (headline-only boilerplate). A sentence
    # built from an empty string may come back without labels, which made the
    # previous unconditional labels[0] lookup raise IndexError.
    df['Label Title'] = df['Sentence Title'].apply(
        lambda sentence: sentence.labels[0].value if sentence.labels else None
    )
    df['Score Title'] = df['Sentence Title'].apply(
        lambda sentence: sentence.labels[0].score if sentence.labels else float('nan')
    )

    # Fold the label into the score: NEGATIVE predictions become negative.
    is_negative = df['Label Title'] == 'NEGATIVE'
    df.loc[is_negative, 'Score Title'] = -df.loc[is_negative, 'Score Title']

    return df

In [7]:
def flair_all():
    """Score all six news datasets with flair and write the results to CSV.

    All raw CSVs are read first, then each frame is passed through
    ``flair_process`` (printing a progress line after each one), and
    finally every scored frame is written to its ``*_flair.csv`` file
    without the index. Returns ``None``.
    """
    # Insertion order of each table fixes the order of the corresponding phase.
    raw_csv_by_name = {
        'world': '../Data/News/Global/world_news.csv',
        'politics': '../Data/News/Global/politics_news.csv',
        'coronavirus': '../Data/News/Global/coronavirus_news.csv',
        'aapl': '../Data/News/Stock/aapl_news.csv',
        'meta': '../Data/News/Stock/meta_news.csv',
        'tsla': '../Data/News/Stock/tsla_news.csv',
    }
    progress_line_by_name = {
        'aapl': "aapl processed, starting meta",
        'meta': "meta processed, starting tsla",
        'tsla': "tsla processed, starting world",
        'world': "world processed, starting politics",
        'politics': "politics processed, starting corona",
        'coronavirus': "corona processed",
    }
    scored_csv_by_name = {
        'aapl': '../Data-Processed/News/Stock/aapl_flair.csv',
        'meta': '../Data-Processed/News/Stock/meta_flair.csv',
        'tsla': '../Data-Processed/News/Stock/tsla_flair.csv',
        'world': '../Data-Processed/News/Global/world_flair.csv',
        'politics': '../Data-Processed/News/Global/politics_flair.csv',
        'coronavirus': '../Data-Processed/News/Global/coronavirus_flair.csv',
    }

    frames = {name: pd.read_csv(path) for name, path in raw_csv_by_name.items()}

    for name, progress_line in progress_line_by_name.items():
        frames[name] = flair_process(frames[name])
        print(progress_line)

    for name, path in scored_csv_by_name.items():
        frames[name].to_csv(path, index=False)

In [8]:
# Run the full scoring pass: reads the six raw news CSVs, scores every title,
# and writes the six *_flair.csv files that the aggregation cells read back.
flair_all()

aapl processed, starting meta
meta processed, starting tsla
tsla processed, starting world
world processed, starting politics
politics processed, starting corona
corona processed


In [13]:
def sentiment_aggregator(df, title = True):
    """Return the mean ``Score Title`` for each ``Date`` in ``df``.

    flair produces a single signed score per headline, so the per-date
    aggregate is simply the mean of those scores.

    ``title`` is accepted for interface compatibility but is currently
    ignored: title scores are aggregated whatever its value, because no
    article-text score column is produced.
    """
    title_scores_by_date = df.groupby('Date')['Score Title']
    return title_scores_by_date.mean()
        

In [14]:
def aggregate_sentiment_all(title):
    """Combine the per-date mean sentiment of all six datasets into one frame.

    Reads every ``*_flair.csv`` file, aggregates each with
    ``sentiment_aggregator`` (``title`` is forwarded unchanged), and
    concatenates the resulting series column-wise. The returned frame has
    the columns AAPL, META, TSLA, World, Politics, Coronavirus, in that order.
    """
    flair_csv_by_column = {
        "World": '../Data-Processed/News/Global/world_flair.csv',
        "Politics": '../Data-Processed/News/Global/politics_flair.csv',
        "Coronavirus": '../Data-Processed/News/Global/coronavirus_flair.csv',
        "AAPL": '../Data-Processed/News/Stock/aapl_flair.csv',
        "META": '../Data-Processed/News/Stock/meta_flair.csv',
        "TSLA": '../Data-Processed/News/Stock/tsla_flair.csv',
    }
    scored_frames = {
        column: pd.read_csv(path) for column, path in flair_csv_by_column.items()
    }

    # Column order of the combined frame: stocks first, then global topics.
    column_order = ["AAPL", "META", "TSLA", "World", "Politics", "Coronavirus"]
    daily_means = [
        sentiment_aggregator(scored_frames[column], title=title)
        for column in column_order
    ]

    return pd.concat(daily_means, keys=column_order, axis=1)

In [16]:
# Build the combined per-date title-sentiment table, order it by date,
# and save it (the Date index is written as the first CSV column).
df_all = (
    aggregate_sentiment_all(title=True)
    .sort_values(by="Date")
)
df_all.to_csv('../Data-Processed/all_flair.csv')