In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
from pathlib import Path

In [2]:
data_path = Path("data")
hist_path = data_path / "processed_history.parquet"
msg_path = data_path / "msgs_w_issuer.parquet"

In [3]:
msg_df = pd.read_parquet(msg_path)

In [17]:
msg_df["symbol"].unique()

array(['EQNR', 'PAR', 'VIZ', ..., 'MAUD', 'POINT', 'HRMKO'], dtype=object)

### Remove issuers with no known stock symbol
Won't have the stock history for it

In [4]:
hist_df = pd.read_parquet(hist_path)

In [5]:
known_syms = hist_df.index.get_level_values(0).unique()
known_syms.shape

(315,)

In [6]:
msg_df["symbol"].nunique()

1279

In [7]:
msg_known_sym = msg_df[msg_df["symbol"].isin(known_syms)]

In [41]:
msg_known_sym["weighted_label_conf"].describe()#.quantile(np.arange(0.1, 0.5, 0.05))

count    193984.000000
mean          0.938056
std           0.119647
min           0.099609
25%           0.914800
50%           0.998200
75%           1.000000
max           1.000000
Name: weighted_label_conf, dtype: float64

In [38]:
msg_known_sym.value_counts("weighted_label")

weighted_label
__label__en     109689
__label__no      81556
__label__sv       1047
__label__da        934
__label__nn        660
__label__de         46
__label__it         30
__label__fr          8
__label__nl          3
__label__pl          2
__label__ca          2
__label__hu          2
__label__nds         1
__label__fi          1
__label__es          1
__label__vi          1
__label__zh          1
dtype: int64

In [8]:
%%time
closing_hours = pd.to_datetime(["16:30:00"]).time[0]
is_next_day = (msg_known_sym["publishedTime_dt"].dt.time > closing_hours)#.astype(int)
msg_w_next_day = msg_known_sym.assign(is_next_day=is_next_day, 
                                      next_day=(msg_known_sym["publishedTime_dt"]
                                                .transform(lambda dt_idx: dt_idx.date + pd.offsets.BDay() * (dt_idx.time > closing_hours).astype(int))
                                               ))

CPU times: user 4.79 s, sys: 463 ms, total: 5.25 s
Wall time: 5.29 s


### Merge sources

In [9]:
msg_w_hist = msg_w_next_day.merge(hist_df, left_on=["symbol", "next_day"], right_index=True)

In [132]:
msg_w_hist["symbol"].nunique()

274

### Calculate ground truth

In [179]:
categories_df = pd.read_csv(data_path / "news_categories.csv")

In [187]:
cat_no = categories_df[["osl_id", "category_no"]].to_dict()
cat_en =  categories_df.drop(columns="category_no").to_dict()

In [194]:
catid2cat_en = {v: cat_en["category_en"][k] for idx, (k, v) in enumerate(cat_en["osl_id"].items(), start=1)}
catid2cat_no = {v: cat_no["category_no"][k] for idx, (k, v) in enumerate(cat_no["osl_id"].items(), start=1)}

In [None]:
# only 1 category per apperantly 

In [214]:
msg_w_hist["cat_id"] = msg_w_hist["category"].str.strip("[]").apply(eval).str["id"]#.count("id").#.apply(ast.literal_eval).str.count()#.len().describe()#.apply(ast.literal_eval)#.explode()


In [262]:
import gc
gc.collect()

1939

In [10]:
msg_labeled = msg_w_hist.assign(is_positive=(msg_w_hist["intra_day_open_close_pct"] > msg_w_hist["ewm_std_3"].div(msg_w_hist["ewm_3"])*100).astype(int).replace({0: np.nan, 1: "POSITIVE"}),
                                is_negative=(msg_w_hist["intra_day_open_close_pct"] < msg_w_hist["ewm_std_3"].div(msg_w_hist["ewm_3"])*100).astype(int).replace({0: np.nan, 1: "NEGATIVE"})
                               )

In [12]:
msg_labeled["label"] = np.nan
msg_labeled = msg_labeled.assign(label=msg_labeled["label"].fillna(msg_labeled["is_positive"]).fillna(msg_labeled["is_negative"]).fillna("NEUTRAL")).astype(str)
msg_labeled

Unnamed: 0,id,messageId,newsId,title,body,category,markets,issuerId,correctionForMessageId,correctedByMessageId,...,dol_vol_ewm_std_7,dol_vol_ewm_std_15,dol_vol_ewm_std_30,d3_avg_pct,d7_avg_pct,d15_avg_pct,d30_avg_pct,is_positive,is_negative,label
30,440750,440750,391579,Statoil ASA: Completion of share capital incre...,Reference is made to the previous announcement...,"[{'id': 1008, 'category_no': 'KAPITAL- OG STEM...",['XOSL'],1309,0,0,...,802249.9011112965,906536.1954811086,951094.1565136382,0.5927682276229973,0.473653049141487,2.044497895369801,2.5377643504531644,,NEGATIVE,NEGATIVE
31,440720,440720,391549,Statoil ASA: Allocation of Dividend Shares to ...,Reference is made to the announcement by Stato...,"[{'id': 1008, 'category_no': 'KAPITAL- OG STEM...",['XOSL'],1309,0,0,...,802249.9011112965,906536.1954811086,951094.1565136382,0.5927682276229973,0.473653049141487,2.044497895369801,2.5377643504531644,,NEGATIVE,NEGATIVE
32,440749,440749,391578,Statoil ASA: Gjennomføring av kapitalforhøyels...,Det vises til tidligere meldinger fra Statoil ...,"[{'id': 1008, 'category_no': 'KAPITAL- OG STEM...",['XOSL'],1309,0,0,...,802249.9011112965,906536.1954811086,951094.1565136382,0.5927682276229973,0.473653049141487,2.044497895369801,2.5377643504531644,,NEGATIVE,NEGATIVE
33,440719,440719,391548,Statoil ASA: Tildeling av Utbytteaksjer til p...,"Det vises til meldingen fra Statoil ASA (""Sels...","[{'id': 1008, 'category_no': 'KAPITAL- OG STEM...",['XOSL'],1309,0,0,...,802249.9011112965,906536.1954811086,951094.1565136382,0.5927682276229973,0.473653049141487,2.044497895369801,2.5377643504531644,,NEGATIVE,NEGATIVE
34,440717,440717,391546,Statoil ASA: Result of the Dividend Issue for ...,Reference is made to the previous announcement...,"[{'id': 1008, 'category_no': 'KAPITAL- OG STEM...",['XOSL'],1309,0,0,...,802249.9011112965,906536.1954811086,951094.1565136382,0.5927682276229973,0.473653049141487,2.044497895369801,2.5377643504531644,,NEGATIVE,NEGATIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459989,514900,514900,466097,Prospekt / opptaksdokument,Admission to trading of shares on Merkur Market,"[{'id': 1103, 'category_no': 'PROSPEKT / OPPTA...",['MERK'],12803,0,0,...,,,,,,,,,,NEUTRAL
459976,519774,519774,471029,Endret salgsprognose 2020,"Det henvises til kvartalsrapport for Q3 2020, ...","[{'id': 1005, 'category_no': 'INNSIDEINFORMASJ...",['MERK'],12803,0,0,...,449083.09760016395,570728.8318264598,611956.1525625557,14.759999999999994,0.6666666666666821,48.652849740932645,112.5185185185185,,NEGATIVE,NEGATIVE
459985,519775,519775,471030,Changed sales forecast 2020,Reference is made to the quarterly report for ...,"[{'id': 1005, 'category_no': 'INNSIDEINFORMASJ...",['MERK'],12803,0,0,...,449083.09760016395,570728.8318264598,611956.1525625557,14.759999999999994,0.6666666666666821,48.652849740932645,112.5185185185185,,NEGATIVE,NEGATIVE
459981,517567,517567,468798,Innkalling til ekstraordinær generalforsamling...,Se vedlagt innkalling.,"[{'id': 1010, 'category_no': 'ANNEN INFORMASJO...",['MERK'],12803,0,0,...,231094.6378370625,388477.8734440484,585458.5438080725,12.857142857142856,18.796992481203013,5.898123324396787,,,NEGATIVE,NEGATIVE


In [232]:
(abs(subset["intra_day_open_close_pct"]) > subset["ewm_std_3"].div(subset["ewm_3"])*100).sum()

73093

In [13]:
msg_labeled.to_parquet(data_path / "stock_msg_labeled_all.parquet")

In [14]:
msg_labeled.columns

Index(['id', 'messageId', 'newsId', 'title', 'body', 'category', 'markets',
       'issuerId', 'correctionForMessageId', 'correctedByMessageId',
       'issuerSign_x', 'issuerName', 'instrId', 'instrumentName',
       'instrumentFullName', 'publishedTime', 'test', 'numbAttachments',
       'attachments', 'clientAnnouncementId', 'infoRequired', 'oamMandatory',
       'title_clean', 'title_lang', 'title_lang_score', 'paragraphs',
       'par_len', 'par_label', 'par_label_score', 'weighted_scores',
       'weighted_label', 'weighted_label_conf', 'n_paragraphs', 'osl_id',
       'symbol', 'issuerSign_y', 'name', 'isActive', 'publishedTime_dt',
       'is_next_day', 'next_day', 'close', 'open', 'high', 'low', 'volume',
       'dol_volume', 'intra_day_high_low_pct', 'intra_day_open_close_pct',
       'gap_pct', 'ewm_3', 'ewm_7', 'ewm_15', 'ewm_30', 'ewm_std_3',
       'ewm_std_7', 'ewm_std_15', 'ewm_std_30', 'dol_vol_ewm_3',
       'dol_vol_ewm_7', 'dol_vol_ewm_15', 'dol_vol_ewm_30',
       

In [15]:
msg_labeled[["id", "body", "label", "title", "symbol", "category", "weighted_label", "weighted_label_conf"]].to_parquet(data_path / "stock_msg_labelled_subset.parquet")