# Notebook to prepare data for the test set. Articles are sampled here and will be labelled later on.

Imports

In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, pipeline
from lib.sentiment_analysis_utils import combine_lede_and_text, remove_text_formatting, read_all_news_in_dir
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
os.getcwd()

Read data

In [None]:
# Location of the raw English news, resolved relative to the current working
# directory (the notebook must be started from a sibling of data_preparation/).
raw_en_dir = os.getcwd() + "/../data_preparation/raw_data/en/"
df_en_raw = read_all_news_in_dir(raw_en_dir)
df_en_raw

In [None]:
# Apply the two project preprocessing helpers in sequence: first combine the
# lede with the article text, then remove text formatting from the result.
df_en_raw = remove_text_formatting(combine_lede_and_text(df_en_raw))
df_en_raw

Perform preprocessing using functions from lib. The lede and the remaining text of each article are first combined; then the text formatting is removed.

In [None]:
# Keep only articles with non-empty text, then keep the first occurrence of
# each distinct text.
has_text = df_en_raw['whole_text'] != ''
df_en_raw = df_en_raw[has_text].drop_duplicates(subset=['whole_text'])

Drop duplicates and articles whose text is empty.

In [None]:
df_en_raw

In [None]:
# Turn the per-article `categories` collection into one 0/1 indicator column
# per category code, aligned on the original index, and append those columns.
mlb = MultiLabelBinarizer()
category_flags = pd.DataFrame(
    mlb.fit_transform(df_en_raw.categories),
    columns=mlb.classes_,
    index=df_en_raw.index,
)
df_en_raw = df_en_raw.join(category_flags)

We use MultiLabelBinarizer to extract categories into separate columns which will allow sampling.

For English, there are nine categories:
  - Advisory - AD
  - Arts and Culture - AC
  - Around Slovenia - AS
  - Business, finance and economy - BE
  - Health, environment, science - HE
  - Politics - PO
  - Roundup - RU
  - Schedule of Events - SE
  - Sports - ST

In [None]:
df_en_raw

In [None]:
# Collect the distinct author strings; a byline may list several authors
# separated by '/'.
authors = {
    author
    for byline in df_en_raw['byline']
    for author in byline.split('/')
}

print(authors)

Extracting the set of authors from the bylines (multiple authors in one byline are separated by '/').

In [None]:
# Category codes whose articles are sampled for labelling; each code is also
# the name of an indicator column in the binarized dataframe.
categories_to_label = ['AC','AS','BE','HE','PO','ST']

Of the 9 categories, 6 will be labelled. The others are categories such as the newspaper's daily digest. Classifying them would not make much sense and would just consume resources. This decision is justified by business expectations, which were confirmed by the press agency we were cooperating with.

In [None]:
# Number of articles to sample per category; evaluates to 13.
# NOTE(review): the 9+1+3 split presumably records successive extensions of
# the sample size — confirm.
NUM_OF_ARTICLES_TO_SAMPLE = 9+1+3

In [None]:
# Row labels that are skipped during sampling.
# NOTE(review): the reason these articles are excluded is not recorded in the
# notebook — confirm and document.
EXCLUDED_ARTICLE_INDICES = [45, 57, 29, 43, 173]

# An article is a sampling candidate when it carries exactly one of the
# categories to label and is not on the exclusion list.
is_candidate = (
    (df_en_raw[categories_to_label].sum(axis=1) == 1)
    & ~df_en_raw.index.isin(EXCLUDED_ARTICLE_INDICES)
)
df_candidates = df_en_raw[is_candidate]

# Take the first NUM_OF_ARTICLES_TO_SAMPLE candidates of every category, in
# dataframe order. Categories with fewer candidates than requested are kept
# (and reported) rather than silently dropped from the test set.
sampled_per_category = []
for category in categories_to_label:
    df_category = df_candidates[df_candidates[category] == 1].head(NUM_OF_ARTICLES_TO_SAMPLE)
    if len(df_category) < NUM_OF_ARTICLES_TO_SAMPLE:
        print(
            f"Category {category}: only {len(df_category)} of "
            f"{NUM_OF_ARTICLES_TO_SAMPLE} requested articles available"
        )
    sampled_per_category.append(df_category)

df_testset = pd.concat(sampled_per_category) if sampled_per_category else pd.DataFrame()

# Sanity check: True would mean the same article was sampled more than once.
duplicate_rows = df_testset.astype(str).duplicated().any()
print(duplicate_rows)

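Articles from different categories are sampled. From each category, NUM_OF_ARTICLES_TO_SAMPLE articles are selected (currently 9+1+3 = 13) - this can be modified by setting NUM_OF_ARTICLES_TO_SAMPLE to a different value. Only articles that belong to exactly one of the categories to label are considered.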

In [None]:
df_testset

In [None]:
# Hugging Face model id used for named-entity recognition.
NER_MODEL_NAME = "Babelscape/wikineural-multilingual-ner"

tokenizer_ner = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
model_ner = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)

# aggregation_strategy="simple" merges the tokens of a multi-word entity into
# one result; it is the supported replacement for the deprecated
# grouped_entities=True flag.
ner_pipeline = pipeline(
    "ner",
    model=model_ner,
    tokenizer=tokenizer_ner,
    aggregation_strategy="simple",
)

We decided to use the Babelscape/wikineural-multilingual-ner model because it gave the most promising preliminary results. Firstly, it allows for grouped entities (entities whose names are composed of multiple words). Secondly, it is multilingual and was trained on large corpora. Even though Slovenian was not among the languages it was trained on, being trained on languages other than English helped: although the articles are in English, Slovenian entities are naturally mentioned quite often. We believe that training on multiple languages allowed for a more accurate assessment when a non-English entity was mentioned. (This model was the most sensitive, returning on average the most entities, whereas the others had a tendency to omit some of the entities; in addition, the names it returned were actually entities.)

In [None]:
def _unique_entity_names(text):
    """Return the distinct entity strings the NER pipeline finds in *text*.

    The names are sorted so that the exported lists are identical between
    runs (iteration order of a set of strings is not stable across sessions).
    """
    return sorted({entity['word'] for entity in ner_pipeline(text)})


# One list of distinct entity names per article.
df_testset['ner_list'] = df_testset['whole_text'].apply(_unique_entity_names)

# Lower-cased copy of each article's keywords.
df_testset['keywords_lower'] = df_testset['keywords'].apply(
    lambda keywords: [keyword.lower() for keyword in keywords]
)

Producing the lists of named entities (NER) and storing them in the dataframe. Keywords are converted to lowercase.

In [None]:
df_testset

In [None]:
# Directory that receives one Excel workbook per category.
TESTSET_OUTPUT_DIR = "./data_sampled_for_testset_extended"
# to_excel does not create missing directories, so make sure it exists.
os.makedirs(TESTSET_OUTPUT_DIR, exist_ok=True)

# Only the columns needed for labelling are exported.
EXPORT_COLUMNS = ['whole_text', 'ner_list', 'keywords_lower']
for category in categories_to_label:
    df_category = df_testset.loc[df_testset[category]==1, EXPORT_COLUMNS]
    df_category.to_excel(f"{TESTSET_OUTPUT_DIR}/test_set_{category}_extended.xlsx", sheet_name=category)


Saving articles of each category to separate files.

In [None]:
df_testset

In [None]:
# Write the complete sampled test set (all columns) to one CSV file in the
# current working directory.
# NOTE(review): list-valued columns such as ner_list / keywords_lower are
# written as their string representation — confirm that downstream readers
# parse them back into lists.
df_testset.to_csv('testset_extended.csv')

The test set is also saved to a single CSV file, as this is more convenient to use for predictions and model evaluation.