In [5]:
import sys
# Add the parent directory to the module search path; the `src.*` imports
# used later in this notebook presumably rely on it (TODO confirm layout).
sys.path.append('../')

In [None]:
import ast
import os

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from src.embeddings import OpenaiAdaEmbedding, BertEmbedding, AngleEmbedding
from src.exploratory_data_analysis import gather_data

# Embedding generation

First we instantiate classes responsible for gathering:
1. Ada Embeddings from OpenAI API
2. Embeddings from `bert-base-uncased` BERT architecture 
3. Embeddings from the open-source, state-of-the-art UAE-Large-V1 model (https://huggingface.co/WhereIsAI/UAE-Large-V1)

To generate Ada Embeddings, an OpenAI API key is needed; the notebook reads it from the `OPENAI_API_KEY` environment variable. One can create such a key at https://platform.openai.com/api-keys

In [7]:
# Read the OpenAI key from the environment; raises KeyError if
# OPENAI_API_KEY is not set.
api_key = os.environ['OPENAI_API_KEY']
# One embedder per model compared in this notebook.
openai_embedding = OpenaiAdaEmbedding(api_key)
bert_embedding = BertEmbedding(model_name='bert-base-uncased')
angle_embedding = AngleEmbedding()

The first step of classification involves creating embeddings of the English articles. We gather them from the `data/2023_articles_en` folder, where articles are aggregated based on the month and day of their publication. Then we create embeddings for the `text` column, which contains the raw article texts.

In [None]:
path = '../data/2023_articles_en'
# Files written by this cell end with this suffix; they are skipped as inputs
# so that re-running the cell does not embed its own outputs again.
embedded_suffix = '_embedded.csv'


def _embed_text(model, text):
    """Return `model.get_embedding(text)` for non-empty strings, otherwise None."""
    # pandas reads empty CSV cells as float NaN, which is truthy, so a plain
    # `if text` check would pass NaN on to the embedding model.
    if isinstance(text, str) and text.strip():
        return model.get_embedding(text)
    return None


for folder in tqdm(sorted(os.listdir(path))):
    folder_path = f'{path}/{folder}'
    # Ignore stray files next to the per-day folders.
    if not os.path.isdir(folder_path):
        continue
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv') or filename.endswith(embedded_suffix):
            continue
        df = pd.read_csv(f'{folder_path}/{filename}')
        df['ada_embedding'] = df['text'].apply(lambda x: _embed_text(openai_embedding, x))
        df['bert_embedding'] = df['text'].apply(lambda x: _embed_text(bert_embedding, x))
        df['angle_embedding'] = df['text'].apply(lambda x: _embed_text(angle_embedding, x))
        # Output name is kept as "<input>.csv_embedded.csv", as before.
        df.to_csv(f'{folder_path}/{filename}_embedded.csv', index=False)

# Classification

We load:
1. IPTC taxonomy, with computed embeddings for names and descriptions
2. csv's containing news articles into one dataframe

In [24]:
path = '../data/2023_articles_en'
# gather_data is a project helper; per the notebook text it collects the
# article CSVs under `path` into one dataframe (implementation not shown here).
articles = gather_data(path)
# IPTC taxonomy table; the classification function below reads its `name`,
# `hierarchy` and `<embedding>_<type>` columns.
taxonomy = pd.read_csv('../data/taxonomy/taxonomy.csv')

Now we perform classification based on cosine similarity. More explicitly, for each news article the IPTC category with the highest cosine similarity is chosen as its label.

We enclose the classification method inside a function, which:
1. Reads the article and taxonomy embeddings, extracts the valid ones and stores them in numpy arrays.
2. Builds the cosine similarity matrix between the article embeddings and the category name embeddings (or the description embeddings, depending on `type`), and returns the most similar category for each article.

In [8]:
def iptc_categorisation(article, taxonomy, hierarchy=None, embedding='ada_embedding', type='name', with_eval=True):
    """
    Label every article with the IPTC category whose embedding is most similar to it.

    Similarity is the cosine similarity between the article embedding and the
    category embedding stored in the taxonomy column ``f'{embedding}_{type}'``.
    Neither input dataframe is modified, so the same ``taxonomy`` frame can be
    passed to several consecutive calls.

    Args:
        article (pd.DataFrame): Dataframe containing the articles; must have the ``embedding`` column.
        taxonomy (pd.DataFrame): Dataframe containing the taxonomy; must have the ``name`` and
            ``f'{embedding}_{type}'`` columns, plus ``hierarchy`` when ``hierarchy`` is given.
        hierarchy (int): Hierarchy of the taxonomy. Possible values: 1, 2, 3.
            A falsy value (the default ``None``) uses all taxonomy rows.
        embedding (str): Embedding to use.
        type (str): Type of embedding to use. Possible values: 'name', 'description'.
        with_eval (bool): If True, embeddings are string literals (e.g. as read back
            from CSV) and are parsed into Python lists first. If False, they are
            used as they are.
    Returns:
        pd.Series: Name of the best-matching category for each article that has an
            embedding, indexed by the article's index label. Articles with a
            missing embedding are left out (they become NaN when the result is
            assigned as a column of ``article``).
    Raises:
        ValueError: If no taxonomy row with an embedding is left to compare against.
    """
    category_column = f'{embedding}_{type}'

    # `series != None` is evaluated element-wise by pandas and is True for every
    # row, so it filters nothing; notna() actually drops missing embeddings.
    article_vectors = article.loc[article[embedding].notna(), embedding]

    if hierarchy:
        category_df = taxonomy[taxonomy['hierarchy'] == hierarchy]
    else:
        category_df = taxonomy
    category_df = category_df[category_df[category_column].notna()]
    if category_df.empty:
        raise ValueError(f'No taxonomy rows with a {category_column!r} embedding for hierarchy={hierarchy!r}')

    if article_vectors.empty:
        # Nothing to classify; keep the return type stable.
        return pd.Series(index=article_vectors.index, dtype=object)

    category_vectors = category_df[category_column]
    if with_eval:
        # Parse into new Series instead of writing back into the frames, so the
        # caller's data stays untouched. literal_eval only accepts literals and
        # cannot execute arbitrary code coming from the CSV files.
        article_vectors = article_vectors.apply(ast.literal_eval)
        category_vectors = category_vectors.apply(ast.literal_eval)

    article_embeddings = np.array(article_vectors.tolist())
    category_embeddings = np.array(category_vectors.tolist())

    # Rows are articles, columns are categories.
    similarity = cosine_similarity(article_embeddings, category_embeddings)
    similarity_df = pd.DataFrame(similarity, columns=category_df['name'], index=article_vectors.index)

    # Column label (category name) of the highest similarity in each row.
    return similarity_df.idxmax(axis=1)

For now we experiment only with Ada Embeddings and, at first, consider only the top-level hierarchy of the IPTC categories. This is for a number of reasons:
1. It is much easier to create a test set for a classification problem with 18 categories (the number of top-level categories) than for one with all of the over 900 categories.
2. We want to verify the general soundness of the cosine-similarity method and, if the results are promising, generalize it to more IPTC categories.

In [13]:
# Top-level (hierarchy 1) IPTC label per article from the Ada name embeddings.
# The former `output_column` keyword is not a parameter of iptc_categorisation
# and raised a TypeError; the target column is chosen by the assignment itself.
articles['high_label'] = iptc_categorisation(articles, taxonomy, hierarchy=1, embedding='ada_embedding', type='name')

We also experiment with the lower-level hierarchy, and finally with all hierarchies combined. An important note is that, for now, we consider only the embeddings of the names of the IPTC categories. A similar experiment could be conducted for the descriptions.

In [27]:
# Second-level labels first, then labels drawn from the whole taxonomy
# (hierarchy=None), in that order.
for label_column, level in (('mid_label', 2), ('label', None)):
    articles[label_column] = iptc_categorisation(
        articles, taxonomy, hierarchy=level, embedding='ada_embedding', type='name'
    )

Finally, we filter out data points which are schedules of the day's events, as they can't be reliably classified into any of the IPTC categories (they usually match multiple categories).

In [28]:
# Flag the daily "Schedule of events" items. `na=False` treats a missing
# headline as "not a schedule", so such rows are kept; comparing the result
# with `== False` dropped them silently. `regex=False` matches the text literally.
is_schedule = articles['headline'].str.contains('Schedule of events', regex=False, na=False)
article_df = articles[~is_schedule]
article_df.to_csv('../data/articles_2023_en.csv', index=False)

We examine the results by briefly looking at the assigned categories

In [29]:
# Compact view: headline, lede and the three predicted label levels.
article_view = article_df.loc[:, ['headline', 'lede', 'high_label', 'mid_label', 'label']]
article_view.head(10)

Unnamed: 0,headline,lede,high_label,mid_label,label
0,News from Slovenia,The Slovenian Press Agency news report is prot...,health,people,poll
2,Weather: Cloudy with rain,"After a rainy night, most of the country will ...",weather,weather science,weather
3,Koper port welcoming first cruise ship this se...,Viking Sky with the capacity of 900 passengers...,"arts, culture and entertainment",sailing,sailing
4,Ski jumper Anže Lanišek takes second place in ...,,health,people,poll
5,Delo says US has become revolutionary state,"Stalin must be laughing in his grave, as he wa...","unrest, conflicts and war",civil unrest,revolutions
6,Temp hiring increasingly popular despite conce...,Hiring temporary workers has become increasing...,labour,labour legislation,public employees
7,Ski jumper Anže Lanišek takes second place in ...,Anže Lanišek finished second at a ski jumping ...,sport,ski jumping,ski jumping
8,Dnevnik says Golob's visit to Ukraine brought ...,Commenting on Friday's visit by PM Robert Golo...,"unrest, conflicts and war",diplomacy,diplomacy
9,Golob: Peace initiatives for Ukraine currently...,The international community's peace initiative...,"unrest, conflicts and war",armed conflict,international military intervention
10,Slovenian ski jumping team finishes second in ...,,health,people,poll


As the `data/articles_2023_en.csv` dataset is too large to store in a GitHub repository, we also assign labels to each separate dataframe from the `data/2023_articles_en` directory.

In [35]:
path = '../data/2023_articles_en'

# The taxonomy is identical for every file, so it is read once here instead of
# once per CSV inside the loop.
taxonomy = pd.read_csv('../data/taxonomy/taxonomy.csv')
# (output column, taxonomy hierarchy level); None means the whole taxonomy.
label_levels = (('high_label', 1), ('mid_label', 2), ('label', None))

for folder in tqdm(os.listdir(path)):
    folder_path = f'{path}/{folder}'
    # Ignore stray files next to the per-day folders.
    if not os.path.isdir(folder_path):
        continue
    for filename in os.listdir(folder_path):
        if not filename.endswith(".csv"):
            continue
        file_path = f'{folder_path}/{filename}'
        df = pd.read_csv(file_path)
        for label_column, level in label_levels:
            # A copy is handed over so the shared taxonomy frame keeps its
            # as-read (string) embeddings for the next call and the next file.
            df[label_column] = iptc_categorisation(
                df, taxonomy.copy(), hierarchy=level, embedding='ada_embedding', type='name'
            )
        # The labelled frame replaces the file it was read from.
        df.to_csv(file_path, index=False)

 70%|███████   | 7/10 [48:28<19:43, 394.58s/it]

### Label test sets with Angle

In [21]:
# Taxonomy table with embeddings; the file name suggests it holds the
# category-name embeddings — TODO confirm its columns.
taxonomy = pd.read_csv('../data/taxonomy/taxonomy_names_embed.csv')
# Full taxonomy; the next cell takes the `hierarchy` and `name` columns from it.
taxonomy2 = pd.read_csv('../data/taxonomy/taxonomy.csv')
# Test set that is labelled further below.
test_set = pd.read_csv('../data/test_sets/test_set_random.csv')
                       

In [26]:
# Copy the hierarchy level and category name over from the full taxonomy.
# The assignment aligns on the row index — assumes both CSVs list the same
# categories in the same order; TODO confirm.
taxonomy['hierarchy'] = taxonomy2['hierarchy']
taxonomy['name'] = taxonomy2['name']

In [29]:
# Predict the three label levels from the Angle embeddings and store them in
# the `*2` columns of the test set.
for label_column, level in (('high_label2', 1), ('mid_label2', 2), ('label2', None)):
    test_set[label_column] = iptc_categorisation(
        test_set, taxonomy, hierarchy=level, embedding='angle_embedding', type='name'
    )

In [30]:
# Write the labelled test set back to the file it was loaded from
# (overwrites it in place).
test_set.to_csv('../data/test_sets/test_set_random.csv', index=False)