In [1]:
import sys
sys.path.append('../')

In [None]:
import ast
import os

import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from src.embeddings import OpenaiAdaEmbedding, BertEmbedding, AngleEmbedding, MiniLMEmbedding
from src.exploratory_data_analysis import gather_data

# Embedding generation

First we instantiate the classes responsible for generating:
1. Ada embeddings from the OpenAI API
2. Embeddings from the `bert-base-uncased` BERT architecture
3. Embeddings from the open-source, state-of-the-art UAE-Large-V1 model (https://huggingface.co/WhereIsAI/UAE-Large-V1)
4. Embeddings from a MiniLM model (via the `MiniLMEmbedding` class)

To generate Ada embeddings, an OpenAI API key is needed; the next cell reads it from the `OPENAI_API_KEY` environment variable. You can create a key at https://platform.openai.com/api-keys

In [None]:
# Read the OpenAI key from the environment; raises KeyError if OPENAI_API_KEY is unset.
api_key = os.environ['OPENAI_API_KEY']
# One embedding provider per model under comparison.
openai_embedding = OpenaiAdaEmbedding(api_key)
# NOTE(review): bert_embedding is not used by any cell visible in this notebook.
bert_embedding = BertEmbedding(model_name='bert-base-uncased')
angle_embedding = AngleEmbedding()
# NOTE(review): 'embeddig' is a typo for 'embedding'. The embedding-generation cell
# refers to this misspelled name, so both places must be renamed together.
mini_lm_embeddig = MiniLMEmbedding()

We create embeddings for the articles in the test set.

In [12]:
# Articles to classify; the embedding cell below reads their 'text' column.
test_set = pd.read_csv('../data/test_sets/test_set_random.csv')

The next cell takes around 10 minutes to run.

In [None]:
# Add one embedding column per model, each computed from the article 'text' column.
test_set['ada_embedding'] = test_set['text'].apply(openai_embedding.get_embedding)

# The AnglE result is converted with .tolist() so the column stores plain lists.
test_set['angle_embedding'] = test_set['text'].apply(
    lambda text: angle_embedding.get_embedding(text).tolist()
)

test_set['mini_lm_embedding'] = test_set['text'].apply(mini_lm_embeddig.get_embedding)

In [25]:
# Persist the test set together with its embedding columns (row index omitted).
test_set.to_csv('../data/test_sets/test_set_random_embeddings.csv', index=False)

# Classification

First, we load the CSV files containing the taxonomies and embeddings of the IPTC categories.

In [36]:
# One taxonomy table per embedding model. The cells below read the 'name', 'hierarchy',
# 'embedding_name' and 'embedding_description' columns from these tables.
taxonomy_ada = pd.read_csv('../data/taxonomy/embeddings/taxonomy_openai_ada.csv')
taxonomy_angle = pd.read_csv('../data/taxonomy/embeddings/taxonomy_angle.csv')
taxonomy_mini_lm = pd.read_csv('../data/taxonomy/embeddings/taxonomy_mini_lm.csv')

Now we perform classification based on cosine similarity. More explicitly, for each news article, the IPTC category with the highest cosine similarity is chosen as its label.

We wrap the classification method in a function that:
1. Reads the article and taxonomy embeddings and converts them to numpy arrays.
2. Computes the cosine similarity matrix between the article embeddings and the category name (or description) embeddings.

In [26]:
def generate_predictions(df_articles, df_taxonomy, column_name_article_embedding, column_name_category_embedding):
    """Classifies articles by assigning each one its most similar taxonomy category.

    Every article embedding is compared with every category embedding using
    cosine similarity; the category with the highest similarity becomes the label.

    Args:
        df_articles (pd.DataFrame): Dataframe containing the articles.
        df_taxonomy (pd.DataFrame): Dataframe containing the taxonomy. Must have a
            'name' column.
        column_name_article_embedding (str): Column of ``df_articles`` holding the
            article embeddings, either as sequences of floats or as their string
            representation (e.g. after a CSV round trip).
        column_name_category_embedding (str): Column of ``df_taxonomy`` holding the
            category embeddings, in the same two accepted formats.

    Returns:
        pd.DataFrame: Copy of ``df_articles`` with four added columns:
            - 'predicted_category_name': 'name' of the most similar category.
            - 'predicted_category_number': positional index of that category
              within ``df_taxonomy``.
            - 'predicted_category_probability': similarity of the chosen category
              divided by the sum of the article's similarities to all categories.
            - 'predicted_categories_all_probabilities': string form of the list of
              all normalised similarities of the article.
    """

    article_embeddings = df_articles[column_name_article_embedding]
    category_embeddings = df_taxonomy[column_name_category_embedding]

    # Embeddings read back from CSV are strings. ast.literal_eval parses list
    # literals without executing arbitrary code, unlike eval.
    if isinstance(article_embeddings.iloc[0], str):
        article_embeddings = article_embeddings.apply(ast.literal_eval)

    if isinstance(category_embeddings.iloc[0], str):
        category_embeddings = category_embeddings.apply(ast.literal_eval)

    article_embeddings = np.array(article_embeddings.tolist())
    category_embeddings = np.array(category_embeddings.tolist())

    # compute similarity matrix of each article to each category
    similarity_scores = cosine_similarity(article_embeddings, category_embeddings)

    # Pick the label from the raw similarities. Cosine similarity can be negative,
    # so a row sum may be negative (which would reverse the ordering of the
    # normalised values) or zero (which would yield NaN/inf).
    preds = np.argmax(similarity_scores, axis=1)
    preds_names = df_taxonomy['name'].iloc[preds].values

    # Row-normalised similarities. They only behave like probabilities when all
    # similarities in the row are non-negative and their sum is positive.
    probabilities = similarity_scores / similarity_scores.sum(axis=1, keepdims=True)

    # create output dataframe
    df_output = df_articles.copy()
    df_output['predicted_category_name'] = preds_names
    df_output['predicted_category_number'] = preds
    # Report the normalised score of the category that was actually chosen.
    df_output['predicted_category_probability'] = probabilities[np.arange(len(preds)), preds]
    # .tolist() yields plain Python floats, so the stored text is a parseable list
    # literal regardless of how the installed NumPy prints its scalar types.
    df_output['predicted_categories_all_probabilities'] = [str(probs.tolist()) for probs in probabilities]

    return df_output

## Predictions for 1st hierarchy

We first consider only the top-level hierarchy of the IPTC categories, comparing the Ada, AnglE (UAE-Large-V1) and MiniLM embeddings. This is for a number of reasons:
1. It is much easier to create a test set for a classification problem with 18 categories (the number of top-level categories) than for one with over 900 categories.
2. We want to verify that the cosine-similarity method works in general and, if the results are promising, generalize it to more IPTC categories.

In [43]:
# For every embedding model, classify the test set against the top-level
# (hierarchy == 1) categories twice: once using the category-name embeddings
# and once using the category-description embeddings.
ada_name_hierarchy_1, ada_description_hierarchy_1 = (
    generate_predictions(test_set, taxonomy_ada.loc[taxonomy_ada['hierarchy'] == 1], 'ada_embedding', category_column)
    for category_column in ('embedding_name', 'embedding_description')
)

angle_name_hierarchy_1, angle_description_hierarchy_1 = (
    generate_predictions(test_set, taxonomy_angle.loc[taxonomy_angle['hierarchy'] == 1], 'angle_embedding', category_column)
    for category_column in ('embedding_name', 'embedding_description')
)

mini_lm_name_hierarchy_1, mini_lm_description_hierarchy_1 = (
    generate_predictions(test_set, taxonomy_mini_lm.loc[taxonomy_mini_lm['hierarchy'] == 1], 'mini_lm_embedding', category_column)
    for category_column in ('embedding_name', 'embedding_description')
)

In [44]:
# Write each top-level prediction table to its own CSV file (row index omitted).
top_hierarchy_outputs = [
    (ada_name_hierarchy_1, '../results/top_hierarchy/ada_name.csv'),
    (ada_description_hierarchy_1, '../results/top_hierarchy/ada_description.csv'),
    (angle_name_hierarchy_1, '../results/top_hierarchy/angle_name.csv'),
    (angle_description_hierarchy_1, '../results/top_hierarchy/angle_description.csv'),
    (mini_lm_name_hierarchy_1, '../results/top_hierarchy/mini_lm_name.csv'),
    (mini_lm_description_hierarchy_1, '../results/top_hierarchy/mini_lm_description.csv'),
]

for predictions, destination in top_hierarchy_outputs:
    predictions.to_csv(destination, index=False)

## Predictions for all hierarchies

We also experiment with all hierarchy levels combined.

In [45]:
# Same comparison as for the top level, but against the complete taxonomy:
# each model is evaluated with category-name and category-description embeddings.
ada_name, ada_description = (
    generate_predictions(test_set, taxonomy_ada, 'ada_embedding', category_column)
    for category_column in ('embedding_name', 'embedding_description')
)

angle_name, angle_description = (
    generate_predictions(test_set, taxonomy_angle, 'angle_embedding', category_column)
    for category_column in ('embedding_name', 'embedding_description')
)

mini_lm_name, mini_lm_description = (
    generate_predictions(test_set, taxonomy_mini_lm, 'mini_lm_embedding', category_column)
    for category_column in ('embedding_name', 'embedding_description')
)

In [46]:
# Write each all-hierarchies prediction table to its own CSV file (row index omitted).
all_hierarchies_outputs = [
    (ada_name, '../results/all_hierarchies/ada_name.csv'),
    (ada_description, '../results/all_hierarchies/ada_description.csv'),
    (angle_name, '../results/all_hierarchies/angle_name.csv'),
    (angle_description, '../results/all_hierarchies/angle_description.csv'),
    (mini_lm_name, '../results/all_hierarchies/mini_lm_name.csv'),
    (mini_lm_description, '../results/all_hierarchies/mini_lm_description.csv'),
]

for predictions, destination in all_hierarchies_outputs:
    predictions.to_csv(destination, index=False)

We examine the results by briefly looking at the assigned categories

In [50]:
# Preview the first 10 articles next to the category predicted from the Ada
# category-name embeddings over the full taxonomy.
ada_name[['headline', 'lede', 'text', 'predicted_category_name']].head(10)

Unnamed: 0,headline,lede,text,predicted_category_name
0,ZPIZ proposes 3.5% extraordinary pension increase,The council of the public pension fund ZPIZ ha...,The extraordinary adjustment will apply from 1...,pension and welfare
1,Two more convicted in Celje court leak case,Two more persons have been found guilty in a c...,Gregor Tanšek was found guilty of sharing clas...,prisoners and detainees
2,Municipalities report EUR 2.7 billion in flood...,Municipalities have reported over EUR 2.7 bill...,The forms sent through the Ajda web applicatio...,government aid
3,Večer says employer reps in the wrong about la...,Reflecting on the indignation of employer repr...,"""We've been hearing incessant reports about ho...",public employees
4,Ljubljana-based IRCAI partners with Amazon to ...,The Slovenia-based International Research Cent...,IRCAI is the only centre under the auspices of...,computing and information technology
5,Pride Parade organisers critical of police han...,The Pride Parade association was critical on W...,"Muršec said that 3,500 people had taken part i...",gays and lesbians
6,Festival highlights city theatres of former Yu...,The Ruta Grupa Triglav travelling theatre fest...,The programme will open with the drama The Cel...,film festival
7,"Suicide on slight downward trend, but preventi...",A total of 402 people in Slovenia committed su...,"During the nine-year period, the suicide quoti...",suicide
8,Slovenia commemorates WWII resistance movement,Slovenia marks Day of Uprising Against Occupat...,This year's main ceremony was held on the eve ...,rebellions
9,"Leak source at Krško N-plant identified, issue...",The Krško nuclear power plant (NEK) said it ha...,As representatives of the Nuclear Safety Admin...,nuclear accident


As the `data/articles_2023_en` dataset is too large to store in the GitHub repository, we also assign labels to each separate dataframe in the `data/2023_articles_en` directory. Each CSV file there is overwritten in place with the added label columns.

In [57]:
def generate_predictions2(df_articles, df_taxonomy, column_name_article_embedding, column_name_category_embedding):
    """Returns, for each article, the most similar taxonomy category and its score.

    Every article embedding is compared with every category embedding using
    cosine similarity; the category with the highest similarity becomes the label.

    Args:
        df_articles (pd.DataFrame): Dataframe containing the articles.
        df_taxonomy (pd.DataFrame): Dataframe containing the taxonomy. Must have a
            'name' column.
        column_name_article_embedding (str): Column of ``df_articles`` holding the
            article embeddings, either as sequences of floats or as their string
            representation (e.g. after a CSV round trip).
        column_name_category_embedding (str): Column of ``df_taxonomy`` holding the
            category embeddings, in the same two accepted formats.

    Returns:
        tuple[np.ndarray, np.ndarray]: The 'name' of the most similar category for
            each article, and the cosine similarity between each article and that
            category. Both arrays follow the row order of ``df_articles``.
    """

    article_embeddings = df_articles[column_name_article_embedding]
    category_embeddings = df_taxonomy[column_name_category_embedding]

    # Embeddings read back from CSV are strings. ast.literal_eval parses list
    # literals without executing arbitrary code, unlike eval.
    if isinstance(article_embeddings.iloc[0], str):
        article_embeddings = article_embeddings.apply(ast.literal_eval)

    if isinstance(category_embeddings.iloc[0], str):
        category_embeddings = category_embeddings.apply(ast.literal_eval)

    article_embeddings = np.array(article_embeddings.tolist())
    category_embeddings = np.array(category_embeddings.tolist())

    # compute similarity matrix of each article to each category
    similarity_scores = cosine_similarity(article_embeddings, category_embeddings)
    highest_similarity_scores = np.max(similarity_scores, axis=1)

    # Take the argmax of the raw similarities so the returned name always belongs
    # to the returned highest score. Dividing by the row sum first would reverse
    # the ordering whenever that sum is negative and yield NaN when it is zero.
    preds = np.argmax(similarity_scores, axis=1)
    preds_names = df_taxonomy['name'].iloc[preds].values

    return preds_names, highest_similarity_scores

In [58]:
path = '../data/2023_articles_en'

# The taxonomy and its per-level subsets are the same for every article file,
# so they are loaded and filtered once instead of once per CSV.
taxonomy = pd.read_csv('../data/taxonomy/embeddings/taxonomy_openai_ada.csv')
taxonomy_high = taxonomy[taxonomy['hierarchy'] == 1]
taxonomy_mid = taxonomy[taxonomy['hierarchy'] == 2]

for folder in tqdm(os.listdir(path)):
    folder_path = os.path.join(path, folder)
    # Skip stray files sitting next to the data folders (e.g. .DS_Store);
    # calling os.listdir on them would raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        continue

    for filename in os.listdir(folder_path):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)

        # Label each article at three granularities: top level, second level,
        # and the best match over the whole taxonomy.
        df['high_label'], df['high_label_similarity'] = generate_predictions2(df, taxonomy_high, 'ada_embedding', 'embedding_name')
        df['mid_label'], df['mid_label_similarity'] = generate_predictions2(df, taxonomy_mid, 'ada_embedding', 'embedding_name')
        df['label'], df['label_similarity'] = generate_predictions2(df, taxonomy, 'ada_embedding', 'embedding_name')

        # NOTE: the source file is overwritten in place with the added label columns.
        df.to_csv(file_path, index=False)

100%|██████████| 10/10 [31:36<00:00, 189.65s/it]
