In [None]:
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the `classes` package be imported from this notebook — TODO confirm.
sys.path.append('../')

In [1]:
import pandas as pd
import os
from classes.embeddings import OpenaiAdaEmbedding, BertEmbedding

# Save the taxonomy in a clean format

In [5]:
# Read columns A:E of the 'SubjectCodes' sheet, skipping the first two rows
# and keeping at most 938 data rows.
df = pd.read_excel(
    '../data/taxonomy/TAKSONOMIJA.xlsx',
    sheet_name='SubjectCodes',
    skiprows=2,
    nrows=938,
    usecols='A:E',
)

In [7]:
def get_hierarchy(x):
    """Return the taxonomy depth (1, 2 or 3) encoded in a code string.

    The second ':'-separated field of ``x`` is inspected:

    * everything after its first two characters is zeros (or empty) -> 1
    * everything after its first five characters is zeros (or empty) -> 2
    * anything else -> 3

    Raises ``IndexError`` when ``x`` contains no ':'.
    """
    code = x.split(':')[1]
    # Try the shallowest level first; the first all-zero tail decides the level.
    for level, offset in ((1, 2), (2, 5)):
        tail = code[offset:]
        if not tail.strip('0'):
            return level
    return 3

In [8]:
# Derive each row's taxonomy depth from its Qcode.
df['hierarchy'] = df['Qcode'].apply(get_hierarchy)

In [13]:
# Keep only the three columns of interest and give the IPTC ones short names.
df = df[['hierarchy', 'IPTC NAME', 'IPTC DESCRIPTION']].rename(
    columns={'IPTC NAME': 'name', 'IPTC DESCRIPTION': 'description'}
)

In [None]:
# Persist the cleaned taxonomy without the index column.
df.to_csv(
    path_or_buf='../data/taxonomy/taxonomy.csv',
    index=False,
)

In [None]:
# Embed every taxonomy name with both models and persist the result.
# The key is read from the environment; a missing variable raises KeyError.
api_key = os.environ['OPENAI_API_KEY']
openai_ada = OpenaiAdaEmbedding(api_key = api_key)
bert = BertEmbedding(model_name = 'bert-base-uncased')

# One get_embedding call per taxonomy name, for each model.
df['ada_embedding'] = df['name'].apply(openai_ada.get_embedding)
df['bert_embedding'] = df['name'].apply(bert.get_embedding)

# Re-save the taxonomy now that the embedding columns exist: the similarity
# section reads `ada_embedding` back from this CSV, so a file written before
# this cell would be missing that column.
df.to_csv('../data/taxonomy/taxonomy.csv', index=False)

# Find the best-matching IPTC category for each article by cosine similarity (for now, only the first level of the taxonomy hierarchy)

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import os

In [4]:
# Load the articles and turn the stringified embeddings into a 2-D array.
article_df = pd.read_csv('../data/articles_2023_en.csv')

# `series != None` is evaluated element-wise and is True for every row
# (missing values included), so it filters nothing. notna() actually drops the
# rows without an embedding, which would otherwise make eval() fail below.
# .copy() detaches the filtered frame so the column assignment that follows
# does not raise SettingWithCopyWarning.
article_df = article_df[article_df['ada_embedding'].notna()].copy()

# NOTE(review): eval() executes whatever text is stored in the CSV. Acceptable
# only for trusted files; ast.literal_eval would be the safer parser.
article_df['ada_embedding'] = article_df['ada_embedding'].apply(eval)
article_embeddings = np.array(article_df['ada_embedding'].tolist())

In [5]:
# Load the taxonomy and keep only the top-level (hierarchy == 1) categories.
category_df = pd.read_csv('../data/taxonomy/taxonomy.csv')

# .copy() detaches the filtered frame so the column assignment below works on
# an independent frame instead of raising SettingWithCopyWarning.
category_df = category_df[category_df['hierarchy'] == 1].copy()

# The CSV must already contain an `ada_embedding` column (KeyError otherwise).
# NOTE(review): eval() executes whatever text is stored in the CSV. Acceptable
# only for trusted files; ast.literal_eval would be the safer parser.
category_df['ada_embedding'] = category_df['ada_embedding'].apply(eval)
category_embeddings = np.array(category_df['ada_embedding'].tolist())

In [10]:
# Pairwise cosine similarity: one row per article, one column per category.
cosine_similarity_matrix = cosine_similarity(article_embeddings, category_embeddings)
cosine_similarity_df = pd.DataFrame(
    data=cosine_similarity_matrix,
    index=article_df.index,
    columns=category_df['name'],
)
# idxmax runs while the frame still holds only scores, so the label is the
# name of the highest-scoring category column for each row.
cosine_similarity_df['high_label'] = cosine_similarity_df.idxmax(axis=1)


In [None]:
# Best similarity score per article. cosine_similarity_df also carries the
# string `high_label` column by this point, and a row-wise max over mixed
# str/float values fails, so the reduction is restricted to numeric columns.
article_df['high_label_similarity'] = cosine_similarity_df.max(axis=1, numeric_only=True)

In [20]:
# Write article_df back to the CSV it was loaded from, without the index.
article_df.to_csv(
    path_or_buf='../data/articles_2023_en.csv',
    index=False,
)

In [11]:
# Store the full similarity table; the index is written too (to_csv default).
cosine_similarity_df.to_csv(path_or_buf='../data/cosine_similarity.csv')

In [89]:
# Label every article CSV under `path` (one sub-folder level deep) with its
# best-matching category and write each file back in place.
path = '../data/2023_articles_en'
for folder in tqdm(os.listdir(path)):
    folder_path = os.path.join(path, folder)
    # Stray files next to the folders (e.g. .DS_Store) would make the inner
    # os.listdir raise NotADirectoryError, so skip anything not a directory.
    if not os.path.isdir(folder_path):
        continue
    for filename in os.listdir(folder_path):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(folder_path, filename)
        article_df = pd.read_csv(file_path)
        # A file with no rows has nothing to label and would yield an empty
        # embedding array that the similarity call cannot score.
        if article_df.empty:
            continue
        # NOTE(review): eval() executes whatever text is stored in the CSV.
        # Acceptable only for trusted files; ast.literal_eval would be safer.
        article_df['ada_embedding'] = article_df['ada_embedding'].apply(eval)
        article_embeddings = np.array(article_df['ada_embedding'].tolist())
        cosine_similarity_matrix = cosine_similarity(article_embeddings, category_embeddings)
        cosine_similarity_df = pd.DataFrame(cosine_similarity_matrix, columns=category_df['name'], index=article_df.index)
        # The label is the name of the highest-scoring category column.
        cosine_similarity_df['high_label'] = cosine_similarity_df.idxmax(axis=1)
        article_df['high_label'] = cosine_similarity_df['high_label']
        article_df.to_csv(file_path, index=False)

100%|██████████| 10/10 [02:03<00:00, 12.40s/it]
