In [1]:
import sys

# Make the parent directory importable, presumably so that the `src`
# package imported below can be found. Guarded so that re-running this
# cell does not keep appending duplicate entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [None]:
import pandas as pd
import os
from src.embeddings import OpenaiAdaEmbedding, BertEmbedding, AngleEmbedding

# Dataset preparation

First, we read the taxonomy from the original IPTC taxonomy file and process it.

In [7]:
# Load the subject codes sheet from the taxonomy workbook.
df = pd.read_excel(
    '../data/taxonomy/TAKSONOMIJA.xlsx',
    sheet_name='SubjectCodes',
    skiprows=2,     # skip the first two rows of the sheet
    usecols='A:E',  # keep only spreadsheet columns A through E
    nrows=938,      # read at most 938 data rows
)

Based on the Qcode of each IPTC category, we derive its hierarchy level (1, 2 or 3).

In [8]:
def get_hierarchy(x):
    """Return the hierarchy level (1, 2 or 3) encoded in a Qcode string.

    The part of ``x`` after the first ``:`` is inspected:

    * level 1 if everything after its first two characters is ``'0'``;
    * level 2 if everything after its first five characters is ``'0'``;
    * level 3 otherwise.

    Raises ``IndexError`` if ``x`` contains no ``:`` separator.
    """
    digits = x.split(':')[1]

    def only_zeros(tail):
        # An empty tail counts as "all zeros", matching `tail == '0' * len(tail)`.
        return all(ch == '0' for ch in tail)

    if only_zeros(digits[2:]):
        return 1
    if only_zeros(digits[5:]):
        return 2
    return 3

In [9]:
# Derive the hierarchy level of every category from its Qcode.
df['hierarchy'] = df['Qcode'].apply(get_hierarchy)

In [10]:
# Keep only the four columns used below and give them short names.
df = (
    df[['Qcode', 'hierarchy', 'IPTC NAME', 'IPTC DESCRIPTION']]
    .rename(columns={
        'Qcode': 'code',
        'IPTC NAME': 'name',
        'IPTC DESCRIPTION': 'description',
    })
)

# Embedding extraction

First we instantiate classes responsible for gathering:
1. Ada Embeddings from OpenAI API
2. Embeddings from `bert-base-uncased` BERT architecture 
3. Embeddings from open-source, state-of-the-art UAE-Large-V1 model https://huggingface.co/WhereIsAI/UAE-Large-V1

To generate Ada embeddings, an OpenAI API key is needed; this notebook reads it from the `OPENAI_API_KEY` environment variable. A key can be created at https://platform.openai.com/api-keys

In [6]:
# The OpenAI key is read from the environment (KeyError if it is not set).
api_key = os.environ['OPENAI_API_KEY']

# One embedder object per model used in this notebook.
openai_ada = OpenaiAdaEmbedding(api_key=api_key)
bert = BertEmbedding(model_name='bert-base-uncased')
angle = AngleEmbedding()

We extract Ada, BERT and AnglE embeddings from the category names.

In [7]:
# Embed every category name with each model, one call per row.
df['ada_embedding_name'] = df['name'].apply(openai_ada.get_embedding)
df['bert_embedding_name'] = df['name'].apply(bert.get_embedding)
# The AnglE result is converted with .tolist() before it is stored.
df['angle_embedding_name'] = df['name'].apply(
    lambda text: angle.get_embedding(text).tolist()
)

We do the same for the category descriptions, as their embeddings might be more informative.

In [8]:
# Embed every category description with each model, one call per row.
df['ada_embedding_description'] = df['description'].apply(openai_ada.get_embedding)
df['bert_embedding_description'] = df['description'].apply(bert.get_embedding)
# The AnglE result is converted with .tolist() before it is stored.
df['angle_embedding_description'] = df['description'].apply(
    lambda text: angle.get_embedding(text).tolist()
)

Finally, we save the processed taxonomy and its embeddings to CSV files.

In [10]:
# Split the embedding columns into one frame per text source.
df_descriptions = df.loc[:, [
    'ada_embedding_description',
    'bert_embedding_description',
    'angle_embedding_description',
]]
df_names = df.loc[:, [
    'ada_embedding_name',
    'bert_embedding_name',
    'angle_embedding_name',
]]

In [11]:
# Write the embedding-only frames to CSV without the row index.
df_descriptions.to_csv(
    path_or_buf='../data/taxonomy/taxonomy_descriptions_embed.csv',
    index=False,
)
df_names.to_csv(
    path_or_buf='../data/taxonomy/taxonomy_names_embed.csv',
    index=False,
)

In [9]:
# Write the full processed taxonomy (including embeddings) to CSV without the row index.
df.to_csv(
    path_or_buf='../data/taxonomy/taxonomy.csv',
    index=False,
)

In [20]:
# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,code,hierarchy,name,description,ada_embedding_name,bert_embedding_name,ada_embedding_description,bert_embedding_description
0,subj:01000000,1,"arts, culture and entertainment",Matters pertaining to the advancement and refi...,"[-0.0018561693141236901, 0.00820828415453434, ...","[0.29135567, 0.53658617, 0.14300509, 0.0661959...","[-0.003499549115076661, -0.010010791942477226,...","[0.07155024, 0.47957146, 0.17099142, -0.082546..."
1,subj:01001000,2,archaeology,Probing the past through ruins and artefacts,"[0.0008693007985129952, -0.016492612659931183,...","[0.28388408, 0.36506987, -0.4875075, -0.067497...","[0.0012796183582395315, -0.005543092731386423,...","[0.17526373, 0.40781248, -0.111298054, -0.1549..."
2,subj:01002000,2,architecture,"Designing of buildings, monuments and the spac...","[-0.003000278491526842, -0.009844981133937836,...","[0.3987167, 0.4550103, -0.3130968, -0.09459012...","[0.012280375696718693, 0.0004783767508342862, ...","[0.5307179, 0.72216964, 0.2738483, -0.08355137..."
3,subj:01003000,2,bullfighting,Classical contest pitting man against the bull,"[-0.03164760768413544, 0.004129278939217329, -...","[0.13182184, -0.13112079, -0.5677358, 0.091910...","[-0.01748625561594963, 0.007708199787884951, 0...","[-0.50239855, -0.008285609, -0.1940193, 0.0154..."
4,subj:01004000,2,festive event (including carnival),"Parades, parties, celebrations and the like no...","[-0.024428900331258774, -0.01666032150387764, ...","[-0.109260455, 0.023998955, -0.11399407, -0.23...","[-0.017328009009361267, -0.01992149092257023, ...","[0.31001565, 0.38075322, 0.11926926, 0.1893968..."
