In [18]:
import pandas as pd
from shared.dm.utils.dataset import load_dataset

def write_entities_file(dataset_config: dict, filepath: str):
    """Export the entities of one dataset to a tab-separated file.

    Args:
        dataset_config: configuration dict handed unchanged to `load_dataset`.
        filepath: destination of the TSV file; written without the index column.
    """
    # NOTE(review): the role of load_dataset's second argument is not visible
    # from this notebook; an empty DataFrame is passed - confirm against its docs.
    placeholder_frame = pd.DataFrame()
    entities = load_dataset(dataset_config, placeholder_frame).get_entities()
    entities.to_csv(filepath, index=False, sep='\t')

In [20]:
def _build_task_config(task_name: str) -> dict:
    """Return the export settings for the task stored under `./tasks/dm-<task_name>/`.

    The result holds the target path of the entities file ('filepath') and the
    'dataset_config' dict that is passed to `write_entities_file`.
    """
    task_dir = f'./tasks/dm-{task_name}'
    return {
        'filepath': f'{task_dir}/entities.tsv',
        'dataset_config': {
            'format': 'tsv',
            # Built inside the function so every task gets its own list object.
            'entity_keys': ["Wikidata_URI", "DBpedia15_URI", "DBpedia16_URI"],
            'data_file': f'{task_dir}/examples.tsv',
            'label': 'label'
        }
    }


# One entry per task, keyed by task name; all tasks share the same layout and
# differ only in their folder, so the entries are generated instead of repeated.
configs = {
    task_name: _build_task_config(task_name)
    for task_name in ('AAUP', 'Cities', 'Forbes', 'MetacriticAlbums', 'MetacriticMovies')
}

# Write an entities.tsv next to each task's examples.tsv.
for cfg in configs.values():
    write_entities_file(cfg['dataset_config'], cfg['filepath'])

In [29]:
# TODO: extend to all tasks (not only those in `configs`)
# TODO: access entities.tsv files from docker containers
def create_merged_entity_mapping_file():
    """Combine the entities files of all tasks in `configs` into one TSV.

    Every task's entities file (its 'filepath' entry) is read and folded into a
    shared list of merged entries via `_add_entities_to_mapping_dict`. The
    merged entries are then written to 'entities_to_map.tsv' in the current
    working directory, tab-separated and without the index column.
    """
    merged_entries = []
    entries_by_id = {}
    for task_cfg in configs.values():
        task_entities = pd.read_csv(task_cfg['filepath'], header=0, sep='\t')
        _add_entities_to_mapping_dict(task_entities, merged_entries, entries_by_id)
    pd.DataFrame(merged_entries).to_csv('entities_to_map.tsv', sep='\t', index=False)

def _add_entities_to_mapping_dict(ents: pd.DataFrame, mapped_ents: list, mapping_dict: dict):
    for _, row in ents.iterrows():
        entity_ids = {k: v for k, v in row.items() if v}
        # use existing mapping_dict entry (if existing), otherwise create new and add to mapped ents
        mapping_dict_entry = {}
        found_existing = False
        for entity_id in entity_ids.values():
            if entity_id in mapping_dict:
                mapping_dict_entry = mapping_dict[entity_id]
                found_existing = True
                break
        if not found_existing:
            mapped_ents.append(mapping_dict_entry)
        # update and index mapping-dict entry
        mapping_dict_entry |= entity_ids
        for entity_id in entity_ids:
            mapping_dict[entity_id] = mapping_dict_entry

# Run the merge; this writes entities_to_map.tsv to the current working directory.
create_merged_entity_mapping_file()

In [30]:
def create_mapping_for_dbp50k(
    entities_path: str = './entities_to_map.tsv',
    embeddings_path: str = './kg/dbpedia50k/embeddings/entity_embeddings.tsv',
    output_path: str = './kg/dbpedia50k/entity_mapping.tsv',
    uri_prefix: str = 'http://dbpedia.org/resource/',
):
    """Write the entity mapping restricted to entities that have an embedding.

    Reads the merged entity file, derives a 'source' column from the
    'DBpedia16_URI' column by removing `uri_prefix`, keeps only the rows whose
    'source' occurs in the index (first column) of the embeddings file, and
    writes them to `output_path` as a TSV without the index column.

    Args:
        entities_path: TSV with a header row and a 'DBpedia16_URI' column.
        embeddings_path: header-less TSV whose first column is used as index.
        output_path: destination of the filtered mapping.
        uri_prefix: prefix removed from the URI to obtain the 'source' name.

    Raises:
        KeyError: if the entities file has no 'DBpedia16_URI' column.
    """
    def _local_name(uri):
        # Missing URIs (NaN) and URIs outside `uri_prefix` get no name instead
        # of an arbitrary suffix that could match an embedding by accident.
        if isinstance(uri, str) and uri.startswith(uri_prefix):
            return uri[len(uri_prefix):]
        return None

    ents = pd.read_csv(entities_path, header=0, sep='\t')
    ents['source'] = ents['DBpedia16_URI'].map(_local_name)
    dbpedia50k_embeddings = pd.read_csv(embeddings_path, sep='\t', header=None, index_col=0)
    # notna() keeps rows without a name out even if the embeddings index
    # itself contains a missing value.
    has_embedding = ents['source'].notna() & ents['source'].isin(dbpedia50k_embeddings.index)
    ents = ents[has_embedding]
    ents.to_csv(output_path, index=False, sep='\t')
# Run the DBpedia50k filter; it reads ./entities_to_map.tsv, so that file must already exist.
create_mapping_for_dbp50k()