# Data Preprocessing

This notebook performs and documents the data preprocessing. The final results are a training, validation and testing dataset that can be used for training models in the next stage.

In [None]:
import os
import shutil
from data_preprocessing_utility import filter_triples_by_uri, fix_redirections, filter_properties, filter_entities, split_dataset, concat_files

In [None]:
# file paths to the initial unprocessed files
FILEPATH_MW_BS = "data/exported_data/mutual_wikilinks_properties_both_sides.csv"
FILEPATH_MW_OS = "data/exported_data/mutual_wikilinks_properties_one_side.csv"
FILEPATH_MW_NO_PROPS = "data/exported_data/mutual_wikilinks_no_properties.csv"
FILEPATH_REMAINING_TRIPLES = "data/exported_data/remaining_triples.csv"
FILEPATH_TYPES = "data/raw_data/instance-types_inference=transitive_lang=en.ttl"

# file path of the file containing the filtered subset of property types
FILEPATH_FILTERED_PROP_TYPES = "data/processed_data/filtered_property_types.csv"

# file path of the file containing redirections of entities
FILEPATH_REDIRECTIONS = "data/raw_data/redirects_inference=transitive_lang=en.ttl"

# directory where the final processed train, validation and testing files are saved
RESULTS_DIR_FILEPATH = "data/processed_data/"
# os.makedirs also creates missing parent directories (e.g. "data/") and, with
# exist_ok=True, does nothing if the directory is already there
os.makedirs(RESULTS_DIR_FILEPATH, exist_ok=True)

# create temporary directory for saving intermediate results
# this directory including the files containing intermediate results will be deleted after the preprocessing is done
TEMP_DIR_FILEPATH = "data/temp/"
os.makedirs(TEMP_DIR_FILEPATH, exist_ok=True)

## Remove Triples With Entities Outside of DBpedia

In [None]:
# remaining triples dataset: drop every triple that involves an entity outside of DBpedia
# the intermediate result is written to the temporary directory
filter_triples_by_uri(
    uri_substring="http://dbpedia.org/resource/",
    dataset_filepath=FILEPATH_REMAINING_TRIPLES,
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}remaining_triples_no_outside.csv",
)

## Remove Career Stations

In [None]:
# remaining triples dataset: drop every triple that involves a career station
# positive_match=False is passed so that URIs containing the substring are the ones removed
filter_triples_by_uri(
    uri_substring="__CareerStation__",
    positive_match=False,
    dataset_filepath=f"{TEMP_DIR_FILEPATH}remaining_triples_no_outside.csv",
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}remaining_triples_no_cs.csv",
)

## Fix Redirections

In [None]:
# mutually wikilinked pairs with both-sided properties: fix redirections of entities
# using the redirections file (ttl format)
fix_redirections(
    redirections_filepath=FILEPATH_REDIRECTIONS,
    redirections_filetype="ttl",
    dataset_filepath=FILEPATH_MW_BS,
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}mw_bs_fixed_redirects.csv",
)

In [None]:
# mutually wikilinked pairs with one-sided properties: fix redirections of entities
# using the redirections file (ttl format)
fix_redirections(
    redirections_filepath=FILEPATH_REDIRECTIONS,
    redirections_filetype="ttl",
    dataset_filepath=FILEPATH_MW_OS,
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}mw_os_fixed_redirects.csv",
)

In [None]:
# mutually wikilinked pairs without connecting properties: fix redirections of entities
# using the redirections file (ttl format)
fix_redirections(
    redirections_filepath=FILEPATH_REDIRECTIONS,
    redirections_filetype="ttl",
    dataset_filepath=FILEPATH_MW_NO_PROPS,
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}mw_no_props_fixed_redirects.csv",
)

In [None]:
# remaining triples (career stations already removed): fix redirections of entities
# using the redirections file (ttl format)
fix_redirections(
    redirections_filepath=FILEPATH_REDIRECTIONS,
    redirections_filetype="ttl",
    dataset_filepath=f"{TEMP_DIR_FILEPATH}remaining_triples_no_cs.csv",
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}remaining_triples_fixed_redirects.csv",
)

In [None]:
# entity types dataset: fix redirections of entities
# both the types dataset and the redirections file are read as ttl;
# subjects_only=True is passed for this dataset
fix_redirections(
    redirections_filepath=FILEPATH_REDIRECTIONS,
    redirections_filetype="ttl",
    dataset_filepath=FILEPATH_TYPES,
    dataset_filetype="ttl",
    subjects_only=True,
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}types_fixed_redirects.csv",
)

## Filter Properties

To reduce the number of properties that have to be considered in the embedding creation and relation prediction, the set of property types is reduced. The dataset of filtered property types is not created in this notebook but in a separate notebook. The resulting file containing the filtered property definitions is used here to remove any property in the datasets that is not in this list. The datasets that are filtered are the datasets containing properties that connect mutually wikilinked entity pairs (one-sided and both-sided properties) and the dataset containing the remaining triples (which now excludes the connecting properties of mutually wikilinked pairs).

In [None]:
# mutual wikilinks with both-sided properties: filter the properties
# against the file of filtered property types
filter_properties(
    filtered_property_types_filepath=FILEPATH_FILTERED_PROP_TYPES,
    properties_dataset_filepath=f"{TEMP_DIR_FILEPATH}mw_bs_fixed_redirects.csv",
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_props_mw_bs.csv",
)

In [None]:
# mutual wikilinks with one-sided properties: filter the properties
# against the file of filtered property types
filter_properties(
    filtered_property_types_filepath=FILEPATH_FILTERED_PROP_TYPES,
    properties_dataset_filepath=f"{TEMP_DIR_FILEPATH}mw_os_fixed_redirects.csv",
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_props_mw_os.csv",
)

In [None]:
# remaining triples: filter the properties against the file of filtered property types
filter_properties(
    filtered_property_types_filepath=FILEPATH_FILTERED_PROP_TYPES,
    properties_dataset_filepath=f"{TEMP_DIR_FILEPATH}remaining_triples_fixed_redirects.csv",
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_props_remaining_triples.csv",
)

## Filter Entities

Not all entities that appear in the datasets containing properties that connect mutually wikilinked entity pairs are also available in the dataset containing the other remaining triples. The goal of this master's thesis is to classify relations between mutually wikilinked entity pairs that are already integrated in the graph and are not unknown. This is also reflected in the models that are chosen to predict the relation / property types between a pair of entities. The models that are used to create the knowledge graph embeddings need to have information on other triples in which an entity appears (transductive link prediction). For that reason, any pair containing an entity that is not part of the remaining triples dataset is removed from the datasets containing connecting properties of mutually wikilinked entities. Furthermore, entity pairs in the dataset of mutually wikilinked entities without any connecting properties are filtered to only contain pairs where both entities appear in the remaining triples dataset.

The same filtering is applied on the entity types dataset, to reduce it to type information on entities that are part of the remaining triples dataset.

In [None]:
# mutual wikilinks with both-sided properties: filter the entities
# the filtered remaining triples dataset serves as the entities dataset
filter_entities(
    entities_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_props_remaining_triples.csv",
    filter_subject_only=False,
    dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_props_mw_bs.csv",
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_entities_mw_bs.csv",
)

In [None]:
# mutual wikilinks with one-sided properties: filter the entities
# the filtered remaining triples dataset serves as the entities dataset
filter_entities(
    entities_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_props_remaining_triples.csv",
    filter_subject_only=False,
    dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_props_mw_os.csv",
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_entities_mw_os.csv",
)

In [None]:
# mutual wikilinks without connecting properties: filter the entities
# the filtered remaining triples dataset serves as the entities dataset;
# the output is written directly to the results directory
filter_entities(
    entities_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_props_remaining_triples.csv",
    filter_subject_only=False,
    dataset_filepath=f"{TEMP_DIR_FILEPATH}mw_no_props_fixed_redirects.csv",
    processed_dataset_filepath=f"{RESULTS_DIR_FILEPATH}mw_no_props.csv",
)

In [None]:
# entity types dataset: filter the entities
# the filtered remaining triples dataset serves as the entities dataset;
# filter_subject_only=True is passed for this dataset
filter_entities(
    entities_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_props_remaining_triples.csv",
    filter_subject_only=True,
    dataset_filepath=f"{TEMP_DIR_FILEPATH}types_fixed_redirects.csv",
    processed_dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_entities_types.csv",
)

## Create Training, Validation and Testing Datasets

To train models, validate model settings and test the overall performance, the datasets containing connecting properties of mutually wikilinked entities are split into a training, validation and testing set. The data is split so that entity pairs only appear in one of the splits. Individual entities can still appear in more than one split, which again reflects the setting of the master's thesis in which entities are not completely unknown but the relations between mutually wikilinked pairs are not all classified.

After splitting the datasets containing connecting properties between mutually wikilinked entities, the respective splits with both- and one-sided properties are combined. Furthermore, the remaining triples and entity types datasets are added to the training set of connecting properties of mutually wikilinked entities to obtain the final training set.

In [None]:
# both-sided properties of mutual wikilinks: split into train, val and test set
# random_state is fixed to 42
split_dataset(
    dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_entities_mw_bs.csv",
    val_test_fraction=5,
    random_state=42,
    trainset_filepath=f"{TEMP_DIR_FILEPATH}train_mw_bs.csv",
    valset_filepath=f"{TEMP_DIR_FILEPATH}val_mw_bs.csv",
    testset_filepath=f"{TEMP_DIR_FILEPATH}test_mw_bs.csv",
)

In [None]:
# one-sided properties of mutual wikilinks: split into train, val and test set
# random_state is fixed to 42
split_dataset(
    dataset_filepath=f"{TEMP_DIR_FILEPATH}filtered_entities_mw_os.csv",
    val_test_fraction=5,
    random_state=42,
    trainset_filepath=f"{TEMP_DIR_FILEPATH}train_mw_os.csv",
    valset_filepath=f"{TEMP_DIR_FILEPATH}val_mw_os.csv",
    testset_filepath=f"{TEMP_DIR_FILEPATH}test_mw_os.csv",
)

In [None]:
# final training set (with types)
# inputs, in order: remaining triples, entity types, training split with
# both-sided properties, training split with one-sided properties
concat_files(
    filepaths=[
        f"{TEMP_DIR_FILEPATH}{filename}"
        for filename in (
            "filtered_props_remaining_triples.csv",
            "filtered_entities_types.csv",
            "train_mw_bs.csv",
            "train_mw_os.csv",
        )
    ],
    filetypes=["csv"] * 4,
    processed_dataset_filepath=f"{RESULTS_DIR_FILEPATH}train_w_types.tsv",
)
# second training set: same inputs as above, but without the entity types
concat_files(
    filepaths=[
        f"{TEMP_DIR_FILEPATH}{filename}"
        for filename in (
            "filtered_props_remaining_triples.csv",
            "train_mw_bs.csv",
            "train_mw_os.csv",
        )
    ],
    filetypes=["csv"] * 3,
    processed_dataset_filepath=f"{RESULTS_DIR_FILEPATH}train.tsv",
)

In [None]:
# final validation set
# inputs: validation splits of mutually wikilinked entities with both- and one-sided properties
concat_files(
    filepaths=[
        f"{TEMP_DIR_FILEPATH}{filename}"
        for filename in ("val_mw_bs.csv", "val_mw_os.csv")
    ],
    filetypes=["csv"] * 2,
    processed_dataset_filepath=f"{RESULTS_DIR_FILEPATH}val.tsv",
)

In [None]:
# final testing set
# inputs: testing splits of mutually wikilinked entities with both- and one-sided properties
concat_files(
    filepaths=[
        f"{TEMP_DIR_FILEPATH}{filename}"
        for filename in ("test_mw_bs.csv", "test_mw_os.csv")
    ],
    filetypes=["csv"] * 2,
    processed_dataset_filepath=f"{RESULTS_DIR_FILEPATH}test.tsv",
)

In [None]:
# clean up: delete the temporary directory together with all intermediate files in it
shutil.rmtree(path=TEMP_DIR_FILEPATH)