In [1]:
import pandas as pd
# Raw triples for the train and validation splits. The files are read as
# tab-separated with no header row; columns are named head / relation / tail.
train_df = pd.read_csv(
    "data/ESSUM/dbpedia/triples/train.txt",
    names=["head", "relation", "tail"],
    header=None,
    sep="\t",
)
val_df = pd.read_csv(
    "data/ESSUM/dbpedia/triples/validation.txt",
    names=["head", "relation", "tail"],
    header=None,
    sep="\t",
)

## Filter triples with URI tails only

In [3]:
import pandas as pd

# Entity-to-entity subset of the training split: the tail must start with
# 'http' and the relation must not mention homepage/website.

# Drop rows where 'tail' has NaN values
df = train_df.dropna(subset=['tail'])

# Keep rows whose 'tail' starts with 'http'. na=False makes any non-string tail
# count as "not a URI" instead of yielding NaN in the mask, which boolean
# indexing rejects.
df_filtered = df[df['tail'].str.startswith('http', na=False)]

# Exclude rows where 'relation' contains 'homepage' or 'website'. na=False keeps
# the mask strictly boolean so `~` is well defined even for a missing relation
# (same convention as the literal-filter cells).
df_filtered = df_filtered[~df_filtered['relation'].str.contains('homepage|website', case=False, na=False)]

# Save the filtered DataFrame as a headerless, tab-separated file
df_filtered.to_csv('filtered_train.txt', index=False, header=None, sep="\t")

print("Filtered DataFrame has been saved to filtered_train.txt.")


Filtered DataFrame has been saved to filtered_train.txt.


In [5]:
# Entity-to-entity subset of the validation split: the tail must start with
# 'http' and the relation must not mention homepage/website.

# Drop rows where 'tail' has NaN values
df = val_df.dropna(subset=['tail'])

# Keep rows whose 'tail' starts with 'http'. na=False makes any non-string tail
# count as "not a URI" instead of yielding NaN in the mask, which boolean
# indexing rejects.
df_filtered = df[df['tail'].str.startswith('http', na=False)]

# Exclude rows where 'relation' contains 'homepage' or 'website'. na=False keeps
# the mask strictly boolean so `~` is well defined even for a missing relation
# (same convention as the literal-filter cells).
df_filtered = df_filtered[~df_filtered['relation'].str.contains('homepage|website', case=False, na=False)]

# Save the filtered DataFrame as a headerless, tab-separated file
df_filtered.to_csv('filtered_dev.txt', index=False, header=None, sep="\t")

print("Filtered DataFrame has been saved to filtered_dev.txt.")

Filtered DataFrame has been saved to filtered_dev.txt.


## Filter triples with literal tails only (plus homepage/website relations)

In [6]:
import pandas as pd

# Literal subset of the training split: rows whose tail does not start with
# 'http', together with every row whose relation mentions homepage/website.

# Rows with a missing tail are discarded up front
df = train_df.dropna(subset=['tail'])

# Tails that are not 'http...' strings
literal_values_df = df.loc[lambda frame: ~frame['tail'].str.startswith('http', na=False)]

# Rows whose relation matches 'homepage' or 'website' (case-insensitive)
homepage_website_df = df.loc[
    lambda frame: frame['relation'].str.contains('homepage|website', case=False, na=False)
]

# Stack both selections and remove rows that appear more than once
combined_df = pd.concat([literal_values_df, homepage_website_df]).drop_duplicates()

# Write the result as a headerless, tab-separated file
combined_df.to_csv('filtered_all_literals_train.txt', index=False, header=None, sep="\t")

print("Selected DataFrame has been saved to filtered_all_literals_train")


Selected DataFrame has been saved to filtered_all_literals_train


In [7]:
import pandas as pd

# Literal subset of the validation split: rows whose tail does not start with
# 'http', together with every row whose relation mentions homepage/website.

# Rows with a missing tail are discarded up front
df = val_df.dropna(subset=['tail'])

# Tails that are not 'http...' strings
literal_values_df = df.loc[lambda frame: ~frame['tail'].str.startswith('http', na=False)]

# Rows whose relation matches 'homepage' or 'website' (case-insensitive)
homepage_website_df = df.loc[
    lambda frame: frame['relation'].str.contains('homepage|website', case=False, na=False)
]

# Stack both selections and remove rows that appear more than once
combined_df = pd.concat([literal_values_df, homepage_website_df]).drop_duplicates()

# Write the result as a headerless, tab-separated file
combined_df.to_csv('filtered_all_literals_val.txt', index=False, header=None, sep="\t")

print("Selected DataFrame has been saved to filtered_all_literals_val.txt")


Selected DataFrame has been saved to filtered_all_literals_val.txt


## Select numerical literals only

In [9]:
# Numerical literals: rows of the combined train/validation literal files whose
# tail parses as a number.
literal_train_df = pd.read_csv("filtered_all_literals_train.txt", sep="\t", header=None, names=["head", "relation", "tail"])
literal_val_df = pd.read_csv("filtered_all_literals_val.txt", sep="\t", header=None, names=["head", "relation", "tail"])
# With no `on=` argument the outer merge joins on all shared columns, so this
# yields the union of the two splits.
df = pd.merge(literal_train_df, literal_val_df, how="outer")

# Convert the 'tail' column to numeric, coercing errors to NaN
df['tail'] = pd.to_numeric(df['tail'], errors='coerce')

# Filter the DataFrame to include only rows where 'tail' is numeric (not NaN)
df_numeric_tails = df.dropna(subset=['tail'])

# Save the filtered DataFrame as a headerless, tab-separated file.
# The name previously contained a comma ('numerical_literals,txt'), so the file
# was not written where the message below says it is.
df_numeric_tails.to_csv('numerical_literals.txt', index=False, header=None, sep="\t")

print("Filtered DataFrame with numeric tails has been saved to numerical_literals.txt")

Filtered DataFrame with numeric tails has been saved to numerical_literals.txt


## Select text literals only

In [10]:
# Text literals: rows of the combined train/validation literal files whose tail
# does NOT parse as a number.
literal_train_df = pd.read_csv("filtered_all_literals_train.txt", sep="\t", header=None, names=["head", "relation", "tail"])
literal_val_df = pd.read_csv("filtered_all_literals_val.txt", sep="\t", header=None, names=["head", "relation", "tail"])
# With no `on=` argument the outer merge joins on all shared columns, so this
# yields the union of the two splits.
df = pd.merge(literal_train_df, literal_val_df, how="outer")

# Ensure all values in the 'tail' column are strings
df['tail'] = df['tail'].astype(str)

# Decide "numeric" with the same rule the numerical-literal cell uses
# (pd.to_numeric with errors='coerce'). A hand-written regex such as
# r'^-?\d+(\.\d+)?$' disagrees with it on values like '1e5', '+3' or '.5',
# which would then be written to both the numerical and the text file.
tail_is_numeric = pd.to_numeric(df['tail'], errors='coerce').notna()
df_non_numeric_tails = df[~tail_is_numeric]

# Save the filtered DataFrame as a headerless, tab-separated file
df_non_numeric_tails.to_csv('text_literals.txt', index=False, header=None, sep="\t")

print("Filtered DataFrame with non-numeric tails has been saved to text_literals.txt.")

Filtered DataFrame with non-numeric tails has been saved to text_literals.txt.


## Generate test dataset

In [11]:
# Entity-to-entity subset of the test split: the tail must start with 'http'
# and the relation must not mention homepage/website.
test_df = pd.read_csv("data/ESSUM/dbpedia/triples/test.txt", sep="\t", header=None, names=["head", "relation", "tail"])
# Drop rows where 'tail' has NaN values
df = test_df.dropna(subset=['tail'])

# Keep rows whose 'tail' starts with 'http'. na=False makes any non-string tail
# count as "not a URI" instead of yielding NaN in the mask, which boolean
# indexing rejects.
df_filtered = df[df['tail'].str.startswith('http', na=False)]

# Exclude rows where 'relation' contains 'homepage' or 'website'. na=False keeps
# the mask strictly boolean so `~` is well defined even for a missing relation
# (same convention as the literal-filter cells).
df_filtered = df_filtered[~df_filtered['relation'].str.contains('homepage|website', case=False, na=False)]

# Save the filtered DataFrame as a headerless, tab-separated file
df_filtered.to_csv('filtered_test.txt', index=False, header=None, sep="\t")

print("Filtered DataFrame has been saved to filtered_test.txt.")

Filtered DataFrame has been saved to filtered_test.txt.


## Generate entity types

In [16]:
import pandas as pd

# Load the final train/valid triples and collect the entity and relation
# vocabularies used by the type-lookup and constraint cells.
# NOTE: this rebinds train_df / val_df / df, which the filtering cells also use.
train_df = pd.read_csv("essum-dataset-for-LiteralE/final/train.txt", sep="\t", header=None, names=["head", "relation", "tail"])
val_df = pd.read_csv("essum-dataset-for-LiteralE/final/valid.txt", sep="\t", header=None, names=["head", "relation", "tail"])
# With no `on=` argument the outer merge joins on all shared columns, so this
# yields the union of the two splits.
df = pd.merge(train_df, val_df, how="outer")

# Get unique entities and relations
unique_heads = df['head'].unique().tolist()
unique_tails = df['tail'].unique().tolist()
unique_relations = df['relation'].unique().tolist()

# All distinct entities (heads and tails) in order of first appearance.
# Going through a Python set would give a different order on every run, and
# that order is what ends up in entity_types.json. Missing values are dropped
# because the type lookup runs string operations on every entity.
all_entities = pd.concat([df['head'], df['tail']]).dropna().drop_duplicates().tolist()

# Print the results
#print("Unique Heads:", unique_heads)
#print("Unique Tails:", unique_tails)
#print("All Unique Entities:", all_entities)
#print("Unique Relations:", unique_relations)


In [18]:
from SPARQLWrapper import SPARQLWrapper, JSON
import json

def extract_type_from_uri(uri):
    """Guess a coarse entity type from the shape of a DBpedia URI.

    Rules, checked in order:
      * ``dbpedia.org/resource/Category:...`` -> ``"Category"``
      * ``dbpedia.org/ontology/X``            -> ``"X"`` (last path segment)
      * ``dbpedia.org/resource/<name>``       -> keyword lookup on the words
        of ``<name>``: ``Person``; river/mountain/lake ->
        ``"GeographicalFeature"``; university/college ->
        ``"EducationalInstitution"``.

    Keywords are compared against whole words of the resource name (split on
    underscores and punctuation), not raw substrings, so "Taxi_Driver" is not
    taken for a river, nor "Blake_..." for a lake, nor "Personal_computer" for
    a person.

    Args:
        uri: The entity URI. Non-string values (e.g. NaN) are accepted.

    Returns:
        The guessed type name, or None when no rule applies (including for
        non-string input).
    """
    import re  # local import keeps this definition self-contained

    if not isinstance(uri, str):
        return None

    if "dbpedia.org/resource/Category:" in uri:
        return "Category"
    if "dbpedia.org/ontology/" in uri:
        return uri.split('/')[-1]
    if "dbpedia.org/resource/" in uri:
        # Everything after the resource prefix, so names containing '/' are
        # examined in full.
        local_name = uri.split("dbpedia.org/resource/", 1)[1]
        words = {word for word in re.split(r'[\W_]+', local_name) if word}
        if "Person" in words:
            return "Person"
        lowered_words = {word.lower() for word in words}
        if lowered_words & {"river", "mountain", "lake"}:
            return "GeographicalFeature"
        if lowered_words & {"university", "college"}:
            return "EducationalInstitution"
    return None  # No type could be determined from the URI pattern

def get_type_from_sparql(uri):
    """Ask the DBpedia SPARQL endpoint for the ontology types of ``uri``.

    The query selects the distinct types of ``<uri>`` whose IRI starts with
    ``http://dbpedia.org/ontology/``.

    Args:
        uri: Entity URI, interpolated verbatim into the query.

    Returns:
        A list with the last path segment of every returned type IRI, or
        ``["Unknown"]`` when the endpoint returns no bindings.
    """
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    query = f"""
    SELECT DISTINCT ?type WHERE {{
      <{uri}> a ?type .
      FILTER(strstarts(str(?type), "http://dbpedia.org/ontology/"))
    }}
    """
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()

    # Reduce each type IRI to its final path segment
    type_names = []
    for binding in response["results"]["bindings"]:
        type_names.append(binding['type']['value'].split('/')[-1])

    if not type_names:
        return ["Unknown"]
    return type_names

def determine_entity_type(uri):
    """Return a single type name for ``uri``.

    The URI-pattern heuristic is tried first. Only when it yields None is the
    SPARQL endpoint queried, and the first type it reports is used.

    Args:
        uri: Entity URI to classify.

    Returns:
        The type name, or ``"Unknown"`` if the SPARQL lookup returns nothing.
    """
    pattern_guess = extract_type_from_uri(uri)
    if pattern_guess is None:
        # Heuristic was inconclusive: fall back to the remote lookup
        remote_types = get_type_from_sparql(uri)
        if remote_types:
            return remote_types[0]
        return "Unknown"
    return pattern_guess

# Map every entity URI to its type; entities are resolved one by one, in the
# order they appear in all_entities
entity_types = dict(zip(all_entities, map(determine_entity_type, all_entities)))

# Persist the mapping as pretty-printed JSON
with open('entity_types.json', 'w') as out_file:
    out_file.write(json.dumps(entity_types, indent=4))

print("Entity types have been saved to entity_types.json")

Entity types have been saved to entity_types.json


## Generate range and domain constraints

In [20]:
# Convert DataFrame rows to list of tuples (triples)
triples_list = [tuple(x) for x in df.values]

# Print the triples list
for triple in triples_list[:5]:
    head, rel, tail = triple
    print('Head:', head, 'Relation:', rel, 'Tail:', tail)


Head: http://dbpedia.org/resource/173rd_Airborne_Brigade_Combat_Team Relation: http://dbpedia.org/ontology/battle Tail: http://dbpedia.org/resource/Operation_Hump
Head: http://dbpedia.org/resource/1960_Glover_Trophy Relation: http://dbpedia.org/ontology/fastestDriver Tail: http://dbpedia.org/resource/Stirling_Moss
Head: http://dbpedia.org/resource/1960_Glover_Trophy Relation: http://dbpedia.org/ontology/fastestDriverTeam Tail: http://dbpedia.org/resource/Cooper_Car_Company
Head: http://dbpedia.org/resource/1960_Glover_Trophy Relation: http://dbpedia.org/ontology/firstDriver Tail: http://dbpedia.org/resource/Innes_Ireland
Head: http://dbpedia.org/resource/1960_Glover_Trophy Relation: http://dbpedia.org/ontology/location Tail: http://dbpedia.org/resource/Goodwood_Circuit


In [21]:
from collections import defaultdict

# For every relation, collect the set of head types (domain) and tail types
# (range) observed in the triples. Entities missing from entity_types count
# as "Unknown".
domain_per_rel = defaultdict(set)
range_per_rel = defaultdict(set)

for head, rel, tail in triples_list:
    head_type = entity_types.get(head, "Unknown")
    tail_type = entity_types.get(tail, "Unknown")

    domain_per_rel[rel].add(head_type)
    range_per_rel[rel].add(tail_type)

# Example printout of domains and ranges
for rel, domains in domain_per_rel.items():
    print(f"Relation: {rel.split('/')[-1]}, Domain: {domains}, Range: {range_per_rel[rel]}")

# Sets are not JSON-serialisable, so convert them to lists. Sorting makes the
# saved files reproducible: iterating a set of strings can give a different
# order on every run. key=str keeps the sort safe if a type is not a string.
domain_dict = {k: sorted(v, key=str) for k, v in domain_per_rel.items()}
range_dict = {k: sorted(v, key=str) for k, v in range_per_rel.items()}

# Save domain_per_rel to domain_per_rel.json
with open('domain_per_rel.json', 'w') as f:
    json.dump(domain_dict, f, indent=4)

# Save range_per_rel to range_per_rel.json
with open('range_per_rel.json', 'w') as f:
    json.dump(range_dict, f, indent=4)

print("Domain and range dictionaries have been saved successfully.")


Relation: battle, Domain: {'SportsTeam', 'Unknown', 'Person', 'Agent'}, Range: {'SocietalEvent'}
Relation: fastestDriver, Domain: {'SocietalEvent'}, Range: {'Person'}
Relation: fastestDriverTeam, Domain: {'SocietalEvent'}, Range: {'Agent'}
Relation: firstDriver, Domain: {'SocietalEvent'}, Range: {'Person'}
Relation: location, Domain: {'SocietalEvent', 'WrestlingEvent', 'Stadium', 'Infrastructure', 'Place'}, Range: {'Stadium', 'Person', 'Building', 'Infrastructure', 'ArchitecturalStructure', 'Place', 'Single'}
Relation: secondDriver, Domain: {'SocietalEvent'}, Range: {'Person'}
Relation: secondTeam, Domain: {'SocietalEvent'}, Range: {'Company', 'Agent'}
Relation: thirdDriver, Domain: {'SocietalEvent'}, Range: {'Person'}
Relation: thirdTeam, Domain: {'SocietalEvent'}, Range: {'Company', 'Agent'}
Relation: subject, Domain: {'ChristianBishop', 'SocietalEvent', 'Animal', 'Mammal', 'SoccerTournament', 'EducationalInstitution', 'Fungus', 'Plant', 'Work', 'Person', 'Album', 'PeriodicalLiteratu