In [None]:
!pip install transformers
!pip install -U sentence-transformers

In [None]:
import csv
import os
import time

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

In [None]:
# The file has no header row; its values are loaded into a single column named 'ID'
df = pd.read_csv('./adressa/mind_format/train/wikidata_ids.txt', header=None, names=['ID'])

In [None]:
# Flat list with the value of the 'ID' column of every row
entity_ids = list(df['ID'].to_numpy())

In [None]:
# Function to create entities/relation to id variables
def create_x2id_dict(elements):
    """Map every distinct element to a consecutive integer id.

    Ids start at 0 and follow the order of first appearance. Repeated
    elements keep the id of their first occurrence, so ids are always
    unique and contiguous (0 .. number_of_distinct_elements - 1).

    Args:
        elements: iterable of hashable items (e.g. entity or relation ids).

    Returns:
        dict mapping element -> integer id.
    """
    result = {}
    for e in elements:
        # setdefault leaves an existing entry alone; a plain assignment would
        # overwrite a repeated element with len(result) and collide with the
        # id handed to the next new element.
        result.setdefault(e, len(result))

    return result

In [None]:
def get_description(id, max_tries=100, timeout=30):
    """Fetch the English label and description of a Wikidata id via SPARQL.

    Args:
        id: Wikidata identifier interpolated after the ``wd:`` prefix of the
            query. (The name shadows the builtin but is kept for callers.)
        max_tries: maximum number of HTTP attempts before giving up.
        timeout: per-request timeout in seconds, so a stalled connection
            cannot hang the notebook.

    Returns:
        ``label + ' ' + description`` (the description part is empty when the
        id has no English description), or ``None`` when the id has no
        English label or every attempt failed.
    """
    query = f"""
    SELECT ?Label ?Description
    WHERE 
    {{
      wd:{id} rdfs:label ?Label .
      FILTER (LANG(?Label) = "en").
      OPTIONAL {{ wd:{id} schema:description ?Description . FILTER (LANG(?Description) = "en") }}
    }}
    """
    # NOTE(review): Wikimedia asks API clients to send a descriptive
    # User-Agent header - consider passing `headers=` here.
    for attempt in range(max_tries):
        try:
            response = requests.get(
                "https://query.wikidata.org/sparql",
                params={'query': query, 'format': 'json'},
                timeout=timeout,
            )
            response.raise_for_status()
            bindings = response.json()['results']['bindings']
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Network error, HTTP error status (e.g. rate limiting) or a
            # malformed body: wait with a capped exponential back-off instead
            # of hammering the endpoint, then try again.
            if attempt + 1 < max_tries:
                time.sleep(min(0.5 * 2 ** attempt, 5.0))
            continue

        if not bindings:
            # Valid answer without rows: the id has no English label.
            # Retrying cannot change that, so give up immediately.
            return None

        first = bindings[0]
        label = first.get('Label', {}).get('value')
        if label is None:
            return None
        description = first.get('Description', {}).get('value', '')
        return label + ' ' + description
    return None

In [None]:
# Load the pre-trained sentence-embedding model; use the first GPU when one is
# available and fall back to the CPU otherwise
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-mpnet-base-v2').to(device)
# Ids of the entities for which get_description returned None
bad_entities = []
# Look up an entity's text description and encode it with the given model
def extract_embeddings(entity_id, model):
    """Return ``(description, embedding)`` for one entity.

    The description comes from ``get_description``; the embedding is
    ``model.encode(description)``. When no description is available the id is
    recorded in the module-level ``bad_entities`` list and ``(None, None)`` is
    returned.
    """
    description = get_description(entity_id)
    if description is None:
        bad_entities.append(entity_id)
        return None, None

    return description, model.encode(description)

In [None]:
# Fetch description and embedding for every entity; both lists stay aligned
# with entity_ids and hold None where the lookup failed
entities_descriptions = []
entity_embeddings = []
for entity_id in tqdm(entity_ids):
    entity_description, sentence_embeddings = extract_embeddings(entity_id, model)
    entities_descriptions.append(entity_description)
    entity_embeddings.append(sentence_embeddings)

In [None]:
# Re-encode the stored descriptions (not the raw ids). Entities without a
# description keep a None placeholder so positions stay aligned with
# entity_ids; the placeholders are dropped before the vectors are saved.
entity_embeddings = []
for entity_des in tqdm(entities_descriptions):
    if entity_des is None:
        entity_embeddings.append(None)
    else:
        phrase_emb = model.encode(entity_des)
        entity_embeddings.append(phrase_emb)

In [None]:
# Cache the fetched descriptions on disk, one per line (entities without a
# description are written as the text 'None'). UTF-8 is requested explicitly
# because labels/descriptions may contain non-ASCII characters and the
# platform default encoding is not guaranteed to handle them.
with open('description.txt', 'w', encoding='utf-8') as fp:
    for line in entities_descriptions:
        fp.write(str(line)+'\n')

In [None]:
# Keep only the entities whose description lookup succeeded
bad_entity_set = set(bad_entities)
entity_ids = [e for e in entity_ids if e not in bad_entity_set]

In [None]:
# Integer id for every remaining entity, in list order
entity2id_dict = create_x2id_dict(entity_ids)

In [None]:
# Output folder for the knowledge-graph files; created up front so the writes
# in the following cells do not fail when the folder does not exist yet
kg_folder_adressa = './adressa/kg/'
os.makedirs(kg_folder_adressa, exist_ok=True)

In [None]:
# One "<entity>\t<id>" row per entry of entity2id_dict
with open(kg_folder_adressa + 'entity2id.txt', 'w', newline='') as file:
    csv.writer(file, delimiter='\t').writerows(entity2id_dict.items())

In [None]:
# Drop the None placeholders and turn every embedding into a plain list
entity_embeddings = [list(vector) for vector in entity_embeddings if vector is not None]

In [None]:
# One embedding per row, tab-separated, six decimals.
# NOTE(review): presumably row i must belong to the entity with id i in
# entity2id.txt - confirm the two lists are still aligned at this point.
np.savetxt(kg_folder_adressa+'entity2vecd768.vec', entity_embeddings, fmt='%.6f', delimiter='\t')

In [None]:
# NOTE(review): `chunks` is reassigned with a batch size of 50 in the next
# cell before it is ever read, so this 100-item split looks like a leftover.
chunks = [entity_ids[x:x+100] for x in range(0, len(entity_ids), 100)]

In [None]:

# Define the API endpoint for retrieving information about entities
endpoint = "https://www.wikidata.org/w/api.php"

# Entities are requested in batches of 50 ids
chunks = [entity_ids[x:x+50] for x in range(0, len(entity_ids), 50)]

# Set copy of the known ids: membership is tested once per claim value below,
# which is far too slow against a list
known_entity_ids = set(entity_ids)

# entity id -> {"label", "description", "claims"}
entities = {}

for c in chunks:
    # Define the parameters for the API request; props/languages restrict the
    # payload to the fields that are actually read below
    params = {
        "action": "wbgetentities",
        "ids": "|".join(c),
        "props": "labels|descriptions|claims",
        "languages": "en",
        "format": "json"
    }

    # Send the API request; fail loudly on HTTP errors instead of parsing an
    # error page, and never wait forever on a stalled connection
    response = requests.get(endpoint, params=params, timeout=60)
    response.raise_for_status()

    # Extract the JSON data from the response
    data = response.json()
    if "entities" not in data:
        raise RuntimeError(f"wbgetentities returned no entities: {data.get('error', data)}")

    # Extract the entity information from the data
    for entity_id, entity in data["entities"].items():
        # `or {}` also covers fields that are absent or come back empty
        labels = entity.get("labels") or {}
        descriptions = entity.get("descriptions") or {}
        # label is None when the entity has no English label
        label = (labels.get("en") or {}).get("value")
        # Fall back to the label when there is no English description
        description = (descriptions.get("en") or {}).get("value", label)
        entities[entity_id] = {
            "label": label,
            "description": description,
            "claims": entity.get("claims") or {}
        }

# Define a list to store the relationships between entities
relationships = []

# Extract the relationships between entities from the entity information:
# one (source id, property id, target id) tuple per claim whose value is
# another known entity
for entity_id, entity in entities.items():
    for property_id, property_values in entity["claims"].items():
        for property_value in property_values:
            datavalue = property_value.get("mainsnak", {}).get("datavalue")
            if not isinstance(datavalue, dict):
                continue
            value = datavalue.get("value")
            # Only dict values can reference an entity; for a plain string,
            # `"id" in value` would be a substring test
            if not isinstance(value, dict):
                continue
            target_entity_id = value.get("id")
            if isinstance(target_entity_id, str) and target_entity_id in known_entity_ids:
                relationships.append((entity_id, property_id, target_entity_id))

In [None]:
# Distinct property ids used by the collected triples. Sorted so that the ids
# assigned from this list are the same on every run (iteration order of a set
# of strings can change between interpreter sessions).
relations = sorted({rel[1] for rel in relationships})

In [None]:
# Preliminary relation -> id mapping.
# NOTE(review): it is rebuilt further down after relations without a
# description are removed, so this version (and the file written from it) is
# overwritten.
relation2id_dict = create_x2id_dict(relations)

In [None]:
# One "<relation>\t<id>" row per entry of relation2id_dict
with open(kg_folder_adressa + 'relation2id.txt', 'w', newline='') as file:
    csv.writer(file, delimiter='\t').writerows(relation2id_dict.items())

In [None]:
# Ids of the relations for which get_description returned None
bad_relations = []

def extract_embeddings_relation(relation_id, model):
    """Return ``(description, embedding)`` for one relation.

    The description comes from ``get_description``; the embedding is
    ``model.encode(description)``. When no description is available the id is
    recorded in the module-level ``bad_relations`` list and ``(None, None)``
    is returned.
    """
    description = get_description(relation_id)
    if description is None:
        bad_relations.append(relation_id)
        return None, None
    return description, model.encode(description)

In [None]:
# extract_embeddings_relation returns (description, embedding) in that order;
# unpacking the other way round would store the text as the embedding.
# Both lists stay aligned with `relations` and hold None for failed lookups.
relation_embeddings = []
relation_descs = []
for relation_id in relations:
    rela_d, emb_ = extract_embeddings_relation(relation_id, model)
    relation_embeddings.append(emb_)
    relation_descs.append(rela_d)

In [None]:
# Keep only the relations whose description lookup succeeded
bad_relation_set = set(bad_relations)
relations = [r for r in relations if r not in bad_relation_set]

In [None]:
# Rebuild the relation -> id mapping from the filtered relation list
relation2id_dict = create_x2id_dict(relations)

In [None]:
# One "<relation>\t<id>" row per entry of relation2id_dict
with open(kg_folder_adressa + 'relation2id.txt', 'w', newline='') as file:
    csv.writer(file, delimiter='\t').writerows(relation2id_dict.items())

In [None]:
# Drop the None placeholders of relations without a description (list(None)
# would raise) and turn every embedding into a plain list
relation_embeddings = [list(embedding) for embedding in relation_embeddings if embedding is not None]

In [None]:
# One embedding per row, tab-separated, six decimals.
# NOTE(review): presumably row i must belong to the relation with id i in
# relation2id.txt - confirm the two lists are still aligned at this point.
np.savetxt(kg_folder_adressa+'relation2vecd768.vec', relation_embeddings, fmt='%.6f', delimiter='\t')

In [None]:
# Remove duplicate triples; the set round-trip does not preserve the original
# order of the list
relationships = list(set(relationships))

In [None]:
# Translate every (head, property, tail) triple into integer ids, emitted as
# (head id, tail id, relation id). Triples that mention an entity or relation
# without an id (e.g. a property dropped as a bad relation) are skipped
# instead of raising KeyError.
triple2id = [
    (entity2id_dict[head], entity2id_dict[tail], relation2id_dict[prop])
    for head, prop, tail in relationships
    if head in entity2id_dict and tail in entity2id_dict and prop in relation2id_dict
]

In [None]:
from operator import itemgetter
# Order the 3-tuples lexicographically by their first, second and third field
# (the natural ordering of tuples)
triple2id = sorted(triple2id)

In [None]:
# One tab-separated row per id triple
with open(kg_folder_adressa + 'triple2id.txt', 'w', newline='') as file:
    csv.writer(file, delimiter='\t').writerows(triple2id)

In [None]:
# Reduce the dimensionality of the saved embeddings to 100 using PCA

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Load the full-size entity embeddings written earlier (one row per entity)
embeddings = np.loadtxt(kg_folder_adressa+'entity2vecd768.vec', delimiter='\t')

# Apply PCA to the embeddings to reduce the dimensionality to 100.
# random_state is fixed so a re-run reproduces the same projection even when
# scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=100, random_state=0)
reduced_embeddings = pca.fit_transform(embeddings)
np.savetxt(kg_folder_adressa+'entity2vecd100.vec', reduced_embeddings, fmt='%.6f', delimiter='\t')

In [None]:
# Load the full-size relation embeddings written earlier (one row per relation)
embeddings = np.loadtxt(kg_folder_adressa+'relation2vecd768.vec', delimiter='\t')

# Apply PCA to the embeddings to reduce the dimensionality to 100.
# random_state is fixed so a re-run reproduces the same projection even when
# scikit-learn picks a randomized SVD solver.
# NOTE(review): PCA needs at least 100 rows here - confirm there are enough
# relations.
pca = PCA(n_components=100, random_state=0)
reduced_embeddings = pca.fit_transform(embeddings)
np.savetxt(kg_folder_adressa+'relation2vecd100.vec', reduced_embeddings, fmt='%.6f', delimiter='\t')