In [56]:
import torch

In [5]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the 'all-MiniLM-L6-v2' sentence-transformer model used by the rest of the notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every question in df_lcquad.
# NOTE: df_lcquad is built in a cell further down this notebook; run that cell first.
lcquad_questions = df_lcquad['question'].tolist()
embeddings = model.encode(lcquad_questions)


  from .autonotebook import tqdm as notebook_tqdm


In [21]:
def find_similar_sentence(query, embeddings, df):
    """Return the row of ``df`` whose question is most similar to ``query``.

    The query is encoded with the notebook-level ``model`` and compared by
    cosine similarity against ``embeddings``. The position of the best score
    is then looked up positionally (``iloc``) in ``df``.

    Args:
        query: The question text to search for.
        embeddings: Precomputed question embeddings; entry ``i`` must belong
            to row ``i`` of ``df``.
        df: DataFrame with ``question``, ``query``, ``entities`` and
            ``relations`` columns.

    Returns:
        A tuple ``(similar_sentence, sparql_query, entities, relations)``
        taken from the best-matching row.

    Raises:
        ValueError: If ``df`` is empty, or if ``embeddings`` does not contain
            exactly one entry per row of ``df``.
    """
    # Position i in `embeddings` is used as position i in `df`, so the two must
    # line up. Without this check a mismatch either raises an opaque IndexError
    # or silently returns an unrelated row.
    if len(embeddings) != len(df):
        raise ValueError(
            f"embeddings has {len(embeddings)} entries but df has {len(df)} rows; "
            "encode the 'question' column of the same DataFrame that is passed in."
        )
    if len(df) == 0:
        raise ValueError("Cannot search an empty DataFrame.")

    # Encode the query sentence to get its embedding
    query_embedding = model.encode([query])[0]

    # Cosine similarity between the query and every stored embedding
    similarities = cosine_similarity([query_embedding], embeddings)[0]

    # Position of the highest-scoring sentence
    closest_index = int(np.argmax(similarities))

    # Fetch the matching row once instead of re-indexing the frame per column
    best_row = df.iloc[closest_index]

    return best_row['question'], best_row['query'], best_row['entities'], best_row['relations']

# Example query
query_sentence = "what is the capital of germany?"

# Encode the questions of the same frame that is searched, so that position i
# in the embeddings corresponds to row i of concatenated_df. The notebook-level
# `embeddings` is built from df_lcquad and does not line up with this frame.
concatenated_embeddings = model.encode(concatenated_df['question'].tolist())

similar_sentence, sparql_query, entities, relations = find_similar_sentence(
    query_sentence, concatenated_embeddings, concatenated_df
)
print("Similar Sentence:", similar_sentence)
print("SPARQL Query:", sparql_query)
print("entities:", entities)
print(" relations:", relations)


Similar Sentence: what is the smallest city by area in germany?
SPARQL Query: PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX skos: <http://www.w3.org/2004/02/skos/core#> PREFIX dbc: <http://dbpedia.org/resource/Category:> PREFIX dct: <http://purl.org/dc/terms/> SELECT ?city WHERE { ?m skos:broader dbc:Cities_in_Germany . ?city dct:subject ?m ; dbo:areaTotal ?area } ORDER BY ?area LIMIT 1
entities: ['Cities_in_Germany']
 relations: ['areaTotal']


In [2]:
import pandas as pd
import json
# Read the training examples from temp/train_lcquad2.json.
with open('temp/train_lcquad2.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Keep the question text, its Wikidata SPARQL query, and the entity/relation labels.
rows = [
    {
        "question": record['question'],
        "query": record['sparql_wikidata'],
        "entities": record['new_LabelsEnt'],
        "relations": record['newPredLabels'],
    }
    for record in data
]

# One row per example, columns: question, query, entities, relations.
df_lcquad = pd.DataFrame(rows)

In [3]:
# Number of rows in df_lcquad.
df_lcquad.shape[0]

21497

In [4]:
# Preview the first row of df_lcquad.
df_lcquad.iloc[:1]

Unnamed: 0,question,query,entities,relations
0,What periodical literature does Delta Air Line...,select distinct ?obj where { wd:Q188920 wdt:P...,"[delta air lines: Q188920, periodical: Q1002697]","[house publication: P2813, instance of: P31]"


In [6]:
# Persist df_lcquad as a parquet file.
wikidata_examples_path = 'temp/wikidata_examples.parquet'
df_lcquad.to_parquet(wikidata_examples_path)

In [7]:
import pickle

# Serialize the `embeddings` object to temp/embeddings_wikidata.pkl with pickle.
with open('temp/embeddings_wikidata.pkl', 'wb') as pickle_out:
    pickle.dump(embeddings, pickle_out)


In [10]:
# Read the training examples from temp/train_qald.json.
with open('temp/train_qald.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Keep the English question, its SPARQL query, and the entity/relation lists.
rows = [
    {
        "question": record['en_ques'],
        "query": record['sparql'],
        "entities": record['entities'],
        "relations": record['relations'],
    }
    for record in data
]

# One row per example, columns: question, query, entities, relations.
df_qald9 = pd.DataFrame(rows)

In [12]:
# Number of rows in df_qald9.
df_qald9.shape[0]

350

In [13]:
# Read the training examples from temp/train_vquanda.json.
with open('temp/train_vquanda.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Keep the English question, its SPARQL query, and the entity/relation lists.
rows = [
    {
        "question": record['en_ques'],
        "query": record['sparql'],
        "entities": record['entities'],
        "relations": record['relations'],
    }
    for record in data
]

# One row per example, columns: question, query, entities, relations.
df_vquanda = pd.DataFrame(rows)

In [14]:
# Number of rows in df_vquanda.
df_vquanda.shape[0]

3500

In [15]:
# Preview the first row of df_vquanda.
df_vquanda.iloc[:1]

Unnamed: 0,question,query,entities,relations
0,Which universities are alma mater to Charles P...,SELECT DISTINCT ?uri WHERE { <http://dbpedia.o...,[Charles_Plosser],[almaMater]


In [16]:
# Stack the QALD and VQuAnDa examples into one frame. ignore_index=True gives
# the result a fresh 0..n-1 index; without it both inputs keep their own
# labels (0, 1, 2, ... twice), so a label-based lookup such as
# concatenated_df.loc[0] would return two rows.
concatenated_df = pd.concat([df_qald9, df_vquanda], axis=0, ignore_index=True)

In [17]:
# Number of rows in concatenated_df.
concatenated_df.shape[0]

3850