In [None]:
import pandas as pd
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

import date_utils
from meta_constants import MetaColumns
import wiki_data_sparql

In [None]:
# Fetch the LOTUS compound-taxon relations through the project helper module
# (what exactly it queries is defined in wiki_data_sparql, not in this notebook) and display them.
df = wiki_data_sparql.query_dataframe_lotus_compound_taxon_relations()
df

In [None]:
from rdkit_mol_identifiers import split_inchikey

# Drop the "_x" taxon name, expose the "_y" one as plain "taxon_name", and order rows by InChIKey.
df = (
    df.drop(columns=["taxon_name_x"])
    .rename(columns={"taxon_name_y": "taxon_name"})
    .sort_values(["inchikey"])
)
# One split_inchikey() result per row, assigned positionally in the sorted row order.
df["split_inchikey"] = [split_inchikey(key) for key in df["inchikey"]]
df

In [None]:
import pandas_utils
# Persist the same frame twice, once under a .parquet and once under a .csv file name
# (presumably save_dataframe picks the format from the extension — verify in pandas_utils).
pandas_utils.save_dataframe(df, "../data/lotus_download_nb.parquet")
pandas_utils.save_dataframe(df, "../data/lotus_download_nb.csv")

In [None]:

# Wikidata SPARQL endpoint URL (the helper functions in this notebook use the same URL as their default).
wikidata_sparql_url = "https://query.wikidata.org/sparql"

# Variant of the structure-organism query that also asks for each taxon's parent taxon.
# Kept for reference only: the original author notes that it fails as "too long"
# (presumably a timeout on the public endpoint — TODO confirm). Note the LIMIT 10 at the end.
lotus_sparql_with_parents = """#title: Which are the available referenced structure-organism pairs on Wikidata?
SELECT DISTINCT ?structure ?inchikey ?taxon ?taxon_name ?taxon_rank ?reference ?reference_doi ?parent_taxon ?parent_taxon_name WHERE {
  ?structure p:P703 ?statement.
  ?statement ps:P703 ?taxon.
  ?statement (prov:wasDerivedFrom/pr:P248) ?reference.
  ?structure wdt:P234 ?inchi.
  ?structure wdt:P235 ?inchikey.
  ?reference wdt:P356 ?reference_doi.
  ?taxon wdt:P225 ?taxon_name.
  ?taxon wdt:P105 ?taxon_rank.
  ?taxon wdt:P171 ?parent_taxon.
  ?parent_taxon wdt:P225 ?parent_taxon_name.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}

LIMIT 10
"""

# Query template: for a fixed set of taxa, select each taxon's parent taxon and the parent's name;
# the parent must also match the rank and NCBI id patterns (per the variable names).
# TAXONS_PLACEHOLDER must be replaced before the query is sent, with a space-separated list of
# prefixed taxon ids, e.g. wd:Q2671785 wd:Q2671785 wd:Q2671785
# (presumably unquoted, since VALUES takes RDF terms — TODO confirm).
lotus_sparql_parents_only = """
#title: Which are the available referenced structure-organism pairs on Wikidata? Get parents
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX p: <http://www.wikidata.org/prop/>
SELECT ?taxon ?parent_taxon ?parent_taxon_name WHERE {
  VALUES ?taxon { TAXONS_PLACEHOLDER }
  ?taxon wdt:P171 ?parent_taxon.
  ?parent_taxon wdt:P225 ?parent_taxon_name;
    wdt:P105 ?parent_taxon_rank;
    wdt:P685 ?parent_ncbi_id.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
GROUP BY ?taxon ?parent_taxon ?parent_taxon_name
"""

# Main query: structure-taxon pairs whose statement carries a reference with a DOI, together with
# the structure's label and InChIKey and the taxon's name, rank label and NCBI id (per the variable names).
# No LIMIT is set, so the complete result set is requested in one call.
lotus_sparql = """
#title: Which are the available referenced structure-organism pairs on Wikidata?
SELECT DISTINCT ?structure ?structureLabel ?inchikey ?taxon ?taxon_name ?ncbi_id ?taxon_rankLabel ?reference ?reference_doi WHERE {
  ?structure p:P703 ?statement.
  ?statement ps:P703 ?taxon;
    (prov:wasDerivedFrom/pr:P248) ?reference.
  ?structure wdt:P234 ?inchi;
    wdt:P235 ?inchikey.
  ?reference wdt:P356 ?reference_doi.
  ?taxon wdt:P225 ?taxon_name;
    wdt:P105 ?taxon_rank;
    wdt:P685 ?ncbi_id.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
"""
# lotus_sparql = """
# #title: Which are the available referenced structure-organism pairs on Wikidata?
# SELECT DISTINCT ?structure ?inchikey ?taxon ?taxon_name ?taxon_rank ?reference ?reference_doi WHERE {
#   ?structure p:P703 ?statement.
#   ?statement ps:P703 ?taxon;
#     (prov:wasDerivedFrom/pr:P248) ?reference.
#   ?structure wdt:P234 ?inchi;
#     wdt:P235 ?inchikey.
#   ?reference wdt:P356 ?reference_doi.
#   ?taxon wdt:P225 ?taxon_name;
#     wdt:P105 ?taxon_rank.
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
# }
# """


In [None]:


def get_sparql_json_results(sparql_query: str, endpoint_url: str = "https://query.wikidata.org/sparql"):
    """Send a SPARQL query to ``endpoint_url`` and return the converted JSON response.

    :param sparql_query: SPARQL query text to execute.
    :param endpoint_url: SPARQL endpoint to contact; defaults to the Wikidata query service.
    :return: the object produced by ``SPARQLWrapper.queryAndConvert()`` for the JSON return format.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    # The agent string carries the running interpreter's major.minor version.
    agent_string = "WDQS-example Python/{}.{}".format(*sys.version_info[:2])
    client = SPARQLWrapper(endpoint_url, agent=agent_string)
    client.setQuery(sparql_query)
    client.setReturnFormat(JSON)
    return client.queryAndConvert()


def load_as_dataframe(sparql_query: str, endpoint_url: str = "https://query.wikidata.org/sparql") -> pd.DataFrame:
    """Run a SPARQL query and return its result bindings as a DataFrame.

    Each binding row is flattened with ``extract_values``; the ``structure``, ``taxon`` and
    ``reference`` columns (when present) are renamed to ``wiki_*_url``.

    :param sparql_query: SPARQL query text to execute.
    :param endpoint_url: SPARQL endpoint to contact; defaults to the Wikidata query service.
    :return: one row per result binding, one column per bound variable.
    """
    response = get_sparql_json_results(sparql_query, endpoint_url)
    bindings = response['results']['bindings']
    flat_rows = list(map(extract_values, bindings))
    url_column_names = {
        "structure": "wiki_structure_url",
        "taxon": "wiki_taxon_url",
        "reference": "wiki_reference_url",
    }
    return pd.DataFrame.from_dict(flat_rows).rename(columns=url_column_names)


def extract_values(result):
    """Flatten one result binding: map every key to the ``'value'`` entry of its nested mapping.

    :param result: mapping of variable name -> mapping that contains a ``'value'`` key.
    :return: dict of variable name -> that ``'value'``.
    """
    return {variable: binding['value'] for variable, binding in result.items()}


# Run the main LOTUS query (this rebinds `df`, replacing the frame from the earlier cells)
# and stamp every row with the time of this search.
df = load_as_dataframe(lotus_sparql)
df[MetaColumns.date_wikidata_lotus_search] = date_utils.iso_datetime_now()
df

In [None]:
# Write the query result to CSV without the index column.
df.to_csv("../data/lotus_wikidata.csv", index=False)

In [None]:

# NOTE(review): lotus_sparql_parents_only is sent as-is, i.e. still containing the literal
# TAXONS_PLACEHOLDER token; no substitution is visible in this notebook. The placeholder
# presumably has to be replaced with taxon ids first — TODO confirm/fix.
parents_df = load_as_dataframe(lotus_sparql_parents_only)
parents_df

In [None]:
# Keep a single parent row per taxon (drop_duplicates keeps the first occurrence by default).
parents_df = parents_df.drop_duplicates(["wiki_taxon_url"])

In [None]:
# Show rows that repeat an earlier (taxon, structure, reference) combination.
df[df.duplicated(["wiki_taxon_url", "wiki_structure_url", "wiki_reference_url"])]

In [None]:
# Preview of an inner join (merge's default) with the parents; the result is only displayed, not assigned.
df.merge(parents_df, on="wiki_taxon_url")

In [None]:
# Display df; the preceding merge did not assign its result, so df has no parent columns here.
df

In [None]:

# Writes to the same path as the earlier to_csv cell, overwriting that file.
df.to_csv("../data/lotus_wikidata.csv", index=False)

In [None]:
# NOTE(review): df10 is not defined anywhere in the visible part of this notebook, so this cell
# raises NameError on a fresh kernel unless df10 comes from elsewhere — TODO confirm or remove.
# Side-by-side comparison: last 5 rows of df10 (columns suffixed "l") vs first 5 rows of df (suffixed "r").
co = df10.tail(5).reset_index(drop=True).join(df.head(5).reset_index(drop=True), rsuffix="r", lsuffix="l")
co[["inchikeyl", 'inchikeyr']]


In [None]:
# wiki_data_sparql is already imported in the first cell; this repeated import is a no-op.
import wiki_data_sparql
# Fetch the compound-taxon relations again, this time under the name taxondf.
taxondf = wiki_data_sparql.query_dataframe_lotus_compound_taxon_relations()
taxondf

In [None]:

def divide_chunks(items, chunk_size):
    """Split ``items`` into consecutive slices of at most ``chunk_size`` elements.

    :param items: any sliceable object that supports ``len()`` (e.g. a list or a pandas Series).
    :param chunk_size: maximum number of elements per chunk; must be positive.
    :return: list of slices in original order; the last one may be shorter. Empty input gives ``[]``.
    :raises ValueError: if ``chunk_size`` is zero or negative.
    """
    # A negative step makes range() empty, which would silently drop every item;
    # reject it explicitly (zero already raised ValueError inside range()).
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer, got %r" % (chunk_size,))
    return [items[start:start + chunk_size] for start in range(0, len(items), chunk_size)]

def query_dataframe_parent_taxon_ncbi(taxon_list, chunk_size=250):
    """Query parent-taxon information for ``taxon_list`` in chunks and concatenate the results.

    One query is built per chunk with ``wiki_data_sparql._get_parent_taxon_ncbi_sparql`` and
    executed with ``wiki_data_sparql.load_as_dataframe``.

    :param taxon_list: sized, sliceable collection of taxon identifiers (e.g. a DataFrame column).
    :param chunk_size: number of taxa per query; defaults to the previously hard-coded 250.
    :return: the per-chunk DataFrames concatenated in order (the per-chunk indices are kept);
        an empty DataFrame when ``taxon_list`` is empty.
    """
    # pd.concat raises ValueError on an empty list of frames, so short-circuit empty input.
    if len(taxon_list) == 0:
        return pd.DataFrame()
    chunks = divide_chunks(taxon_list, chunk_size)
    queries = [wiki_data_sparql._get_parent_taxon_ncbi_sparql(chunk) for chunk in chunks]
    dfs = [wiki_data_sparql.load_as_dataframe(query) for query in queries]
    return pd.concat(dfs, sort=False)

# Trial run of the chunked parent lookup on the taxon column of the first 1000 rows only.
parents = query_dataframe_parent_taxon_ncbi(taxondf.head(1000)["taxon"])
parents

In [None]:
# Work on an independent copy of the first 1000 relations.
head = taxondf.head(1000).copy()
# Left-join the parents, then keep one row per (taxon, structure, reference); sorting by
# parent_ncbi_id first decides which duplicate is kept. The result is displayed, not assigned.
head.merge(parents, on="taxon", how="left").sort_values(["parent_ncbi_id"]).drop_duplicates(["taxon", "structure", "reference"])


In [None]:
# Display head without fully duplicated rows (the result is not assigned, head stays unchanged).
head.drop_duplicates()

In [None]:
import pandas_utils
# NOTE(review): called with the taxon column only; whether pandas_utils.divide_chunks needs a
# chunk-size argument (as the notebook-local divide_chunks does) cannot be seen here — TODO confirm.
pandas_utils.divide_chunks(taxondf["taxon"])

In [None]:
def divide(items, chunk_size):
    """Cut ``items`` into consecutive slices of ``chunk_size`` elements (the last may be shorter).

    :param items: sliceable object that supports ``len()``.
    :param chunk_size: step between slice starts, also the slice length.
    :return: list of the slices, in order.
    """
    chunk_starts = range(0, len(items), chunk_size)
    return [items[begin:begin + chunk_size] for begin in chunk_starts]

# Split the taxon column into slices of at most 100 entries each.
chunks = divide(taxondf["taxon"], 100)