In [15]:
from rdflib import Graph
from tqdm import tqdm  # Ensure you have tqdm installed for progress bar
import logging
import requests
import pandas as pd
from IPython.display import display
import random


# Configure the root logger with an INFO threshold so the logging.info calls
# below are emitted. Note: basicConfig does nothing if the root logger already
# has handlers attached.
logging.basicConfig(level=logging.INFO)

In [75]:
# We want to use the terms we extracted from Wikipedia.
terms_file = ''  # file path to microbial index

# Read one term per line. The encoding is explicit so non-ASCII terms decode the
# same way on every platform, and blank lines are dropped because an empty term
# matches every keyword string in the regex / substring checks further down.
with open(terms_file, 'r', encoding='utf-8') as terms_handle:
    wiki_terms = [line.strip() for line in terms_handle.read().splitlines() if line.strip()]

In [76]:
def create_sparql_query(wiki_terms):
    """Build a SPARQL query selecting schema:Dataset records whose keywords match a term.

    Each term is matched literally and case-insensitively: regex metacharacters
    in a term (e.g. the dot in "E. coli") are escaped, and the resulting pattern
    is escaped again for embedding in a double-quoted SPARQL string literal.
    Blank terms are ignored, because an empty alternative in the pattern would
    match every record.

    Args:
        wiki_terms: Iterable of term strings.

    Returns:
        The SPARQL query text as a string.

    Raises:
        ValueError: If no non-blank term is supplied (the filter would
            otherwise match every dataset).
    """
    cleaned_terms = [term.strip() for term in wiki_terms if term and term.strip()]
    if not cleaned_terms:
        raise ValueError("wiki_terms must contain at least one non-blank term")

    # Backslash-escape regex metacharacters so every term is matched as plain text.
    regex_escapes = str.maketrans({char: '\\' + char for char in '\\.^$|?*+()[]{}'})
    pattern = '|'.join(term.translate(regex_escapes) for term in cleaned_terms)

    # Escape for the SPARQL string literal: backslashes first, then double quotes,
    # so the backslashes added for the quotes are not doubled again.
    query_terms = pattern.replace('\\', '\\\\').replace('"', '\\"')

    # Note: schema:creator is required by the graph pattern but is not projected.
    sparql_query = f"""
    PREFIX schema: <http://schema.org/>

    SELECT ?subject ?name ?description ?keywords
    WHERE {{
        ?subject a schema:Dataset ;
                 schema:name ?name ;
                 schema:description ?description ;
                 schema:creator ?creator;
                 schema:keywords ?keywords .
        FILTER (
            regex(?keywords, "{query_terms}", "i")
        )
    }}
    """
    return sparql_query

# Build the SPARQL query text from the loaded term list.
wiki_query = create_sparql_query(wiki_terms)


In [77]:
def query_odis_endpoint(wiki_query, timeout=(10, 600)):
    """POST a SPARQL query to the OceanInfoHub Blazegraph endpoint and return the parsed JSON.

    Args:
        wiki_query: SPARQL query text to send.
        timeout: Passed to requests.post; either a single number of seconds or
            a (connect, read) tuple. Without a timeout the call can block
            forever if the server stops responding.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If the connection or a read exceeds ``timeout``.
    """
    endpoint_url = "http://graph.oceaninfohub.org/blazegraph/namespace/oih/sparql"
    response = requests.post(
        endpoint_url,
        data={'query': wiki_query, 'format': 'json'},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

# Run the term-filtered query against the endpoint and keep the parsed JSON response.
wiki_results = query_odis_endpoint(wiki_query)


In [78]:
# Step 4: Extract nodes from the JSON response
def extract_nodes(wiki_results, wiki_terms):
    """Turn SPARQL JSON bindings into flat dicts, keeping rows whose keywords mention a term.

    A row is kept when at least one term occurs (case-insensitive substring
    match) in its keywords value. Blank terms are ignored: an empty string is a
    substring of every string and would otherwise let every row through.

    Args:
        wiki_results: Parsed response with rows under ['results']['bindings'].
        wiki_terms: Iterable of term strings.

    Returns:
        A list of dicts with the keys 'subject', 'name', 'description' and
        'keywords'. Missing name/description/keywords bindings become ''.

    Raises:
        KeyError: If a kept row has no 'subject' binding, or the response
            lacks the 'results'/'bindings' structure.
    """
    # Normalise the terms once instead of lowercasing them again for every row.
    lowered_terms = {term.strip().lower() for term in wiki_terms if term and term.strip()}

    nodes = []
    for result in wiki_results['results']['bindings']:
        keywords = result.get('keywords', {}).get('value', '')
        lowered_keywords = keywords.lower()
        if any(term in lowered_keywords for term in lowered_terms):
            nodes.append({
                'subject': result['subject']['value'],
                'name': result.get('name', {}).get('value', ''),
                'description': result.get('description', {}).get('value', ''),
                'keywords': keywords,
            })
    return nodes

In [83]:
# Filter the query results down to term-matching rows; extract_nodes needs both
# the raw results and the term list.
nodes = extract_nodes(wiki_results, wiki_terms)

In [84]:
# Report how many rows survived the term filter.
logging.info(f"Total nodes extracted: {len(nodes)}")


INFO:root:Total nodes extracted: 565676


In [85]:
# Preview the first ten extracted nodes as raw dicts.
print(nodes[:10])

[{'subject': 'https://catalogue.cioos.ca/dataset/ec330c0c-e6dd-42d2-8c7e-78197b3c5d11', 'name': 'Integrated resource management action plan for the bay and laval river sector', 'description': 'The purpose of this project is to promote the protection, enhancement and sustainable use of the resources of forest and aquatic wildlife habitats in the bay and Laval river areas. This approach follows the acquisition of knowledge of 13 sites of ecological interest, the results of which can be found in the Intervention guide for the protection and enhancement of coastal habitats of interest. Of all the habitats in the lower estuary, the relative ecological importance of the Laval bay area was considered to be very high. In view of the presence of several species of fish of interest in the bay and the Laval river, including salmon, sea trout and smelt, one of the most important clam production areas in Quebec, Important colonies of birds on Laval island, which has been granted wildlife sanctuary 

In [81]:
# Randomly select 50 nodes. No seed is set, so each run draws a different
# sample, and random.sample raises ValueError if fewer than 50 nodes exist.
# NOTE(review): the following cell assigns random_nodes again with a fresh
# sample, so this draw looks redundant — confirm before removing.
random_nodes = random.sample(nodes, 50)

In [86]:

# Draw up to 50 nodes at random. The size is clamped to the number of nodes
# available because random.sample raises ValueError when asked for more items
# than the population holds.
sample_size = min(50, len(nodes))
random_nodes = random.sample(nodes, sample_size)

# Convert the list of sampled node dicts to a DataFrame
random_nodes_df = pd.DataFrame(random_nodes)

# Save the DataFrame to a CSV file (without the index column)
random_nodes_df.to_csv('wiki_subset_50_random.csv', index=False)

# Print the sampled DataFrame to confirm its contents
print(random_nodes_df)


                                              subject  \
0   https://catalogue.cioos.ca/dataset/06281b0d-52...   
1   https://catalogue.cioos.ca/dataset/ad06f0b9-0e...   
2   https://catalogue.cioos.ca/dataset/ca-cioos_79...   
3   https://catalogue.cioos.ca/dataset/ca-cioos_9c...   
4   https://catalogue.cioos.ca/dataset/ca-cioos_c8...   
5   https://catalogue.cioos.ca/dataset/8d4296e0-7b...   
6   https://catalogue.cioos.ca/dataset/ca-cioos_80...   
7   https://catalogue.cioos.ca/dataset/fb1009b1-22...   
8   https://catalogue.cioos.ca/dataset/ca-cioos_19...   
9   https://catalogue.cioos.ca/dataset/ca-cioos_9c...   
10  https://catalogue.cioos.ca/dataset/ca-cioos_89...   
11  https://catalogue.cioos.ca/dataset/d75b491e-ad...   
12  https://catalogue.cioos.ca/dataset/ca-cioos_89...   
13  https://catalogue.cioos.ca/dataset/ca-cioos_93...   
14  https://catalogue.cioos.ca/dataset/0fb1bc5e-0e...   
15  https://catalogue.cioos.ca/dataset/ca-cioos_f8...   
16  https://catalogue.cioos.ca/

In [61]:
# Convert the list of sampled node dictionaries into a pandas DataFrame
# (one row per node, one column per dict key).
wiki_subset_50_random = pd.DataFrame(random_nodes)



In [62]:
# Display the DataFrame
print(wiki_subset_50_random)

# Save to a comma-separated CSV file (index column omitted) for visualization.
# This writes to the same path, 'wiki_subset_50_random.csv', that the earlier
# random_nodes_df cell writes to, so whichever cell runs last determines the
# file's contents.
wiki_subset_50_random.to_csv('wiki_subset_50_random.csv', index=False)


                                              subject  \
0   https://catalogue.cioos.ca/dataset/ca-cioos_28...   
1   https://catalogue.cioos.ca/dataset/ca-cioos_6c...   
2   https://catalogue.cioos.ca/dataset/d549d183-e8...   
3   https://catalogue.cioos.ca/dataset/33356c46-ef...   
4   https://catalogue.cioos.ca/dataset/ca-cioos_96...   
..                                                ...   
95  https://catalogue.cioos.ca/dataset/e373c100-16...   
96  https://catalogue.cioos.ca/dataset/ca-cioos_22...   
97  https://catalogue.cioos.ca/dataset/ca-cioos_76...   
98  https://catalogue.cioos.ca/dataset/ca-cioos_96...   
99  https://catalogue.cioos.ca/dataset/ca-cioos_47...   

                                                 name  \
0   Nearshore Macrophyte Stable Isotopes - BC Cent...   
1   Surface water CO2 parameters collected by Alas...   
2   Barkley Canyon Axis  Capteur d'Oxyg��ne d��plo...   
3   Strait of Georgia Central Acoustic Doppler Cur...   
4   Multiparametric decision s