In [2]:
import IPython

# Clear any output left in this cell from a previous run; with wait=True the
# clearing is deferred until new output is available to replace it.
IPython.display.clear_output(wait=True)
import os
import time

import pandas as pd
from pymongo import ASCENDING, MongoClient

from alligator import Alligator

# Location of the CSV table to annotate. Nothing is loaded here; the path is
# only handed to Alligator as `input_csv` further down in this cell.
file_path = './tables/imdb_top_1000.csv'

# Connect to MongoDB and select the working database.
# (A full reset would be `client.drop_database("alligator_db")`; it is not
# done here so that the cache collections listed below survive between runs.)
client = MongoClient("mongodb://gator-mongodb:27017/")
db = client["alligator_db"]

# Start from a clean slate: remove every collection except the two caches.
collections_to_keep = ["bow_cache", "candidate_cache"]
all_collections = db.list_collection_names()

# Work out what has to go first, then drop and report each collection in turn.
stale_collections = [name for name in all_collections if name not in collections_to_keep]
for collection in stale_collections:
    db[collection].drop()
    print(f"Dropped collection: {collection}")

print("All unwanted collections have been dropped.")

# Type hints for the table columns, passed to Alligator as `columns_type`.
# NOTE(review): keys look like zero-based column indices given as strings, and
# "NE" / "LIT" presumably separate named-entity columns from literal columns —
# confirm against the Alligator documentation.
column_type_hints = {
    "NE": {"0": "OTHER", "7": "OTHER"},
    "LIT": {
        "1": "NUMBER",
        "2": "NUMBER",
        "3": "STRING",
        "4": "NUMBER",
        "5": "STRING",
    },
    # Currently disabled:
    # "IGNORED": ["6", "9", "10", "7", "8"],
}

# Build the entity-linking pipeline. The retrieval endpoints and the access
# token are read straight from the environment, so a missing variable raises
# KeyError right here rather than later during the run.
crocodile_instance = Alligator(
    input_csv=file_path,
    entity_retrieval_endpoint=os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    entity_retrieval_token=os.environ["ENTITY_RETRIEVAL_TOKEN"],
    object_retrieval_endpoint=os.environ["OBJECT_RETRIEVAL_ENDPOINT"],
    literal_retrieval_endpoint=os.environ["LITERAL_RETRIEVAL_ENDPOINT"],
    max_workers=8,
    candidate_retrieval_limit=10,
    max_candidates_in_result=3,
    batch_size=256,
    model_path="./alligator/models/default.h5",
    columns_type=column_type_hints,
)

# Execute the entity linking and report the wall-clock duration in seconds.
tic = time.perf_counter()
crocodile_instance.run()
toc = time.perf_counter()
elapsed = toc - tic
print("Elapsed time:", elapsed)
print("Entity linking process completed.")

Dropped collection: input_data
Dropped collection: object_cache
Dropped collection: literal_cache
All unwanted collections have been dropped.
Onboarding unknown rows for dataset 'e7e5487ffea34cdcbc52a40d3a32affa', table 'imdb_top_1000'
Chunk 1: Processed 1000 rows (total: 1000) (16084.7 rows/sec)
Data onboarding complete for dataset 'e7e5487ffea34cdcbc52a40d3a32affa' and table 'imdb_top_1000'
Onboarded 1000 rows in 0.06 seconds (16008.7 rows/sec)
Found 1000 tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
ML ranking for stage rank progress: 0/1000 documentsML ranking for stage rank progress: 0/1000 documents

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
ML ranking for stage rank progress: 256/1000 documents
ML ra

In [1]:
import os
from alligator.fetchers import ObjectFetcher
object_retrieval_endpoint=os.environ["OBJECT_RETRIEVAL_ENDPOINT"]
literal_retrieval_endpoint=os.environ["LITERAL_RETRIEVAL_ENDPOINT"]
entity_retrieval_token=os.environ["ENTITY_RETRIEVAL_TOKEN"]
object_retrieval_endpoint
o = ObjectFetcher(object_retrieval_endpoint, entity_retrieval_token)
await o.fetch_objects(["Q90"])

Fetching objects for 0 entities


{'Q90': {'objects': {'Q8253667': ['P1151'],
   'Q8964470': ['P1792'],
   'Q12371988': ['P1313'],
   'Q1465786': ['P1313'],
   'Q235863': ['P417'],
   'Q142': ['P17', 'P17', 'P1376'],
   'Q70972': ['P17', 'P1376', 'P131'],
   'Q146246': ['P17', 'P1376'],
   'Q2748708': ['P17'],
   'Q71084': ['P17'],
   'Q58296': ['P17', 'P1376'],
   'Q18220037': ['P1424'],
   'Q13917': ['P1376', 'P361', 'P131'],
   'Q71092': ['P1376'],
   'Q106577': ['P1376'],
   'Q207162': ['P1376'],
   'Q212429': ['P1376'],
   'Q1142326': ['P1376', 'P1365', 'P131'],
   'Q16665915': ['P1376', 'P131'],
   'Q69829': ['P1376'],
   'Q58326': ['P1376'],
   'Q133132': ['P1376', 'P793'],
   'Q88521107': ['P1376'],
   'Q179023': ['P1376'],
   'Q21129738': ['P868'],
   'Q845625': ['P2184'],
   'Q1471': ['P206'],
   'Q810526': ['P206'],
   'Q860172': ['P206'],
   'Q1032646': ['P206'],
   'Q1403319': ['P1456'],
   'Q3252156': ['P1456'],
   'Q6655': ['P421'],
   'Q6723': ['P421'],
   'Q220': ['P190'],
   'Q1490': ['P190'],
   'Q34