In [None]:
import IPython

IPython.display.clear_output(wait=True)
import os
import time

import pandas as pd
from pymongo import ASCENDING, MongoClient

from alligator import Alligator

# Location of the CSV table to process (only the path is set here; nothing is read yet)
file_path = './tables/imdb_top_1000.csv'

# Open the MongoDB connection and select the working database
client = MongoClient("mongodb://gator-mongodb:27017/")
# Alternative: wipe the whole database instead of individual collections
#client.drop_database("crocodile_db")
db = client["crocodile_db"]

# Remove every collection in the database except the ones named here
preserved = ["bow_cache", "candidate_cache"]

for name in db.list_collection_names():
    if name in preserved:
        # Leave this collection untouched
        continue
    db[name].drop()
    print(f"Dropped collection: {name}")

print("All unwanted collections have been dropped.")

# Entity retrieval settings are read straight from the environment;
# a missing variable raises KeyError before the Alligator object is built.
retrieval_endpoint = os.environ["ENTITY_RETRIEVAL_ENDPOINT"]
retrieval_token = os.environ["ENTITY_RETRIEVAL_TOKEN"]

# Type hints passed as `columns_type`, grouped under "NE" and "LIT".
# NOTE(review): keys look like zero-based column indices given as strings — confirm
# against the Alligator documentation.
column_types = {
    "NE": {"0": "OTHER"},
    "LIT": {
        "1": "NUMBER",
        "2": "NUMBER",
        "3": "STRING",
        "4": "NUMBER",
        "5": "STRING",
    },
    # "IGNORED" : ["6", "9", "10", "7", "8"]
}

# Build the Alligator instance for the table at `file_path`
crocodile_instance = Alligator(
    input_csv=file_path,
    entity_retrieval_endpoint=retrieval_endpoint,
    entity_retrieval_token=retrieval_token,
    max_workers=8,
    candidate_retrieval_limit=10,
    max_candidates_in_result=3,
    batch_size=256,
    model_path="./alligator/models/default.h5",
    columns_type=column_types,
)

# Execute the entity linking run and report how long it took (seconds, via perf_counter)
started = time.perf_counter()
crocodile_instance.run()
elapsed = time.perf_counter() - started

print("Elapsed time:", elapsed)
print("Entity linking process completed.")