In [1]:
from crocodile import Crocodile
import os
# Configure the Crocodile entity linker. The retrieval endpoint, the BoW
# endpoint and the access token are read straight from the environment, so a
# missing variable raises KeyError before the constructor is called.
crocodile_settings = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "table_trace_collection_name": "table_trace",
    "dataset_trace_collection_name": "dataset_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
    "max_workers": 50,
    "candidate_retrieval_limit": 10,
}
crocodile_instance = Crocodile(**crocodile_settings)

In [6]:
# Spot-check candidate retrieval for the mention "alsaka", passing an empty
# string as the second positional argument.
# NOTE(review): qid="Q797" looks like the expected entity for this mention —
# confirm against the signature of Crocodile.fetch_candidates.
crocodile_instance.fetch_candidates("alsaka", "", qid="Q797")

[{'id': 'Q57710097',
  'name': 'Alsaka',
  'description': 'family name',
  'types': [{'id': 'Q101352', 'name': 'family name'}],
  'features': {'ntoken_mention': 1,
   'ntoken_entity': 1,
   'length_mention': 6,
   'length_entity': 6,
   'popularity': 0.0,
   'ed_score': 1.0,
   'jaccard_score': 1.0,
   'jaccardNgram_score': 1.0,
   'desc': 0.0,
   'descNgram': 0.0,
   'bow_similarity': 0.0,
   'kind': 1,
   'NERtype': 4,
   'column_NERtype': None}},
 {'id': 'Q57710056',
  'name': 'Yacoub Alsaka',
  'description': 'Professor of Engineering, University of Central Florida',
  'types': [{'id': 'Q5', 'name': 'human'},
   {'id': 'Q1622272', 'name': 'university teacher'}],
  'features': {'ntoken_mention': 1,
   'ntoken_entity': 2,
   'length_mention': 6,
   'length_entity': 13,
   'popularity': 0.0,
   'ed_score': 0.46,
   'jaccard_score': 0.5,
   'jaccardNgram_score': 0.5,
   'desc': 0.0,
   'descNgram': 0.0,
   'bow_similarity': 0.0,
   'kind': 1,
   'NERtype': 3,
   'column_NERtype': None}

In [1]:
import pandas as pd
from pymongo import MongoClient, ASCENDING
from crocodile import Crocodile
import os

# Load the table to annotate into a DataFrame. The path is relative to the
# notebook's working directory.
# NOTE(review): the column-index comments further down assume an 11-column
# layout (Series_Title ... Gross) — confirm this CSV matches it.
file_path = './tables/imdb_top_1000.csv'
df = pd.read_csv(file_path)

# Connect to MongoDB and reset the working database: every collection is
# dropped except the ones named in `collections_to_keep`.
# WARNING: destructive — re-running this cell wipes all previous results.
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]

collections_to_keep = ["bow_cache", "candidate_cache"]
all_collections = db.list_collection_names()

for collection in all_collections:
    if collection in collections_to_keep:
        continue  # cache collection: leave it untouched
    db[collection].drop()
    print(f"Dropped collection: {collection}")

print("All unwanted collections have been dropped.")


# Handles to the collections that the rest of this cell writes to or indexes.
db = client["crocodile_db"]
input_collection = db["input_data"]
table_trace_collection = db["table_trace"]
dataset_trace_collection = db["dataset_trace"]
process_queue = db["process_queue"]

# Identifiers under which the table is registered in every document below.
dataset_name = "test"
table_name = "imdb_top_1000_speed_test"


def ensure_indexes():
    """Create the indexes on the onboarding collections.

    Unique compound indexes are created per row (input data), per table
    (table trace and process queue) and per dataset (dataset trace). A
    non-unique index on ``status`` is added to the process queue so items
    can be retrieved quickly by status.
    """
    dataset_key = [("dataset_name", ASCENDING)]
    table_key = dataset_key + [("table_name", ASCENDING)]
    row_key = table_key + [("row_id", ASCENDING)]

    input_collection.create_index(row_key, unique=True)
    table_trace_collection.create_index(table_key, unique=True)
    dataset_trace_collection.create_index(dataset_key, unique=True)
    process_queue.create_index(table_key, unique=True)
    process_queue.create_index([("status", ASCENDING)])


ensure_indexes()

# Column roles, keyed by the column's position in the table (as a string).
#
# Named-entity (NE) columns:
#   0 Series_Title, 7 Director, 8 Star1
ne_cols = {"0": "OTHER", "7": "PERSON", "8": "PERSON"}

# Literal (LIT) columns:
#   1 Released_Year, 2 Runtime (min), 3 Genre, 4 IMDB_Rating,
#   5 Overview, 6 Meta_score, 9 No_of_Votes, 10 Gross
lit_cols = {
    "1": "NUMBER", "2": "NUMBER", "3": "STRING", "4": "NUMBER",
    "5": "STRING", "6": "NUMBER", "9": "NUMBER", "10": "NUMBER",
}

# Register the table once in table_trace: its header (column names), its row
# count and the initial progress counters.
table_trace_collection.insert_one(
    dict(
        dataset_name=dataset_name,
        table_name=table_name,
        header=list(df.columns),
        total_rows=len(df),
        processed_rows=0,
        status="PENDING",
    )
)

# Onboard data (values only, no headers).
# All row documents are built first and written with a single insert_many
# call instead of one insert_one round-trip per row. The default ordered
# insert still stops at the first failing document (e.g. a duplicate
# (dataset_name, table_name, row_id) key), like the per-row loop did.
classified_columns = {"NE": ne_cols, "LIT": lit_cols}
# Every column, referenced by its positional index as a string, is context.
context_columns = [str(i) for i in range(len(df.columns))]

documents = [
    {
        "dataset_name": dataset_name,
        "table_name": table_name,
        "row_id": index,
        "data": row.tolist(),  # Row values as a list, not a header-keyed dict
        "classified_columns": classified_columns,
        "context_columns": context_columns,
        "correct_qids": {},  # Empty as GT is not available
        "status": "TODO",
    }
    for index, row in df.iterrows()
]

# insert_many rejects an empty list, so skip the write for an empty table.
if documents:
    input_collection.insert_many(documents)

# Initialize the dataset-level trace. With upsert=True and only $setOnInsert,
# these fields are written when no document for `dataset_name` exists yet; an
# already existing document is left unchanged.
dataset_trace_collection.update_one(
    {"dataset_name": dataset_name},
    {
        "$setOnInsert": {
            "total_tables": 1,  # Only the single table onboarded above
            "processed_tables": 0,
            "total_rows": len(df),  # Row count of that table. NOTE(review): whether this is updated later is not visible here
            "processed_rows": 0,
            "status": "PENDING"
        }
    },
    upsert=True
)

# Report which dataset/table was just onboarded.
print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

# Configure the Crocodile entity linker for this run. The retrieval endpoint,
# the BoW endpoint and the access token are read straight from the
# environment, so a missing variable raises KeyError before the constructor
# is called.
crocodile_settings = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "table_trace_collection_name": "table_trace",
    "dataset_trace_collection_name": "dataset_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
    "max_workers": 8,
    "candidate_retrieval_limit": 10,
    "model_path": "./training/trained_models/neural_ranker.h5",
}
crocodile_instance = Crocodile(**crocodile_settings)

# Run the entity linking process, then report completion.
crocodile_instance.run()

print("Entity linking process completed.")

Dropped collection: input_data
Dropped collection: training_data
Dropped collection: dataset_trace
Dropped collection: timing_trace
Dropped collection: table_trace
Dropped collection: process_queue
Dropped collection: web_requests
All unwanted collections have been dropped.
Data onboarded successfully for dataset 'test' and table 'imdb_top_1000_speed_test'.
Found 1000 tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
All tasks have been processed.
Entity linking process completed.
