In [None]:
import pandas as pd
from pymongo import MongoClient
from crocodile import Crocodile
import os

# Sample DataFrame: three films used as a small smoke-test table
data = {
    'MovieTitle': ['Batman Begins', 'The Dark Knight', 'Inception'],
    'Year': [2005, 2008, 2010],
    'Genre': ['Action', 'Action', 'Sci-Fi'],
    'Director': ['Christopher Nolan', 'Christopher Nolan', 'Christopher Nolan']
}

df = pd.DataFrame(data)

# MongoDB connection
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection = db["input_data"]
# The Crocodile instance built at the end of this cell is configured with
# table_trace_collection_name="table_trace" and
# dataset_trace_collection_name="dataset_trace", so the trace documents are
# written to those collections rather than to "processing_trace".
trace_collection = db["table_trace"]
dataset_trace_collection = db["dataset_trace"]

# Dataset and table names for tracing
dataset_name = "movie_dataset"
table_name = "movies_table"

# Store the header separately (once per table) in the table trace collection
trace_collection.insert_one({
    "dataset_name": dataset_name,
    "table_name": table_name,
    "header": list(df.columns),  # Store the header (column names)
    "total_rows": len(df),
    "processed_rows": 0,
    "status": "PENDING"  # Initial status before processing
})

# Onboard data without headers, just the row values
for index, row in df.iterrows():
    document = {
        "dataset_name": dataset_name,
        "table_name": table_name,
        "row_id": index,
        "data": row.tolist(),  # Store the row as a list of values
        "classified_columns": {
            # Columns are referenced by their position in the header, as strings.
            "NE": {
                # NOTE(review): the IMDb cells further down tag the movie-title
                # column as "TITLE"; confirm that "PERSON" is intended here.
                "0": "PERSON"  # MovieTitle
            },
            "LIT": {  # Literal columns with their data types
                "1": "NUMBER",  # Year is a number
                "2": "STRING"  # Genre is a string
            }
        },
        "context_columns": ["0", "1", "2", "3"],  # Context columns (by index)
        "status": "TODO"
    }
    collection.insert_one(document)

# Initialize the dataset-level trace if it does not exist yet
# (same document shape as the other table_trace/dataset_trace cells in this notebook)
dataset_trace_collection.update_one(
    {"dataset_name": dataset_name},
    {
        "$setOnInsert": {
            "total_tables": 1,
            "processed_tables": 0,
            "total_rows": 0,
            "processed_rows": 0,
            "status": "PENDING"
        }
    },
    upsert=True
)

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

# Assemble the linker configuration. The endpoints and the token are read
# from environment variables, so a missing variable raises KeyError here.
crocodile_kwargs = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "table_trace_collection_name": "table_trace",
    "dataset_trace_collection_name": "dataset_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
}
crocodile_instance = Crocodile(**crocodile_kwargs)

# Run entity linking for the table onboarded above
crocodile_instance.run(dataset_name=dataset_name, table_name=table_name)

print("Entity linking process completed.")

In [None]:
# Load the CSV file into a DataFrame.
# This cell has no imports of its own: it relies on `pd` having been imported
# by an earlier cell.
file_path = './film_input_no_QIDs.csv'
df = pd.read_csv(file_path)
# Show the column labels; the onboarding cell for this file refers to its
# columns by these names.
df.columns

In [None]:
import pandas as pd
from pymongo import MongoClient

# Load the CSV file into a DataFrame
file_path = './film_input_no_QIDs.csv'
df = pd.read_csv(file_path)

# MongoDB connection
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection = db["input_data"]
trace_collection = db["processing_trace"]

# Dataset and table names for tracing
dataset_name = "imdb_dataset"
table_name = "film_input_no_QIDs10"

# Column roles, referenced by header name
ne_columns = ["title", "director", "domestic distributor"]  # columns to be linked
lit_columns = ["release year", "length in min", "worldwide gross"]  # literal columns
context_columns = ['title', 'director', 'release year', 'domestic distributor', 'length in min', 'worldwide gross']

# Build one document per row, then send them in a single bulk insert
# instead of one round trip per row.
documents = [
    {
        "dataset_name": dataset_name,
        "table_name": table_name,
        "row_id": index,
        "data": row.to_dict(),
        "classified_columns": {
            "NE": list(ne_columns),
            "LIT": list(lit_columns)
        },
        "context_columns": list(context_columns),
        "status": "TODO"
    }
    for index, row in df.iterrows()
]
# insert_many rejects an empty list, so skip the call for an empty CSV
if documents:
    collection.insert_many(documents)

# Initialize the trace collection
trace_collection.insert_one({
    "dataset_name": dataset_name,
    "table_name": table_name,
    "total_rows": len(df),
    "processed_rows": 0,
    "status": "PENDING"  # Initial status before processing
})

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

In [None]:
import pandas as pd
from pymongo import MongoClient

# Load the CSV file into a DataFrame
file_path = './imdb_top_1000.csv'
df = pd.read_csv(file_path)

# Only the first rows are onboarded in this trial run. The trace document
# below must count exactly these rows; recording len(df) would announce
# rows that are never inserted.
max_rows = 10
df_sample = df.head(max_rows)

# MongoDB connection
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection = db["input_data"]
trace_collection = db["processing_trace"]

# Dataset and table names for tracing
dataset_name = "imdb_dataset"
table_name = "top_1000_movies"

# Onboard data
for index, row in df_sample.iterrows():
    document = {
        "dataset_name": dataset_name,
        "table_name": table_name,
        "row_id": index,
        "data": row.to_dict(),
        "classified_columns": {
            "NE": ["Series_Title"],  # Assuming Series_Title is the column to be linked
            "LIT": ["Released_Year", "Genre"]  # Assuming these are literal columns
        },
        "context_columns": ["Series_Title", "Released_Year", "Genre", "Director"],  # Context columns
        "status": "TODO"
    }
    collection.insert_one(document)

# Initialize the trace collection with the number of rows actually onboarded
trace_collection.insert_one({
    "dataset_name": dataset_name,
    "table_name": table_name,
    "total_rows": len(df_sample),
    "processed_rows": 0,
    "status": "PENDING"  # Initial status before processing
})

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

In [None]:
from crocodile import Crocodile
import os

# Assemble the linker configuration. The endpoints and the token are read
# from environment variables, so a missing variable raises KeyError here.
crocodile_kwargs = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "collection_name": "input_data",
    "trace_collection_name": "processing_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
}
crocodile_instance = Crocodile(**crocodile_kwargs)

# Link whichever dataset/table the most recently executed onboarding cell
# left in `dataset_name` / `table_name`.
crocodile_instance.run(dataset_name=dataset_name, table_name=table_name)

print("Entity linking process completed.")

In [None]:
! pip install Levenshtein

In [None]:
from crocodile import Crocodile
import os
# Assemble the linker configuration. The endpoints and the token are read
# from environment variables, so a missing variable raises KeyError here.
crocodile_kwargs = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "collection_name": "input_data",
    "trace_collection_name": "processing_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
}
crocodile_instance = Crocodile(**crocodile_kwargs)

# Manual probe: fetch candidates for the mention "paris", passing
# "paris france" as the second argument. The result is kept in `candidates`
# for the inspection cells that follow.
candidates = crocodile_instance.fetch_candidates("paris", "paris france")

In [None]:
# Request bag-of-words data for two QIDs through the Crocodile instance.
# NOTE(review): a later cell calls get_bow_from_api(text, qids) with two
# arguments; confirm which signature the installed crocodile version expects.
crocodile_instance.get_bow_from_api(["Q30", "Q40"])

In [None]:
# Compare the tokens of the first fetched candidate's description with the
# tokens of a hand-written row string. `candidates` comes from the
# fetch_candidates cell; an empty list would raise IndexError below.
row_text = "paris france, Canadian"
description = candidates[0]["description"]
tokenize = crocodile_instance.tokenize_text
candidate_tokens = set(tokenize(description))
row_tokens = set(tokenize(row_text))
# The last expression is the cell output: the overlap score
crocodile_instance.calculate_token_overlap(candidate_tokens, row_tokens)

In [None]:
# Show both token sets computed in the previous cell side by side
candidate_tokens, row_tokens

In [None]:
# Inspect the full candidate list returned by fetch_candidates in an earlier cell
candidates

In [None]:
import requests
import base64
import gzip
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
import pickle

# Download NLTK resources if not already downloaded.
# Recent NLTK releases load the tokenizer data from 'punkt_tab', older ones
# from 'punkt'; requesting both keeps word_tokenize working on either.
# With quiet=True an unknown package id is reported as a failed download,
# not raised as an exception.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

# Global stopwords to avoid reinitializing repeatedly
stop_words = set(stopwords.words('english'))

# Function to get BoW vectors from the API
def get_bow_from_api(qids, timeout=30):
    """Fetch bag-of-words vectors for the given QIDs from the LamAPI endpoint.

    Args:
        qids: list of entity identifiers, sent as the "json" field of the
            request body.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        A dict mapping each key of the JSON response to its decoded value,
        an empty dict when `qids` is empty (no request is sent), or None
        when the request fails or the status code is not 200.
    """
    if not qids:
        return {}

    # NOTE(review): the demo token is embedded in the URL; other cells read
    # endpoints and tokens from environment variables.
    url = 'https://lamapi.hel.sintef.cloud/entity/bow?token=lamapi_demo_2023'
    try:
        response = requests.post(
            url,
            headers={'accept': 'application/json', 'Content-Type': 'application/json'},
            json={"json": qids},
            timeout=timeout,  # without a timeout a stalled server blocks the kernel forever
        )
    except requests.RequestException as exc:
        # Report transport errors the same way as a bad status code
        print(f"Error fetching BoW: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching BoW: {response.status_code}")
        return None

    bow_data = response.json()

    # Each value is base64 text wrapping a gzip-compressed pickle
    decoded_vectors = {}
    for qid, encoded_data in bow_data.items():
        compressed_bytes = base64.b64decode(encoded_data)
        # SECURITY: pickle.loads can execute arbitrary code contained in the
        # payload; only use this against an endpoint you trust.
        decoded_vectors[qid] = pickle.loads(gzip.decompress(compressed_bytes))

    return decoded_vectors

# Function to tokenize text and remove stopwords
def tokenize_text(text):
    """Lower-case `text`, split it with word_tokenize, and return the tokens
    that are purely alphabetic and not in the module-level `stop_words` set."""
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(token)
    return kept

# Function to build a row vector (BoW)
def build_row_vector(row_text, shared_vocab):
    """Return a numpy array holding, for each word of `shared_vocab` in order,
    how many times it occurs among the tokens of `row_text`."""
    token_counts = Counter(tokenize_text(row_text))
    # A Counter yields 0 for words it has not seen
    return np.array([token_counts[term] for term in shared_vocab])

# Function to compute cosine similarity between row and candidate vectors
def compute_similarity(row_bow_vector, candidate_vectors, shared_vocab):
    """Score every candidate against the row.

    Each candidate mapping in `candidate_vectors` (qid -> word counts) is
    laid out along `shared_vocab`, with 0 for missing words, and compared to
    `row_bow_vector` with cosine_similarity. Returns a dict qid -> score.
    """
    def as_vector(bow):
        # Project one candidate's word counts onto the shared vocabulary order
        return np.array([bow.get(term, 0) for term in shared_vocab])

    return {
        qid: cosine_similarity([row_bow_vector], [as_vector(bow)])[0][0]
        for qid, bow in candidate_vectors.items()
    }

# Test case: simulate a row of data (e.g., a table row).
# (A Paris/France sample row used to be assigned here and was immediately
# overwritten; only the row that is actually used is kept.)
row = {
    'Series_Title': 'Pulp Fiction',
    'Released_Year': 1994,
    'Runtime (min)': 154,
    'Genre': 'Crime, Drama',
    'IMDB_Rating': 8.9,
    'Overview': 'The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.',
    'Meta_score': 94.0,
    'Director': 'Quentin Tarantino',
    'Star1': 'John Travolta',
    'No_of_Votes': 1826188,
    'Gross': '107,928,762'
}

# Combine the row data into a single text string for BoW processing.
# Every value is stringified and space-joined in column order; the
# 'Overview' field is left out.
row_text = ' '.join(str(value) for column, value in row.items() if column != 'Overview')

# Step 1: Retrieve BoW vectors from API for some QIDs
qids = ["Q30", "Q166262", "Q90", "Q104123", "Q45", "Q100", "Q5"]  # Example QIDs
candidate_vectors = get_bow_from_api(qids)

if candidate_vectors is None:
    print("No candidate vectors retrieved from the API.")
else:
    # Step 2: Build the shared vocabulary from the row tokens and every
    # candidate's words.
    row_tokens = tokenize_text(row_text)
    vocabulary = set(row_tokens)
    for vector in candidate_vectors.values():
        vocabulary.update(vector.keys())  # Collect vocabulary from candidate BoWs
    # Set iteration order for strings changes between kernel sessions, so
    # sort to make `shared_vocab` (and the vectors built from it) reproducible.
    # Assumes the candidate keys are strings like the row tokens; TODO confirm.
    shared_vocab = sorted(vocabulary)

    # Step 3: Build the row vector (BoW)
    row_bow_vector = build_row_vector(row_text, shared_vocab)

    # Step 4: Compute similarity between row and candidate vectors
    similarity_scores = compute_similarity(row_bow_vector, candidate_vectors, shared_vocab)

    # Step 5: Output the similarity scores
    print("\nSimilarity Scores between row and candidates:")
    for qid, score in similarity_scores.items():
        print(f"QID: {qid}, Similarity: {score:.4f}")

In [None]:
# Inspect the vocabulary built by the BoW similarity cell above
# (only defined when that cell received candidate vectors)
shared_vocab

In [None]:
# Inspect the most recently computed row tokens
row_tokens

In [None]:
# NOTE(review): same display as two cells earlier; one of them can probably go
shared_vocab

In [None]:
import pandas as pd
from pymongo import MongoClient

# Load the CSV file into a DataFrame
file_path = './imdb_top_1000.csv'
df = pd.read_csv(file_path)

# MongoDB connection
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection = db["input_data"]
trace_collection = db["processing_trace"]

# Read the documents back in row order (1 = ascending) so `outcome` has a
# stable order instead of whatever order the server happens to return.
# NOTE(review): the query is not restricted to one dataset/table; rows are
# only told apart by whether they carry a result for the "title" column.
results = collection.find({}).sort("row_id", 1)
outcome = []
for result in results:
    # Rows that were not linked yet, or have no candidates for "title",
    # are skipped instead of raising KeyError/IndexError.
    title_candidates = (result.get("el_results") or {}).get("title") or []
    if not title_candidates:
        continue
    outcome.append((result["row_id"], title_candidates[0]["id"]))

In [None]:
# Load the file that carries the expected QIDs; the accuracy cell below reads
# its "Title_QID" column. Note that this rebinds `df`.
df = pd.read_csv("film_with_QIDs.csv") 
# Display the frame as the cell output
df

In [None]:
# Column 0 holds the row_id, column 1 the predicted QID of each linked row
df2 = pd.DataFrame(outcome, columns=[0, 1])

# Compare each prediction with the gold QID of the row it belongs to.
# Matching on row_id (rather than on position) stays correct when some rows
# are missing from `outcome` or arrive in a different order.
predicted_qids = df2.set_index(0)[1]
expected_qids = df["Title_QID"].reindex(predicted_qids.index)
matches = predicted_qids.to_numpy() == expected_qids.to_numpy()

# Share of linked rows whose predicted QID equals the gold QID;
# NaN when nothing has been linked yet.
accuracy = float(matches.mean()) if len(df2) else float("nan")
accuracy

In [None]:
import pandas as pd
from pymongo import MongoClient

# Load the CSV file into a DataFrame
file_path = './0DO2KMKV.csv'
df = pd.read_csv(file_path)

# MongoDB connection
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection = db["input_data"]
trace_collection = db["processing_trace"]

# Dataset and table names for tracing
dataset_name = "test"
table_name = "0DO2KMKV"

# Every column of the table is used as context
context_columns = list(df.columns)

# Build one document per row, then send them in a single bulk insert
# instead of one round trip per row.
documents = [
    {
        "dataset_name": dataset_name,
        "table_name": table_name,
        "row_id": index,
        "data": row.to_dict(),
        "classified_columns": {
            "NE": ["col0"],  # column to be linked
            "LIT": ["col1"]  # literal column
        },
        "context_columns": list(context_columns),
        "status": "TODO"
    }
    for index, row in df.iterrows()
]
# insert_many rejects an empty list, so skip the call for an empty CSV
if documents:
    collection.insert_many(documents)

# Initialize the trace collection
trace_collection.insert_one({
    "dataset_name": dataset_name,
    "table_name": table_name,
    "total_rows": len(df),
    "processed_rows": 0,
    "status": "PENDING"  # Initial status before processing
})

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

In [None]:
# Display the DataFrame currently bound to `df`
# (the table loaded by the preceding onboarding cell when run in order)
df

In [None]:
from crocodile import Crocodile
import os

# Assemble the linker configuration. The endpoints and the token are read
# from environment variables, so a missing variable raises KeyError here.
crocodile_kwargs = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "collection_name": "input_data",
    "trace_collection_name": "processing_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
}
crocodile_instance = Crocodile(**crocodile_kwargs)

# Link the dataset/table named by the current `dataset_name` / `table_name`
crocodile_instance.run(dataset_name=dataset_name, table_name=table_name)

print("Entity linking process completed.")

In [None]:
import pandas as pd
from pymongo import MongoClient
from crocodile import Crocodile
import os
import json

# Load the CSV file into a DataFrame
file_path = './tables/VGUZX5R3.csv'
df = pd.read_csv(file_path)

# MongoDB connection
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
input_collection = db["input_data"]
table_trace_collection = db["table_trace"]
dataset_trace_collection = db["dataset_trace"]

dataset_name = "test"
table_name = "VGUZX5R3"

# Load the correct QIDs for the whole table
with open('./tables/correct_qids_VGUZX5R3.json', 'r') as file:
    correct_qids = json.load(file)

# Store the header only once, in the table trace collection
table_trace_collection.insert_one({
    "dataset_name": dataset_name,
    "table_name": table_name,
    "header": list(df.columns),  # Store the header (column names)
    "total_rows": len(df),
    "processed_rows": 0,
    "status": "PENDING"
})

# Onboard data (values only, no headers)
for index, row in df.iterrows():
    # Keep only the QIDs that belong to this row instead of copying the whole
    # table's mapping into every document. The "<row>-" key prefix is the
    # convention the later cell for this same JSON file filters on.
    row_prefix = f"{index}-"
    correct_qids_for_row = {
        key: value for key, value in correct_qids.items() if key.startswith(row_prefix)
    }

    document = {
        "dataset_name": dataset_name,
        "table_name": table_name,
        "row_id": index,
        "data": row.tolist(),  # Store row values as a list instead of a dictionary with headers
        "classified_columns": {
            # Columns are referenced by position (as strings)
            "NE": {
                "0": "LOCATION",
                "1": "LOCATION",
                "3": "LOCATION"
            },
            "LIT": {
                "2": "NUMBER",
                "4": "NUMBER",
                "5": "NUMBER"
            }
        },
        "context_columns": [str(i) for i in range(len(df.columns))],  # Context columns (by index)
        "correct_qids": correct_qids_for_row,  # Only the QIDs of the current row
        "status": "TODO"
    }
    input_collection.insert_one(document)

# Initialize dataset-level trace (only written if the dataset has none yet)
dataset_trace_collection.update_one(
    {"dataset_name": dataset_name},
    {
        "$setOnInsert": {
            "total_tables": 1,  # Total number of tables
            "processed_tables": 0,
            "total_rows": 0,  # This will be updated after processing
            "processed_rows": 0,
            "status": "PENDING"
        }
    },
    upsert=True
)

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

# Assemble the linker configuration. The endpoints and the token are read
# from environment variables, so a missing variable raises KeyError here.
crocodile_kwargs = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "table_trace_collection_name": "table_trace",
    "dataset_trace_collection_name": "dataset_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
}
crocodile_instance = Crocodile(**crocodile_kwargs)

# Run entity linking for the table onboarded above
crocodile_instance.run(dataset_name=dataset_name, table_name=table_name)

print("Entity linking process completed.")

In [None]:
import pandas as pd
from pymongo import MongoClient, ASCENDING
from crocodile import Crocodile
import os

# Load the CSV file into a DataFrame
file_path = './tables/imdb_top_1000.csv'
df = pd.read_csv(file_path)

# MongoDB connection: handles for the four collections this cell works with
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
input_collection = db["input_data"]  # one document per table row is inserted here further down
table_trace_collection = db["table_trace"]  # receives the per-table header/progress document
dataset_trace_collection = db["dataset_trace"]  # receives the per-dataset progress document
process_queue = db["process_queue"]  # only indexed in this cell, never written to

# Names under which the table is onboarded and later processed
dataset_name = "test"
table_name = "imdb_top_1000_speed_test1"


# Ensure indexes for uniqueness and performance
def ensure_indexes():
    """Create the indexes on the four module-level collection handles:
    unique keys per row, per table and per dataset, a unique per-table key on
    the process queue, and a non-unique index on the queue's status."""
    dataset_key = [("dataset_name", ASCENDING)]
    table_key = dataset_key + [("table_name", ASCENDING)]
    row_key = table_key + [("row_id", ASCENDING)]

    input_collection.create_index(row_key, unique=True)
    table_trace_collection.create_index(list(table_key), unique=True)
    dataset_trace_collection.create_index(dataset_key, unique=True)
    process_queue.create_index(list(table_key), unique=True)
    # Ensure fast retrieval of queue items by status
    process_queue.create_index([("status", ASCENDING)])

ensure_indexes()

# Column roles, keyed by the column's position (as a string) in the CSV header.
# Named-entity columns: Series_Title, Director, Star1
ne_cols = {"0": "TITLE", "7": "PERSON", "8": "PERSON"}

# Literal columns: Released_Year, Runtime (min), Genre, IMDB_Rating,
# Overview, Meta_score, No_of_Votes, Gross
lit_cols = {
    "1": "NUMBER", "2": "NUMBER", "3": "STRING", "4": "NUMBER",
    "5": "STRING", "6": "NUMBER", "9": "NUMBER", "10": "NUMBER",
}

# The header is stored once per table; the row documents carry values only
header = list(df.columns)
table_trace_collection.insert_one(dict(
    dataset_name=dataset_name,
    table_name=table_name,
    header=header,
    total_rows=len(df),
    processed_rows=0,
    status="PENDING",
))

# Parts of the row document that are the same for every row
classified_columns = {"NE": ne_cols, "LIT": lit_cols}
context_columns = [str(i) for i in range(len(df.columns))]  # all columns, by index

# Onboard the rows one document at a time
for index, row in df.iterrows():
    document = dict(
        dataset_name=dataset_name,
        table_name=table_name,
        row_id=index,
        data=row.tolist(),  # values only, in header order
        classified_columns=classified_columns,
        context_columns=context_columns,
        correct_qids={},  # no ground truth available
        status="TODO",
    )
    input_collection.insert_one(document)

# Create the dataset-level trace only if this dataset has none yet
dataset_filter = {"dataset_name": dataset_name}
initial_dataset_trace = {
    "total_tables": 1,
    "processed_tables": 0,
    "total_rows": 0,  # updated after processing
    "processed_rows": 0,
    "status": "PENDING",
}
dataset_trace_collection.update_one(
    dataset_filter, {"$setOnInsert": initial_dataset_trace}, upsert=True
)

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

# Assemble the linker configuration. The endpoints and the token are read
# from environment variables, so a missing variable raises KeyError here.
crocodile_kwargs = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "table_trace_collection_name": "table_trace",
    "dataset_trace_collection_name": "dataset_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
}
crocodile_instance = Crocodile(**crocodile_kwargs)

# Run entity linking for the table onboarded above
crocodile_instance.run(dataset_name=dataset_name, table_name=table_name)

print("Entity linking process completed.")

In [None]:
import pandas as pd
from pymongo import MongoClient
from crocodile import Crocodile
import os
import json
#from tensorflow.keras.models import load_model


# Load the CSV file into a DataFrame
file_path = './tables/VGUZX5R3.csv'
df = pd.read_csv(file_path)

# MongoDB connection. The whole crocodile_db database is dropped first, so
# everything stored in it is lost before the table is onboarded again.
client = MongoClient("mongodb://mongodb:27017/")
client.drop_database("crocodile_db")
print("The crocodile_db database has been dropped.")
db = client["crocodile_db"]
input_collection = db["input_data"]
table_trace_collection = db["table_trace"]
dataset_trace_collection = db["dataset_trace"]

dataset_name = "test"
table_name = "VGUZX5R3"

# Correct QIDs for the whole table; the loop below selects per row by key prefix
with open('./tables/correct_qids_VGUZX5R3.json', 'r') as file:
    correct_qids_data = json.load(file)

# The header is stored once per table; the row documents carry values only
header = list(df.columns)
table_trace_collection.insert_one(dict(
    dataset_name=dataset_name,
    table_name=table_name,
    header=header,
    total_rows=len(df),
    processed_rows=0,
    status="PENDING",
))

# Parts of the row document that are the same for every row.
# Columns are referenced by position (as strings).
classified_columns = {
    "NE": {"0": "LOCATION", "1": "LOCATION", "3": "LOCATION"},
    "LIT": {"2": "NUMBER", "4": "NUMBER", "5": "NUMBER"},
}
context_columns = [str(i) for i in range(len(df.columns))]  # all columns, by index

# Onboard the rows one document at a time
for index, row in df.iterrows():
    # Keep only the entries whose key starts with "<row index>-"
    row_prefix = f"{index}-"
    correct_qids_for_row = {
        key: value
        for key, value in correct_qids_data.items()
        if key.startswith(row_prefix)
    }

    document = dict(
        dataset_name=dataset_name,
        table_name=table_name,
        row_id=index,
        data=row.tolist(),  # values only, in header order
        classified_columns=classified_columns,
        context_columns=context_columns,
        correct_qids=correct_qids_for_row,
        status="TODO",
    )
    input_collection.insert_one(document)

# Create the dataset-level trace only if this dataset has none yet
dataset_filter = {"dataset_name": dataset_name}
initial_dataset_trace = {
    "total_tables": 1,
    "processed_tables": 0,
    "total_rows": 0,  # updated after processing
    "processed_rows": 0,
    "status": "PENDING",
}
dataset_trace_collection.update_one(
    dataset_filter, {"$setOnInsert": initial_dataset_trace}, upsert=True
)

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

# Path handed to Crocodile as `model_path`; the model is not loaded in this cell
model_path = "./training/trained_models/neural_ranker.h5"

# Assemble the linker configuration. The endpoints and the token are read
# from environment variables, so a missing variable raises KeyError here.
crocodile_kwargs = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "table_trace_collection_name": "table_trace",
    "dataset_trace_collection_name": "dataset_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
    "candidate_retrieval_limit": 10,
    "max_workers": 30,
    "model_path": model_path,
}
crocodile_instance = Crocodile(**crocodile_kwargs)

# Run entity linking for the table onboarded above
crocodile_instance.run(dataset_name=dataset_name, table_name=table_name)

print("Entity linking process completed.")

In [None]:
import pandas as pd
from pymongo import MongoClient, ASCENDING
from crocodile import Crocodile
import os

# Load the CSV file into a DataFrame
file_path = './tables/imdb_top_1000.csv'
df = pd.read_csv(file_path)

# MongoDB connection
client = MongoClient("mongodb://mongodb:27017/")
# Drop the entire crocodile_db database: every collection in it is removed,
# not just the ones this cell recreates.
# NOTE(review): a later cell keeps "bow_cache" and "candidate_cache" and drops
# only the other collections; confirm whether a full drop is wanted here.
client.drop_database("crocodile_db")
print("The crocodile_db database has been dropped.")

# Handles for the four collections this cell works with
db = client["crocodile_db"]
input_collection = db["input_data"]
table_trace_collection = db["table_trace"]
dataset_trace_collection = db["dataset_trace"]
process_queue = db["process_queue"]

# Names under which the table is onboarded
dataset_name = "test"
table_name = "imdb_top_1000_speed_test"


# Ensure indexes for uniqueness and performance
def ensure_indexes():
    """Create the indexes on the four module-level collection handles.

    The specs are applied in order: a unique key per row, per table and per
    dataset, a unique per-table key on the process queue, and a plain index
    on the queue's status for fast retrieval of items by status.
    """
    index_specs = (
        (input_collection, ["dataset_name", "table_name", "row_id"], {"unique": True}),
        (table_trace_collection, ["dataset_name", "table_name"], {"unique": True}),
        (dataset_trace_collection, ["dataset_name"], {"unique": True}),
        (process_queue, ["dataset_name", "table_name"], {"unique": True}),
        (process_queue, ["status"], {}),
    )
    for target, fields, options in index_specs:
        target.create_index([(field, ASCENDING) for field in fields], **options)

ensure_indexes()

# Column roles, keyed by the column's position (as a string) in the CSV header.
# Named-entity columns: Series_Title, Director, Star1
ne_cols = {"0": "TITLE", "7": "PERSON", "8": "PERSON"}

# Literal columns: Released_Year, Runtime (min), Genre, IMDB_Rating,
# Overview, Meta_score, No_of_Votes, Gross
lit_cols = {
    "1": "NUMBER", "2": "NUMBER", "3": "STRING", "4": "NUMBER",
    "5": "STRING", "6": "NUMBER", "9": "NUMBER", "10": "NUMBER",
}

# The header is stored once per table; the row documents carry values only
header = list(df.columns)
table_trace_collection.insert_one(dict(
    dataset_name=dataset_name,
    table_name=table_name,
    header=header,
    total_rows=len(df),
    processed_rows=0,
    status="PENDING",
))

# Parts of the row document that are the same for every row
classified_columns = {"NE": ne_cols, "LIT": lit_cols}
context_columns = [str(i) for i in range(len(df.columns))]  # all columns, by index

# Onboard the rows one document at a time
for index, row in df.iterrows():
    document = dict(
        dataset_name=dataset_name,
        table_name=table_name,
        row_id=index,
        data=row.tolist(),  # values only, in header order
        classified_columns=classified_columns,
        context_columns=context_columns,
        correct_qids={},  # no ground truth available
        status="TODO",
    )
    input_collection.insert_one(document)

# Create the dataset-level trace only if this dataset has none yet
dataset_filter = {"dataset_name": dataset_name}
initial_dataset_trace = {
    "total_tables": 1,
    "processed_tables": 0,
    "total_rows": 0,  # updated after processing
    "processed_rows": 0,
    "status": "PENDING",
}
dataset_trace_collection.update_one(
    dataset_filter, {"$setOnInsert": initial_dataset_trace}, upsert=True
)

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

# Assemble the linker configuration. The endpoints and the token are read
# from environment variables, so a missing variable raises KeyError here.
crocodile_kwargs = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "table_trace_collection_name": "table_trace",
    "dataset_trace_collection_name": "dataset_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
    "max_workers": 32,
    "candidate_retrieval_limit": 10,
    "model_path": "./training/trained_models/neural_ranker.h5",
}
crocodile_instance = Crocodile(**crocodile_kwargs)

# Run the entity linking process; unlike the earlier cells, no dataset or
# table name is passed here.
crocodile_instance.run()

print("Entity linking process completed.")

In [None]:
import pandas as pd
from pymongo import MongoClient, ASCENDING
from crocodile import Crocodile
import os

# Assemble the linker configuration. The endpoints and the token are read
# from environment variables, so a missing variable raises KeyError here.
crocodile_kwargs = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "table_trace_collection_name": "table_trace",
    "dataset_trace_collection_name": "dataset_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
    "max_workers": 50,
    "candidate_retrieval_limit": 10,
    "model_path": "./training/trained_models/neural_ranker.h5",
}
# Only the instance is created here; nothing is run in this cell
crocodile_instance = Crocodile(**crocodile_kwargs)

In [None]:
# Ask the Crocodile instance for BoW data, passing a row text plus three QIDs.
# NOTE(review): an earlier cell calls get_bow_from_api with only a QID list;
# confirm which signature the installed crocodile version expects.
crocodile_instance.get_bow_from_api("United States Washington D.C. 331000000 North America", ["Q30", "Q32", "Q35"])

In [None]:
import subprocess
import time

# Define the curl command: POST a row text and a list of QIDs to the BoW endpoint
curl_command = [
    "curl", "-X", "POST",
    "https://lamapi.hel.sintef.cloud/entity/bow?token=lamapi_demo_2023",
    "-H", "accept: application/json",
    "-H", "Content-Type: application/json",
    "-d", '''
    {
      "json": {
        "text": "galaxy supernova remnant emission nebula",
        "qids": ["Q3094537", "Q1249631", "Q58030121", "Q56322597", "Q112178715", "Q3094522", "Q3038279", "Q3757513", "Q375383", "Q111055666", "Q318"]
      }
    }
    '''
]

# Measure the time taken. perf_counter is a monotonic high-resolution clock
# meant for intervals; time.time() can jump when the system clock is adjusted.
# The measurement includes starting the curl process.
start_time = time.perf_counter()
result = subprocess.run(curl_command, capture_output=True, text=True)
end_time = time.perf_counter()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Print the output and time taken
print("Response:")
print(result.stdout)
if result.returncode != 0:
    # An empty response is otherwise indistinguishable from a failed request
    print(f"curl exited with code {result.returncode}:")
    print(result.stderr)
print("\nTime taken (seconds):", elapsed_time)

In [None]:
import subprocess
import time

# Define the curl command: a POST of a JSON body (text + candidate QIDs) to the
# entity BoW endpoint.
# NOTE(review): the API token is embedded in the URL below. It looks like a
# public demo token - confirm; otherwise read it from the environment, as the
# Crocodile cells do with os.environ["ENTITY_RETRIEVAL_TOKEN"].
curl_command = [
    "curl", "-X", "POST",
    "https://lamapi.hel.sintef.cloud/entity/bow?token=lamapi_demo_2023",
    "-H", "accept: application/json",
    "-H", "Content-Type: application/json",
    "-d", '''
    {
      "json": {
        "text": "open cluster star cluster galaxy",
        "qids": ["Q3094537", "Q1249631", "Q58030121", "Q56322597", "Q112178715", "Q3094522", "Q3038279", "Q3757513", "Q375383", "Q111055666", "Q318"]
      }
    }
    '''
]

# Measure the time taken with a monotonic, high-resolution clock. time.time()
# follows the wall clock and can jump if the system time is adjusted mid-run.
start_time = time.perf_counter()
result = subprocess.run(curl_command, capture_output=True, text=True)
end_time = time.perf_counter()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Print the output and time taken
print("Response:")
print(result.stdout)
# A failed request leaves stdout empty; surface curl's exit status and stderr
# so the timing is not mistaken for a successful round trip.
if result.returncode != 0:
    print(f"curl exited with status {result.returncode}:")
    print(result.stderr)
print("\nTime taken (seconds):", elapsed_time)

In [1]:
import pandas as pd
from pymongo import MongoClient, ASCENDING
from crocodile import Crocodile
import os

# Read the input table from CSV
file_path = './tables/imdb_top_1000.csv'
df = pd.read_csv(file_path)

# Connect to MongoDB and select the working database
client = MongoClient("mongodb://mongodb:27017/")
# (Disabled) wipe the whole database, caches included:
#client.drop_database("crocodile_db")
db = client["crocodile_db"]

# Reset the database: every collection is dropped except the two caches
collections_to_keep = ["bow_cache", "candidate_cache"]
all_collections = db.list_collection_names()

for collection in all_collections:
    if collection in collections_to_keep:
        continue  # cache collections survive the reset
    db[collection].drop()
    print(f"Dropped collection: {collection}")

print("All unwanted collections have been dropped.")


# Handles for the collections used by the onboarding steps in this cell
db = client["crocodile_db"]
(
    input_collection,
    table_trace_collection,
    dataset_trace_collection,
    process_queue,
) = (db[name] for name in ("input_data", "table_trace", "dataset_trace", "process_queue"))

# Identifiers under which this run's table is stored
dataset_name, table_name = "test", "imdb_top_1000_speed_test"


# Ensure indexes for uniqueness and performance
def ensure_indexes():
    """Create the indexes used by the onboarding and processing collections.

    Unique indexes:
      * input_data:    (dataset_name, table_name, row_id)
      * table_trace:   (dataset_name, table_name)
      * dataset_trace: (dataset_name)
      * process_queue: (dataset_name, table_name)
    Non-unique index:
      * process_queue: (status)

    Reads the module-level collection handles; returns nothing.
    """
    dataset_key = ("dataset_name", ASCENDING)
    table_key = ("table_name", ASCENDING)

    input_collection.create_index([dataset_key, table_key, ("row_id", ASCENDING)], unique=True)
    table_trace_collection.create_index([dataset_key, table_key], unique=True)
    dataset_trace_collection.create_index([dataset_key], unique=True)
    process_queue.create_index([dataset_key, table_key], unique=True)
    # Lookup by status only; not a uniqueness constraint
    process_queue.create_index([("status", ASCENDING)])

# Create the indexes now, before the table trace and row documents are inserted below.
ensure_indexes()

# Column classifications, keyed by column position (as a string).
# NE = named-entity columns, LIT = literal columns.
ne_cols = dict([
    ("0", "OTHER"),    # Series_Title
    ("7", "PERSON"),   # Director
    ("8", "PERSON"),   # Star1
])

lit_cols = dict([
    ("1", "NUMBER"),   # Released_Year
    ("2", "NUMBER"),   # Runtime (min)
    ("3", "STRING"),   # Genre
    ("4", "NUMBER"),   # IMDB_Rating
    ("5", "STRING"),   # Overview
    ("6", "NUMBER"),   # Meta_score
    ("9", "NUMBER"),   # No_of_Votes
    ("10", "NUMBER"),  # Gross
])

# Record the table once in table_trace: its header (column names), its row
# count, and the initial progress counters.
table_trace_collection.insert_one(dict(
    dataset_name=dataset_name,
    table_name=table_name,
    header=df.columns.tolist(),
    total_rows=len(df),
    processed_rows=0,
    status="PENDING",
))

# Onboard data (values only, no headers).
# The column classification and the context-column list are the same for every
# row, so they are built once rather than once per iteration.
classified_columns = {"NE": ne_cols, "LIT": lit_cols}
context_columns = [str(i) for i in range(len(df.columns))]  # Context columns (by index)

documents = []
for index, row in df.iterrows():
    document = {
        "dataset_name": dataset_name,
        "table_name": table_name,
        "row_id": index,
        "data": row.tolist(),  # Row values as a list instead of a dictionary with headers
        "classified_columns": classified_columns,
        "context_columns": context_columns,
        "correct_qids": {},  # Empty as GT is not available
        "status": "TODO"
    }
    documents.append(document)

# One bulk insert instead of one round trip per row. insert_many rejects an
# empty list, hence the guard. The insert is ordered (the default), so it stops
# at the first failing document, as the per-row loop did; a duplicate key now
# surfaces as BulkWriteError.
if documents:
    input_collection.insert_many(documents)

# Initialize dataset-level trace (if not done earlier): the values below are
# written only when the upsert creates the document ($setOnInsert).
dataset_trace_filter = {"dataset_name": dataset_name}
dataset_trace_defaults = {
    "total_tables": 1,        # Total number of tables
    "processed_tables": 0,
    "total_rows": len(df),    # original note: updated after processing (not visible in this cell)
    "processed_rows": 0,
    "status": "PENDING",
}
dataset_trace_collection.update_one(
    dataset_trace_filter,
    {"$setOnInsert": dataset_trace_defaults},
    upsert=True,
)

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

# Gather the Crocodile settings in one mapping, then build the instance from it.
# The endpoints and the token are read from environment variables; a missing
# variable raises KeyError here.
crocodile_settings = {
    "mongo_uri": "mongodb://mongodb:27017/",
    "db_name": "crocodile_db",
    "table_trace_collection_name": "table_trace",
    "dataset_trace_collection_name": "dataset_trace",
    "max_candidates": 3,
    "entity_retrieval_endpoint": os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    "entity_bow_endpoint": os.environ["ENTITY_BOW_ENDPOINT"],
    "entity_retrieval_token": os.environ["ENTITY_RETRIEVAL_TOKEN"],
    "max_workers": 8,
    "candidate_retrieval_limit": 10,
    "model_path": "./training/trained_models/neural_ranker.h5",
}
crocodile_instance = Crocodile(**crocodile_settings)

# Run the entity linking process
crocodile_instance.run()

print("Entity linking process completed.")

2024-12-20 00:11:38.440503: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-20 00:11:38.440803: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-20 00:11:38.443036: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-20 00:11:38.468855: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Dropped collection: training_data
Dropped collection: error_logs
Dropped collection: input_data
Dropped collection: timing_trace
Dropped collection: process_queue
Dropped collection: table_trace
Dropped collection: dataset_trace
All unwanted collections have been dropped.
Data onboarded successfully for dataset 'test' and table 'imdb_top_1000_speed_test'.
Found 1000 tasks to process.


2024-12-20 00:11:41.419915: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-20 00:11:41.420444: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-20 00:11:41.423749: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-20 00:11:41.457567: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-20 00:11:41.762423: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. 

No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
No more tasks to process.
Total unprocessed documents: 1000
Predicting scores for 29649 candidates...
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Scores predicted.
ML ranking progress: 100.00% completed
ML ranking completed.
All tasks have been processed.
Entity linking process completed.


In [None]:
import pandas as pd
from pymongo import MongoClient, ASCENDING
from crocodile import Crocodile
import os

# Connect to MongoDB and select the working database
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]

# Handles for the collections written by this cell
input_collection, table_trace_collection, dataset_trace_collection = (
    db[name] for name in ("input_data", "table_trace", "dataset_trace")
)

# Reset the database: every collection is dropped except the two caches
collections_to_keep = ["bow_cache", "candidate_cache"]
all_collections = db.list_collection_names()

for collection in all_collections:
    if collection in collections_to_keep:
        continue  # cache collections survive the reset
    db[collection].drop()
    print(f"Dropped collection: {collection}")

print("All unwanted collections have been dropped.")

# Two simulated single-row tables with different column layouts
table1_data = pd.DataFrame(dict(
    Series_Title=["Inception"],
    Released_Year=[2010],
    Director=["Christopher Nolan"],
    Genre=["Sci-Fi"],
    IMDB_Rating=[8.8],
    Star1=["Leonardo DiCaprio"],
))

# NOTE(review): "Leonardo diCaprio" is cased differently from table1 -
# presumably deliberate, to exercise inexact matching; confirm.
table2_data = pd.DataFrame(dict(
    Movie_Title=["Titanic"],
    Runtime=[195],
    Star1=["Leonardo diCaprio"],
    Director=["James Cameron"],
    Votes=[1000000],
    Gross=["2.2B"],
))

# Correct QIDs for evaluation, keyed "<row_id>-<column index>".
# An entity has exactly one QID, so the same person must map to the same QID in
# every table, however the cell text is spelled.
qids_table1 = {
    "0-2": "Q25191",  # Director: Christopher Nolan
    "0-5": "Q38111"   # Star1: Leonardo DiCaprio (Inception)
}

qids_table2 = {
    "0-3": "Q42574",  # Director: James Cameron
    # Was "Q6606737", which disagreed with table1's QID for the same actor.
    "0-2": "Q38111"   # Star1: Leonardo DiCaprio (Titanic)
}

# Column classification per table, keyed by column position (as a string).
# NE = named-entity columns, LIT = literal columns.
ne_cols_table1 = {
    str(position): entity_type
    for position, entity_type in (
        (0, "OTHER"),    # Series_Title
        (2, "PERSON"),   # Director
        (5, "PERSON"),   # Star1
    )
}

lit_cols_table1 = {
    str(position): literal_type
    for position, literal_type in (
        (1, "NUMBER"),   # Released_Year
        (3, "STRING"),   # Genre
        (4, "NUMBER"),   # IMDB_Rating
    )
}

ne_cols_table2 = {
    str(position): entity_type
    for position, entity_type in (
        (0, "OTHER"),    # Movie_Title
        (2, "PERSON"),   # Star1
        (3, "PERSON"),   # Director
    )
}

lit_cols_table2 = {
    str(position): literal_type
    for position, literal_type in (
        (1, "NUMBER"),   # Runtime
        (4, "NUMBER"),   # Votes
        (5, "STRING"),   # Gross
    )
}

# Tables to onboard: table name -> (rows, ground-truth QIDs, NE columns, LIT columns)
tables = dict(
    table1=(table1_data, qids_table1, ne_cols_table1, lit_cols_table1),
    table2=(table2_data, qids_table2, ne_cols_table2, lit_cols_table2),
)

# Ensure indexes
def ensure_indexes():
    """Create the unique indexes for the three onboarding collections.

      * input_data:    (dataset_name, table_name, row_id)
      * table_trace:   (dataset_name, table_name)
      * dataset_trace: (dataset_name)

    Reads the module-level collection handles; returns nothing.
    """
    dataset_key = ("dataset_name", ASCENDING)
    table_key = ("table_name", ASCENDING)

    input_collection.create_index([dataset_key, table_key, ("row_id", ASCENDING)], unique=True)
    table_trace_collection.create_index([dataset_key, table_key], unique=True)
    dataset_trace_collection.create_index([dataset_key], unique=True)

# Create the indexes now, before any rows or trace documents are inserted below.
ensure_indexes()

# Onboard every table: one input_data document per row, then one table_trace
# document per table. total_rows accumulates the row count across tables.
# NOTE(review): unlike the single-table onboarding cell, these row documents
# carry no "context_columns" field - confirm Crocodile does not require it.
dataset_name = "test_dataset"
total_rows = 0

for table_name, table_spec in tables.items():
    df, qids, ne_cols, lit_cols = table_spec
    total_rows = total_rows + len(df)

    # Same classification payload for every row of this table and for its trace
    classified = {"NE": ne_cols, "LIT": lit_cols}

    # Insert rows into the input_data collection
    for idx, row in df.iterrows():
        # Ground-truth keys are "<row_id>-<column>"; keep only this row's entries
        row_prefix = f"{idx}-"
        row_qids = {}
        for key, value in qids.items():
            if key.startswith(row_prefix):
                row_qids[key] = value

        input_collection.insert_one({
            "dataset_name": dataset_name,
            "table_name": table_name,
            "row_id": idx,
            "data": row.tolist(),
            "correct_qids": row_qids,
            "classified_columns": classified,
            "status": "TODO"
        })

    # Insert table header into table_trace collection
    table_trace_collection.insert_one({
        "dataset_name": dataset_name,
        "table_name": table_name,
        "header": df.columns.tolist(),
        "classified_columns": classified,
        "total_rows": len(df),
        "processed_rows": 0,
        "status": "PENDING"
    })

# Create the dataset-level trace if it does not exist yet: the values below are
# written only when the upsert creates the document ($setOnInsert).
dataset_trace_filter = {"dataset_name": dataset_name}
dataset_trace_defaults = {
    "total_tables": len(tables),
    "processed_tables": 0,
    "total_rows": total_rows,
    "processed_rows": 0,
    "status": "PENDING",
}
dataset_trace_collection.update_one(
    dataset_trace_filter,
    {"$setOnInsert": dataset_trace_defaults},
    upsert=True,
)

print(f"Dataset '{dataset_name}' onboarded successfully with {len(tables)} tables and {total_rows} total rows.")

# Read the service settings from the environment first; a missing variable
# raises KeyError before anything else is constructed.
retrieval_endpoint = os.environ["ENTITY_RETRIEVAL_ENDPOINT"]
bow_endpoint = os.environ["ENTITY_BOW_ENDPOINT"]
retrieval_token = os.environ["ENTITY_RETRIEVAL_TOKEN"]

# Create an instance of the Crocodile class
crocodile_instance = Crocodile(
    mongo_uri="mongodb://mongodb:27017/",
    db_name="crocodile_db",
    table_trace_collection_name="table_trace",
    dataset_trace_collection_name="dataset_trace",
    max_candidates=3,
    entity_retrieval_endpoint=retrieval_endpoint,
    entity_bow_endpoint=bow_endpoint,
    entity_retrieval_token=retrieval_token,
    max_workers=8,
    candidate_retrieval_limit=10,
    model_path="./training/trained_models/neural_ranker.h5",
)

# Run the entity linking process
crocodile_instance.run()

print("Entity linking process completed.")