In [8]:
import pandas as pd
from pymongo import MongoClient

# Small in-memory movie table used to exercise the onboarding flow
data = dict(
    MovieTitle=['Batman Begins', 'The Dark Knight', 'Inception'],
    Year=[2005, 2008, 2010],
    Genre=['Action', 'Action', 'Sci-Fi'],
    Director=['Christopher Nolan'] * 3,
)
df = pd.DataFrame(data)

# Handles to the MongoDB collections written below
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection, trace_collection = db["input_data"], db["processing_trace"]

# Identifiers stored on every row document and on the trace record
dataset_name, table_name = "movie_dataset", "movies_table"

# Insert one document per DataFrame row, each flagged "TODO"
for index, row in df.iterrows():
    document = dict(
        dataset_name=dataset_name,
        table_name=table_name,
        row_id=index,
        data=row.to_dict(),
        classified_columns=dict(
            NE=["MovieTitle"],       # columns to be linked
            LIT=["Year", "Genre"],   # literal columns
        ),
        context_columns=["MovieTitle", "Year", "Genre", "Director"],
        status="TODO",
    )
    collection.insert_one(document)

# Insert the trace record for this table: row count, zero rows processed so far
trace_collection.insert_one(dict(
    dataset_name=dataset_name,
    table_name=table_name,
    total_rows=len(df),
    processed_rows=0,
    status="PENDING",  # initial status before processing
))

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

Data onboarded successfully for dataset 'movie_dataset' and table 'movies_table'.


In [5]:
# Load the CSV file into a DataFrame.
# NOTE: `pd` is not imported in this cell; it must already be in the kernel
# namespace from a previously executed cell.
file_path = './film_input_no_QIDs.csv'
df = pd.read_csv(file_path)
# Show the column labels of the loaded table
df.columns

Index(['title', 'director', 'release year', 'domestic distributor',
       'length in min', 'worldwide gross'],
      dtype='object')

In [2]:
import pandas as pd
from pymongo import MongoClient

# Read the film table that will be onboarded
file_path = './film_input_no_QIDs.csv'
df = pd.read_csv(file_path)

# Handles to the MongoDB collections written below
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection, trace_collection = db["input_data"], db["processing_trace"]

# Identifiers stored on every row document and on the trace record
dataset_name, table_name = "imdb_dataset", "film_input_no_QIDs10"

# Insert one document per CSV row (all rows), each flagged "TODO"
for index, row in df.iterrows():
    document = dict(
        dataset_name=dataset_name,
        table_name=table_name,
        row_id=index,
        data=row.to_dict(),
        classified_columns=dict(
            # columns to be linked
            NE=["title", "director", "domestic distributor"],
            # columns treated as literals
            LIT=["release year", "length in min", "worldwide gross"],
        ),
        context_columns=['title', 'director', 'release year', 'domestic distributor', 'length in min', 'worldwide gross'],
        status="TODO",
    )
    collection.insert_one(document)

# Insert the trace record for this table: row count, zero rows processed so far
trace_collection.insert_one(dict(
    dataset_name=dataset_name,
    table_name=table_name,
    total_rows=len(df),
    processed_rows=0,
    status="PENDING",  # initial status before processing
))

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

Data onboarded successfully for dataset 'imdb_dataset' and table 'film_input_no_QIDs10'.


In [2]:
import pandas as pd
from pymongo import MongoClient

# Load the CSV file into a DataFrame
file_path = './imdb_top_1000.csv'
df = pd.read_csv(file_path)

# Only the first rows of the file are onboarded. Selecting them up front
# (instead of breaking out of the loop at index 9) lets the trace record below
# report the number of rows that were actually inserted.
max_rows = 10
rows_to_onboard = df.head(max_rows)

# MongoDB connection
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection = db["input_data"]
trace_collection = db["processing_trace"]

# Dataset and table names for tracing
dataset_name = "imdb_dataset"
table_name = "top_1000_movies"

# Onboard data: one document per selected row, each flagged "TODO"
for index, row in rows_to_onboard.iterrows():
    document = {
        "dataset_name": dataset_name,
        "table_name": table_name,
        "row_id": index,
        "data": row.to_dict(),
        "classified_columns": {
            "NE": ["Series_Title"],  # column to be linked
            "LIT": ["Released_Year", "Genre"]  # literal columns
        },
        "context_columns": ["Series_Title", "Released_Year", "Genre", "Director"],  # Context columns
        "status": "TODO"
    }
    collection.insert_one(document)

# Initialize the trace collection. total_rows must describe the onboarded
# subset, not the whole CSV: previously it was len(df) while only 10 documents
# had been inserted.
trace_collection.insert_one({
    "dataset_name": dataset_name,
    "table_name": table_name,
    "total_rows": len(rows_to_onboard),
    "processed_rows": 0,
    "status": "PENDING"  # Initial status before processing
})

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

Data onboarded successfully for dataset 'imdb_dataset' and table 'top_1000_movies'.


In [3]:
from crocodile import Crocodile
import os

# Create an instance of the Crocodile class, pointed at the same MongoDB URI,
# database and collection names that the onboarding cells write to.
crocodile_instance = Crocodile(
    mongo_uri="mongodb://mongodb:27017/",
    db_name="crocodile_db",
    collection_name="input_data",
    trace_collection_name="processing_trace",
    max_candidates=3,
    entity_retrieval_endpoint=os.environ["ENTITY_RETRIEVAL_ENDPOINT"],  # read from the environment; KeyError if unset
    entity_bow_endpoint=os.environ["ENTITY_BOW_ENDPOINT"],  # read from the environment; KeyError if unset
    entity_retrieval_token=os.environ["ENTITY_RETRIEVAL_TOKEN"]  # read from the environment; KeyError if unset
)

# Run the entity linking process.
# NOTE: dataset_name and table_name are not defined in this cell; they hold
# whatever the most recently executed onboarding cell assigned.
crocodile_instance.run(dataset_name=dataset_name, table_name=table_name)

print("Entity linking process completed.")

Entity linking process completed.


In [1]:
! pip install Levenshtein

Collecting Levenshtein
  Downloading levenshtein-0.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein)
  Downloading rapidfuzz-3.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (11 kB)
Downloading levenshtein-0.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading rapidfuzz-3.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.26.0 rapidfuzz-3.10.0


In [2]:
from crocodile import Crocodile
import os
# Create an instance of the Crocodile class, using the same MongoDB URI,
# database and collection names as the onboarding cells.
crocodile_instance = Crocodile(
    mongo_uri="mongodb://mongodb:27017/",
    db_name="crocodile_db",
    collection_name="input_data",
    trace_collection_name="processing_trace",
    max_candidates=3,
    entity_retrieval_endpoint=os.environ["ENTITY_RETRIEVAL_ENDPOINT"],  # read from the environment; KeyError if unset
    entity_bow_endpoint=os.environ["ENTITY_BOW_ENDPOINT"],  # read from the environment; KeyError if unset
    entity_retrieval_token=os.environ["ENTITY_RETRIEVAL_TOKEN"
    ]  # read from the environment; KeyError if unset
)
#crocodile_instance.get_bow_from_api(["Q90"])
# Ask the instance for candidates for a sample query. Presumably the first
# argument is the mention and the second its row context -- TODO confirm
# against the Crocodile API. Later cells index the result as a sequence of
# dicts with a "description" key.
candidates = crocodile_instance.fetch_candidates("paris", "paris france")

In [None]:
# Call the instance's bag-of-words lookup for two example QIDs and display the
# result (requires `crocodile_instance` from an earlier cell).
crocodile_instance.get_bow_from_api(["Q30", "Q40"])

In [5]:
# Compare the tokens of a sample row text with those of the first candidate's
# description. `candidates` and `crocodile_instance` come from an earlier cell.
row_text =  "paris france, Canadian"
description = candidates[0]["description"]
# Sets: each distinct token is kept once
candidate_tokens = set(crocodile_instance.tokenize_text(description))
row_tokens = set(crocodile_instance.tokenize_text(row_text))
# Display the overlap score computed by the instance for the two token sets
crocodile_instance.calculate_token_overlap(candidate_tokens, row_tokens)

0.2

In [9]:
# Display both token sets side by side to see which tokens they share
candidate_tokens, row_tokens

({'canadian', 'series', 'television'}, {'canadian', 'france', 'paris'})

In [None]:
# Display the full result of the earlier fetch_candidates call
candidates

In [18]:
import requests
import base64
import gzip
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
import pickle

# Make sure the NLTK data packages used below are available (quiet download)
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource, quiet=True)
from nltk.corpus import stopwords

# Build the English stopword set once, at module level, so tokenize_text
# does not rebuild it on every call
stop_words = set(stopwords.words('english'))

# Function to get BoW vectors from the API
def get_bow_from_api(
    qids,
    url='https://lamapi.hel.sintef.cloud/entity/bow?token=lamapi_demo_2023',
    timeout=30,
):
    """Fetch and decode bag-of-words entries for a list of QIDs.

    Parameters
    ----------
    qids : list
        QIDs to look up; sent as ``{"json": qids}`` in the POST body.
    url : str, optional
        Endpoint to POST to, including its ``token`` query parameter. Defaults
        to the demo endpoint this notebook was written against.
        NOTE(review): the default embeds a token in source; prefer passing a
        URL built from environment configuration.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without a timeout the call could block indefinitely.

    Returns
    -------
    dict or None
        Mapping of QID to the decoded object for that QID, or ``None`` when
        the server answers with a non-200 status (the status is printed).
    """
    response = requests.post(
        url,
        headers={'accept': 'application/json', 'Content-Type': 'application/json'},
        json={"json": qids},
        timeout=timeout,
    )

    if response.status_code != 200:
        print(f"Error fetching BoW: {response.status_code}")
        return None

    # Each value is base64 text wrapping a gzip-compressed pickle.
    # SECURITY: pickle.loads executes arbitrary code embedded in the payload;
    # only call this against an endpoint you trust.
    return {
        qid: pickle.loads(gzip.decompress(base64.b64decode(encoded_data)))
        for qid, encoded_data in response.json().items()
    }

# Function to tokenize text and remove stopwords
def tokenize_text(text):
    """Return the lower-cased word tokens of ``text`` that are purely
    alphabetic and not in the module-level ``stop_words`` set."""
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(token)
    return kept

# Function to build a row vector (BoW)
def build_row_vector(row_text, shared_vocab):
    """Return a NumPy array holding, for each word of ``shared_vocab`` (in
    order), how many times it occurs among the tokens of ``row_text``."""
    token_counts = Counter(tokenize_text(row_text))
    # Counter yields 0 for words that never occurred in the row
    counts_in_vocab_order = [token_counts[word] for word in shared_vocab]
    return np.array(counts_in_vocab_order)

# Function to compute cosine similarity between row and candidate vectors
def compute_similarity(row_bow_vector, candidate_vectors, shared_vocab):
    """Return ``{qid: cosine similarity}`` between ``row_bow_vector`` and each
    candidate bag-of-words in ``candidate_vectors``.

    Each candidate mapping is laid out over ``shared_vocab`` (missing words
    count as 0) so that it lines up with ``row_bow_vector``.
    """
    def _as_vector(bow):
        # Project a word -> count mapping onto the shared vocabulary order
        return np.array([bow.get(word, 0) for word in shared_vocab])

    return {
        qid: cosine_similarity([row_bow_vector], [_as_vector(bow)])[0][0]
        for qid, bow in candidate_vectors.items()
    }

# Sample rows used to exercise the similarity pipeline. Only `row` is scored;
# the city example is kept under its own name instead of being assigned to
# `row` and immediately overwritten.
city_row = {
    'city': 'Paris',
    'country': 'France',
    'continent': 'Europe',
    'population': '2140526',
    'area_km2': '105.4',
    'language': 'French'
}

row = {
    'Series_Title': 'Pulp Fiction',
    'Released_Year': 1994,
    'Runtime (min)': 154,
    'Genre': 'Crime, Drama',
    'IMDB_Rating': 8.9,
    'Overview': 'The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.',
    'Meta_score': 94.0,
    'Director': 'Quentin Tarantino',
    'Star1': 'John Travolta',
    'No_of_Votes': 1826188,
    'Gross': '107,928,762'
}

# Combine the row's values (every column except 'Overview') into a single
# text string for BoW processing
row_text = ' '.join(str(value) for column, value in row.items() if column != 'Overview')

# Step 1: Retrieve BoW vectors from API for some QIDs
qids = ["Q30", "Q166262", "Q90", "Q104123", "Q45", "Q100", "Q5"]  # Example QIDs
candidate_vectors = get_bow_from_api(qids)

if candidate_vectors is None:
    print("No candidate vectors retrieved from the API.")
else:
    # Step 2: Build the shared vocabulary from every candidate BoW plus the
    # row's own tokens
    vocab_words = set()
    for vector in candidate_vectors.values():
        vocab_words.update(vector.keys())
    row_tokens = tokenize_text(row_text)
    vocab_words.update(row_tokens)
    # Sorted so the vocabulary (and the vector layout) is the same on every
    # run; plain set iteration order changes between interpreter sessions.
    shared_vocab = sorted(vocab_words)

    # Step 3: Build the row vector (BoW)
    row_bow_vector = build_row_vector(row_text, shared_vocab)

    # Step 4: Compute similarity between row and candidate vectors
    similarity_scores = compute_similarity(row_bow_vector, candidate_vectors, shared_vocab)

    # Step 5: Output the similarity scores
    print("\nSimilarity Scores between row and candidates:")
    for qid, score in similarity_scores.items():
        print(f"QID: {qid}, Similarity: {score:.4f}")


Similarity Scores between row and candidates:
QID: Q100, Similarity: 0.0037
QID: Q104123, Similarity: 0.1079
QID: Q166262, Similarity: 0.0697
QID: Q30, Similarity: 0.0186
QID: Q45, Similarity: 0.0000
QID: Q5, Similarity: 0.0000
QID: Q90, Similarity: 0.0000


In [None]:
# Display the shared vocabulary built by the similarity cell
shared_vocab

In [15]:
# Display the tokens currently bound to `row_tokens` in the kernel
row_tokens

['pulp',
 'fiction',
 'crime',
 'drama',
 'lives',
 'two',
 'mob',
 'hitmen',
 'boxer',
 'gangster',
 'wife',
 'pair',
 'diner',
 'bandits',
 'intertwine',
 'four',
 'tales',
 'violence',
 'redemption',
 'quentin',
 'tarantino',
 'john',
 'travolta']

In [None]:
# Display the shared vocabulary again (same expression as an earlier cell)
shared_vocab

In [4]:
import pandas as pd
from pymongo import MongoClient

# Load the CSV file into a DataFrame
# NOTE(review): `df` is not used in this cell and is reassigned by the
# ground-truth cell before it is read again; kept so kernel state is unchanged.
file_path = './imdb_top_1000.csv'
df = pd.read_csv(file_path)

# MongoDB connection
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection = db["input_data"]
trace_collection = db["processing_trace"]

# Restrict the query to the table being evaluated. The collection also holds
# rows from the other onboarded tables, whose results (if any) are not keyed
# by "title", so an unfiltered find({}) mixes tables and fails on them.
# NOTE(review): the table is inferred from the "title" column and the
# film_with_QIDs.csv ground truth -- confirm it is the intended one.
evaluated_table = {"dataset_name": "imdb_dataset", "table_name": "film_input_no_QIDs10"}

# Sort by row_id so the collected order is deterministic
results = collection.find(evaluated_table).sort("row_id", 1)

# Collect (row_id, id of the first "title" candidate) for every row that has
# linking results; rows not processed yet, or without candidates, are skipped
# rather than raising KeyError/IndexError.
outcome = []
for result in results:
    title_candidates = (result.get("el_results") or {}).get("title") or []
    if not title_candidates:
        continue
    outcome.append((result["row_id"], title_candidates[0]["id"]))

In [5]:
# Load the ground-truth table. The file's first column is a saved row index
# (it surfaced as "Unnamed: 0"), so read it as the index rather than as data.
df = pd.read_csv("film_with_QIDs.csv", index_col=0)
df

Unnamed: 0,title,Title_QID,director,release year,domestic distributor,length in min,worldwide gross
0,Jurassic World,Q3512046,Colin Trevorrow,2015,Universal Pictures,124,1670400637
1,Superman Returns,Q328695,Bryan Singer,2006,Warner Bros.,154,391081192
2,Batman Begins,Q166262,Christopher Nolan,2005,Warner Bros.,140,371853783
3,Avatar,Q24871,James Cameron,2009,20th Century Fox,162,2744336793
4,Titanic,Q44578,James Cameron,1997,Paramount Pictures,194,2208208395
5,The Avengers,Q182218,Joss Whedon,2012,Disney,143,1518815515
6,Harry Potter and the Deathly Hallows Part 2,Q232009,David Yates,2011,Warner Bros.,130,1341511219
7,Furious 7,Q14650496,James Wan,2015,Universal Pictures,137,1515047671
8,Frozen,Q246283,Chris Buck and Jennifer Lee,2013,Walt Disney Pictures,102,1281983879
9,Iron Man 3,Q209538,Shane Black,2013,Disney,130,1215439994


In [6]:
# One row per linked table row: its row_id and the predicted QID
df2 = pd.DataFrame(outcome, columns=["row_id", "predicted_qid"])

# Look up the gold QID by row_id instead of relying on both frames having the
# same length and order (the positional comparison raised ValueError as soon
# as the lengths differed). Unknown row_ids map to NaN and count as wrong.
gold_qids = df2["row_id"].map(df["Title_QID"])

# Fraction of linked rows whose predicted QID equals the gold QID
(df2["predicted_qid"] == gold_qids).mean()

0.9666666666666667

In [2]:
import pandas as pd
from pymongo import MongoClient

# Read the table that will be onboarded
file_path = './0DO2KMKV.csv'
df = pd.read_csv(file_path)

# Handles to the MongoDB collections written below
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection, trace_collection = db["input_data"], db["processing_trace"]

# Identifiers stored on every row document and on the trace record
dataset_name, table_name = "test", "0DO2KMKV"

# Insert one document per CSV row (all rows), each flagged "TODO"
for index, row in df.iterrows():
    document = dict(
        dataset_name=dataset_name,
        table_name=table_name,
        row_id=index,
        data=row.to_dict(),
        classified_columns=dict(
            NE=["col0"],   # column to be linked
            LIT=["col1"],  # literal column
        ),
        context_columns=df.columns.tolist(),  # every column of the table
        status="TODO",
    )
    collection.insert_one(document)

# Insert the trace record for this table: row count, zero rows processed so far
trace_collection.insert_one(dict(
    dataset_name=dataset_name,
    table_name=table_name,
    total_rows=len(df),
    processed_rows=0,
    status="PENDING",  # initial status before processing
))

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

Data onboarded successfully for dataset 'test' and table '0DO2KMKV'.


In [None]:
# Display the DataFrame currently bound to `df`
# NOTE(review): this renders the whole frame; `df.head()` keeps output short.
df

In [3]:
from crocodile import Crocodile
import os

# Create an instance of the Crocodile class, pointed at the same MongoDB URI,
# database and collection names that the onboarding cells write to.
crocodile_instance = Crocodile(
    mongo_uri="mongodb://mongodb:27017/",
    db_name="crocodile_db",
    collection_name="input_data",
    trace_collection_name="processing_trace",
    max_candidates=3,
    entity_retrieval_endpoint=os.environ["ENTITY_RETRIEVAL_ENDPOINT"],  # read from the environment; KeyError if unset
    entity_bow_endpoint=os.environ["ENTITY_BOW_ENDPOINT"],  # read from the environment; KeyError if unset
    entity_retrieval_token=os.environ["ENTITY_RETRIEVAL_TOKEN"]  # read from the environment; KeyError if unset
)

# Run the entity linking process.
# NOTE: dataset_name and table_name are not defined in this cell; they hold
# whatever the most recently executed onboarding cell assigned.
crocodile_instance.run(dataset_name=dataset_name, table_name=table_name)

print("Entity linking process completed.")

Entity linking process completed.


In [1]:
import pandas as pd
from pymongo import MongoClient
from crocodile import Crocodile
import os
import json

# Read the table to annotate
file_path = './tables/VGUZX5R3.csv'
df = pd.read_csv(file_path)

# Handles to the MongoDB collections written below
client = MongoClient("mongodb://mongodb:27017/")
db = client["crocodile_db"]
collection, trace_collection = db["input_data"], db["processing_trace"]

# Identifiers stored on every row document and on the trace record
dataset_name, table_name = "test", "VGUZX5R3"

# Gold QIDs for this table; the same mapping is attached to every row document
with open('./tables/correct_qids_VGUZX5R3.json', 'r') as file:
    correct_qids = json.load(file)

# Insert one document per CSV row, each flagged "TODO". Columns are referred
# to by their position written as a string ("0", "1", ...).
for index, row in df.iterrows():
    document = dict(
        dataset_name=dataset_name,
        table_name=table_name,
        row_id=index,
        data=row.to_dict(),
        classified_columns=dict(
            NE={"0": "LOCATION", "1": "LOCATION", "3": "LOCATION"},
            LIT=["2", "4", "5"],
        ),
        context_columns=list(map(str, range(len(df.columns)))),  # all column positions
        correct_qids=correct_qids,
        status="TODO",
    )
    collection.insert_one(document)

# Insert the trace record for this table: row count, zero rows processed so far
trace_collection.insert_one(dict(
    dataset_name=dataset_name,
    table_name=table_name,
    total_rows=len(df),
    processed_rows=0,
    status="PENDING",  # initial status before processing
))

print(f"Data onboarded successfully for dataset '{dataset_name}' and table '{table_name}'.")

# Build the linker against the collections populated above; the endpoints and
# token are read from the environment (KeyError if a variable is unset)
crocodile_instance = Crocodile(
    mongo_uri="mongodb://mongodb:27017/",
    db_name="crocodile_db",
    collection_name="input_data",
    trace_collection_name="processing_trace",
    max_candidates=3,
    entity_retrieval_endpoint=os.environ["ENTITY_RETRIEVAL_ENDPOINT"],
    entity_bow_endpoint=os.environ["ENTITY_BOW_ENDPOINT"],
    entity_retrieval_token=os.environ["ENTITY_RETRIEVAL_TOKEN"],
)

# Run entity linking for the table onboarded in this cell
crocodile_instance.run(dataset_name=dataset_name, table_name=table_name)

print("Entity linking process completed.")

Data onboarded successfully for dataset 'test' and table 'VGUZX5R3'.
{'col0': 'Equatorial Guinea', 'col1': 'Malabooo', 'col2': '187,302', 'col3': 'Bata', 'col4': '250,770', 'col5': 1.34}{'col0': 'Burundii', 'col1': 'Gitegaa', 'col2': '22,989', 'col3': 'Bujumbura', 'col4': '497,166', 'col5': 21.6}{'col0': 'Australiaa', 'col1': 'Canberrraa', 'col2': '390,706', 'col3': 'Sydney', 'col4': '4,921,000', 'col5': 12.6}{'col0': 'Bollivia', 'col1': 'Sucreee', 'col2': '259,388', 'col3': 'Santa Cruz de la Sierra', 'col4': '1,453,549', 'col5': 7.1}  {'0': 'LOCATION', '1': 'LOCATION', '3': 'LOCATION'} {'col0': 'Benin', 'col1': 'Porto-Novooo', 'col2': '223,500', 'col3': 'Cotonou', 'col4': '761,100', 'col5': 3.4} {'col0': 'Indiaa', 'col1': 'NNew Delhi', 'col2': '249,998', 'col3': 'Mumbai', 'col4': '12,442,373', 'col5': 49.77}{'col0': 'Belize', 'col1': 'Belmopannn', 'col2': '16,400', 'col3': 'Belize City', 'col4': '70,000', 'col5': 4.27}
{'0': 'LOCATION', '1': 'LOCATION', '3': 'LOCATION'}{'0': 'LOCATION

In [12]:
# Display the column labels of the DataFrame currently bound to `df`
df.columns

Index(['player', 'position', 'country', 'birthplace', 'played', 'teams ↓',
       'notes'],
      dtype='object')

In [3]:
import pandas as pd
import os
import json

# Read the table to annotate
file_path = './tables/VGUZX5R3.csv'
df = pd.read_csv(file_path)

# Identifiers stored on every row document
dataset_name, table_name = "test", "VGUZX5R3"

# Gold QIDs for this table; the same mapping is attached to every row document
with open('./tables/correct_qids_VGUZX5R3.json', 'r') as file:
    correct_qids = json.load(file)

# Build the document for each row WITHOUT writing it anywhere; after the loop
# `document` holds the one built for the last row, ready for inspection.
for index, row in df.iterrows():
    document = dict(
        dataset_name=dataset_name,
        table_name=table_name,
        row_id=index,
        data=row.to_dict(),
        classified_columns=dict(
            NE={"col0": "LOCATION", "col1": "LOCATION", "col3": "LOCATION"},
            LIT=["col2", "col4", "col5"],
        ),
        context_columns=list(range(len(df.columns))),  # integer column positions
        correct_qids=correct_qids,
        status="TODO",
    )

In [4]:
# Display the document left bound to `document` by the preceding loop
# (the one built for the last row)
document

{'dataset_name': 'test',
 'table_name': 'VGUZX5R3',
 'row_id': 33,
 'data': {'col0': 'Vietnamm',
  'col1': 'Hanoii',
  'col2': '6,500,000',
  'col3': 'Ho Chi Minh City',
  'col4': '7,123,340',
  'col5': 1.1},
 'classified_columns': {'NE': {'col0': 'LOCATION',
   'col1': 'LOCATION',
   'col3': 'LOCATION'},
  'LIT': ['col2', 'col4', 'col5']},
 'context_columns': [0, 1, 2, 3, 4, 5],
 'correct_qids': {'0-0': 'Q408',
  '0-1': 'Q3114',
  '0-3': 'Q3130',
  '1-0': 'Q242',
  '1-1': 'Q3043',
  '1-3': 'Q108223',
  '2-0': 'Q962',
  '2-1': 'Q3799',
  '2-3': 'Q43595',
  '3-0': 'Q750',
  '3-1': 'Q2907',
  '3-3': 'Q170688',
  '4-0': 'Q155',
  '4-1': 'Q119158',
  '4-3': 'Q174',
  '5-0': 'Q967',
  '5-1': 'Q167551',
  '5-3': 'Q3854',
  '6-0': 'Q1009',
  '6-1': 'Q3808',
  '6-3': 'Q132830',
  '7-0': 'Q16',
  '7-1': 'Q1930',
  '7-3': 'Q172',
  '8-0': 'Q148',
  '8-1': 'Q956',
  '8-3': 'Q8686',
  '9-0': 'Q1008',
  '9-1': 'Q3768',
  '9-3': 'Q1515',
  '10-0': 'Q983',
  '10-1': 'Q3818',
  '10-3': 'Q320792',
  '1