In [2]:
import os
import pandas as pd
from datasketch import MinHash, MinHashLSHEnsemble
from typing import List, Tuple

In [51]:


# --- Configuration ---
BASE_FILE = "D:/uni/AdvancedTopics/project2/sinteticDB/IMDB/IMDB_Base.csv"
NEW_FILE = "D:/uni/AdvancedTopics/project2/sinteticDB/IMDB/Versions/imdb_with_awards.csv"
CANDIDATE_DIR = "sinteticDB/IMDB/externalTables"
MAIN_DATASET_KEYS = ["Series_Title"]
NUM_PERM = 128

def create_minhash(values: List[str], num_perm=128):
    m = MinHash(num_perm=num_perm)
    for v in values:
        if pd.notna(v):
            m.update(str(v).strip().lower().encode('utf8'))
    return m

def extract_column_dataframes(directory: str) -> List[Tuple[str, str, List[str], int]]:
    results = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            table_name = filename[:-4]
            df = pd.read_csv(os.path.join(directory, filename))
            for col in df.columns:
                values = df[col].astype(str).dropna().unique().tolist()
                results.append((table_name, col, values, len(values)))
    return results

def recommend_join_type(candidate_keys, main_keys, jaccard_sim, coverage_threshold=0.7):
    key_overlap = len(set(candidate_keys).intersection(set(main_keys)))
    if key_overlap == len(main_keys):
        return "inner join"
    elif key_overlap > 0 and jaccard_sim > coverage_threshold:
        return "left join"
    else:
        return "outer join"

In [52]:
# --- Load base and new dataset versions ---
base_df = pd.read_csv(BASE_FILE)
new_df = pd.read_csv(NEW_FILE)


In [53]:
# --- Detect newly added attributes ---
base_cols = set(base_df.columns)
new_cols = set(new_df.columns)
added_cols = new_cols - base_cols

if not added_cols:
    print("✅ No new attributes found.")
    exit()

print(f"🆕 New attributes detected: {', '.join(added_cols)}\n")

🆕 New attributes detected: Award_Name, Won, Award_Year, Award_Category



In [54]:
# --- Index external candidate columns with MinHash + LSH Ensemble ---
column_entries = extract_column_dataframes(CANDIDATE_DIR)
minhashes = []
index_metadata = []

for table_name, col_name, values, size in column_entries:
    mh = create_minhash(values, NUM_PERM)
    minhashes.append(mh)
    index_metadata.append({
        'table': table_name,
        'column': col_name,
        'full_name': f"{table_name}.{col_name}",
        'size': size,
        'keys': [col_name]  # simple assumption
    })

lsh = MinHashLSHEnsemble(threshold=0.1, num_perm=NUM_PERM,num_part=32)
lsh.index(
    [(m['full_name'], m['size']) for m in index_metadata],
    minhashes
)

TypeError: MinHashLSHEnsemble.index() takes 2 positional arguments but 3 were given

In [68]:
#possibile fix
# --- Index external candidate columns with MinHash + LSH Ensemble ---
column_entries = extract_column_dataframes(CANDIDATE_DIR)
minhashes = []
index_metadata = []

for table_name, col_name, values, size in column_entries:
    mh = create_minhash(values, NUM_PERM)
    minhashes.append(mh)
    index_metadata.append({
        'table': table_name,
        'column': col_name,
        'full_name': f"{table_name}.{col_name}",
        'size': size,
        'keys': [col_name]  # simple assumption
    })

lsh = MinHashLSHEnsemble(threshold=0.5, num_perm=NUM_PERM)
keys = [m['full_name'] for m in index_metadata]
sizes = [m['size'] for m in index_metadata]
combined = list(zip(keys, minhashes, sizes))
lsh.index(combined) 

In [69]:
#--- For each new attribute: search + recommend join ---
for new_col in added_cols:
    print(f"🔍 Analyzing new attribute: `{new_col}`")

    new_values = new_df[new_col].astype(str).dropna().unique().tolist()
    if not new_values:
        print("⚠️ No values found for this attribute. Skipping.\n")
        continue

    new_attr_minhash = create_minhash(new_values, NUM_PERM)
    candidates = lsh.query(new_attr_minhash, len(new_values))
    for key in lsh.query(new_attr_minhash, len(new_values)):
        print(key)

    ranked = []
    for meta, mh in zip(index_metadata, minhashes):
        if meta['full_name'] in candidates:
            sim = new_attr_minhash.jaccard(mh)
            join_type = recommend_join_type(meta['keys'], MAIN_DATASET_KEYS, sim)
            ranked.append((meta['table'], meta['column'], sim, join_type))

    ranked.sort(key=lambda x: x[2], reverse=True)

    if ranked:
        print("\nTop Candidate Matches:")
        for table, column, sim, join_type in ranked[:5]:
            print(f" → {table}.{column} | Jaccard: {sim:.4f} | Join: {join_type}")

        best = ranked[0]
        print(f"\n✅ Best match for `{new_col}`: {best[0]}.{best[1]} → {best[3]} (Sim: {best[2]:.4f})\n")
    else:
        print("❌ No good matches found for this attribute.\n")

🔍 Analyzing new attribute: `Award_Name`
awards.Award_Name
country_data.Localized_Title
❌ No good matches found for this attribute.

🔍 Analyzing new attribute: `Won`
streaming.Subscription_Required
awards.Won
country_data.Localized_Title
❌ No good matches found for this attribute.

🔍 Analyzing new attribute: `Award_Year`
❌ No good matches found for this attribute.

🔍 Analyzing new attribute: `Award_Category`
awards.Award_Category
country_data.Localized_Title
❌ No good matches found for this attribute.



24