In [15]:
import duckdb
import pandas as pd
import logging
import warnings

# Use this line to ignore the specific UserWarning message
warnings.filterwarnings(
    action='ignore',
    message='pandas only supports SQLAlchemy',
    category=UserWarning
)

def load_sample_data(table_name, sample_size=1000,
                     database='/opt/test-data/experimental.duckdb'):
    """
    Loads a sample of data from a DuckDB table into a pandas DataFrame.

    Args:
        table_name (str): The name of the table to sample from. It is
            interpolated directly into the SQL text, so only pass trusted
            identifiers (never user-supplied strings).
        sample_size (int): The number of rows to sample.
        database (str): Path of the DuckDB database file; it is opened
            read-only.

    Returns:
        pd.DataFrame | None: A DataFrame containing the sample data, or
        None when the database cannot be opened or the query fails (the
        error is printed in that case).
    """
    try:
        # 1. Open the DuckDB database file read-only. The connect call lives
        #    inside the try block so that a missing/locked file is reported
        #    the same way as a failing query.
        with duckdb.connect(database=database, read_only=True) as conn:
            # 2. Use a SQL query to select a sample
            query = f"SELECT * FROM {table_name} using sample {sample_size} rows"

            # 3. Use pandas.read_sql to execute the query and load the data
            df = pd.read_sql(query, conn)
    except (duckdb.Error, pd.errors.DatabaseError) as e:
        # pandas re-raises driver errors from read_sql as
        # pandas.errors.DatabaseError, so catching duckdb.Error alone would
        # let query failures escape.
        print(f"An error occurred: {e}")
        return None

    print(f"Successfully loaded a sample of {len(df)} rows from {table_name}.")
    return df

In [16]:
# Load a 20,000-row sample of er.pair_features for the cells that follow.
df = load_sample_data("er.pair_features", 20000)
# load_sample_data prints the error and returns None when loading fails;
# stop here with a clear message instead of an AttributeError on df.head().
if df is None:
    raise RuntimeError(
        "Could not load a sample from er.pair_features; see the error printed above."
    )
display(df.head())

Successfully loaded a sample of 20000 rows from er.pair_features.


Unnamed: 0,company_id_a,company_id_b,domain_exact,country_exact,city_exact,name_exact,name_compact_exact,name_prefix4_eq,name_prefix6_eq,emp_cur_absdiff,emp_cur_reldiff,emp_tot_absdiff,emp_tot_reldiff,both_have_domain,both_have_name,country_a,country_b,city_a,city_b
0,3141475,3813168,0,1.0,0.0,0,0,0,0,2,0.4,3,0.3,1,1,italy,italy,udine,milan
1,385551,3218682,0,1.0,0.0,0,0,1,1,0,,4,0.8,1,1,united states,united states,new york,hampton
2,1178364,4420764,0,1.0,,0,0,1,1,3,1.0,4,0.8,1,1,united kingdom,united kingdom,manchester,
3,665884,4727305,0,1.0,0.0,0,0,1,1,13,1.0,28,0.933333,1,1,united states,united states,parkersburg,grand rapids
4,577274,3486868,0,1.0,0.0,0,0,1,1,5,1.0,5,0.833333,1,1,myanmar,myanmar,mandalay,rangoon


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

# `df` is the pair-feature sample loaded in the earlier cell with
# load_sample_data("er.pair_features", ...).

# --- Weak labels ---
# NOTE(review): both masks are computed from columns that remain in the
# feature matrix built later in this cell (only `label` and the id columns
# are dropped), so the validation metrics largely measure how well the model
# reproduces these rules rather than true match quality.

# Positive: domain_exact == 1, OR (country_exact == 1 AND name_prefix6_eq == 1
# AND emp_tot_reldiff <= 0.20). A missing emp_tot_reldiff is filled with 1.0,
# so it never satisfies the <= 0.20 test.
pos_mask = (df["domain_exact"] == 1) | (
    (df["country_exact"] == 1) &
    (df["name_prefix6_eq"] == 1) &
    (df["emp_tot_reldiff"].fillna(1.0) <= 0.20)
)

# Negative: country_exact == 0, OR (name_prefix4_eq == 0 AND
# emp_tot_reldiff >= 0.90). Here a missing emp_tot_reldiff (filled with 1.0)
# does satisfy the >= 0.90 test.
neg_mask = (df["country_exact"] == 0) | (
    (df["name_prefix4_eq"] == 0) &
    (df["emp_tot_reldiff"].fillna(1.0) >= 0.90)
)

# remove overlaps: positives win ties
neg_mask = neg_mask & (~pos_mask)

pos = df[pos_mask].copy()
pos["label"] = 1
neg = df[neg_mask].copy()
neg["label"] = 0

# Fail early with a readable message: with one class empty, the stratified
# split / model fit further down would fail with a far less obvious error.
if pos.empty or neg.empty:
    raise ValueError(
        f"Weak labelling produced {len(pos)} positives and {len(neg)} negatives; "
        "both classes are needed to train the classifier."
    )

# Downsample negatives to at most the number of positives (no-op when there
# are already fewer negatives), then shuffle the combined rows.
neg_sample = neg.sample(n=min(len(pos), len(neg)), random_state=42)
weak_train = pd.concat([pos, neg_sample], ignore_index=True).sample(frac=1, random_state=42)

# --- Features / target ---
# Drop the label and the pair identifiers; errors="ignore" tolerates id
# columns that are absent from the frame.
drop_id_cols = ["company_id_a", "company_id_b"]
X = weak_train.drop(columns=["label"] + drop_id_cols, errors="ignore")
y = weak_train["label"]

# 2) Work out which categorical columns are actually present;
#    every other remaining column is treated as numeric.
candidate_cat = ["country_a", "country_b", "city_a", "city_b"]
cat_cols = [c for c in candidate_cat if c in X.columns]
num_cols = [c for c in X.columns if c not in cat_cols]

# Sanity check against the *candidate* list. cat_cols is built from X.columns,
# so comparing cat_cols with X.columns could never report anything missing.
# A warning (not an assert) keeps the run going with the columns that exist.
missing = set(candidate_cat) - set(X.columns)
if missing:
    warnings.warn(f"These categorical columns are missing from X: {sorted(missing)}")

# 3) Preprocessing: median-impute the numeric columns; for the categorical
#    columns fill gaps with the most frequent value, then one-hot encode
#    (categories unseen during fit are ignored at transform time).
numeric_steps = make_pipeline(SimpleImputer(strategy="median"))
categorical_steps = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)
pre = ColumnTransformer(
    [
        ("num", numeric_steps, num_cols),
        ("cat", categorical_steps, cat_cols),
    ],
    remainder="drop",
)

# 4) Model: preprocessing followed by a class-weighted logistic regression.
clf = Pipeline(
    steps=[
        ("pre", pre),
        ("model", LogisticRegression(class_weight="balanced", max_iter=1000)),
    ]
)

# Hold out 25% of the weakly labelled rows, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

clf.fit(X_train, y_train)
val_pred = clf.predict(X_val)
print(classification_report(y_val, val_pred, digits=3))

# Probabilities for downstream use:
val_proba = clf.predict_proba(X_val)[:, 1]

              precision    recall  f1-score   support

           0      1.000     0.993     0.996       423
           1      0.994     1.000     0.997       525

    accuracy                          0.997       948
   macro avg      0.997     0.996     0.997       948
weighted avg      0.997     0.997     0.997       948



In [18]:
from pathlib import Path

import joblib

# Persist the fitted pipeline (preprocessing + model) for later reuse.
MODEL_PATH = "../models/er_pair_clf.joblib"
# Create the target directory first so the dump does not fail when
# ../models does not exist yet.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, MODEL_PATH)

['../models/er_pair_clf.joblib']