In [2]:
import json, ast
import numpy as np
import pandas as pd
from db.db import init_db, read_table
from db.models import TableName
from etl.load import load_raw_data, load_clean_data
from etl.pipeline import _process_table
from pattern_mining.pipeline import _process_and_load_cleaned_data
from email_prediction.feature_engineering.pipeline import _build_profile_and_pad, build_feature_matrix
from email_prediction.feature_engineering.features.feature_builder import _build_firm_stats_lookup, _build_firm_template_lookup, _build_rows_for_investor
from sklearn.model_selection import train_test_split
from catboost import CatBoostRanker

# Placebo and Ablation Testing

Following feedback from Antony, I will do some further testing to confirm the importance of the feature types (ablation) and to placebo-test the synthetic investors.

Let's start with placebo testing of the synthetic investors. We will do this by generating a modest dataset of synthetic investors, then matching its size in a separate set built by randomly duplicating existing investors. We will not do segmentation, and we will use only a small portion of the data.

In [2]:
# Stage 1 - ingest: create the database tables, load the raw LP contact data,
# clean it, and persist the cleaned rows. The steps below are order-dependent.

# Setup DB and load raw data
init_db()
load_raw_data(TableName.LP)

# Process LP table (the run log reports normalisation, regex-cleaning and
# standardisation passes, plus counts of invalid/missing fields)
clean_lp = _process_table(TableName.LP)

# Write results (per the run log, the same rows are also written to COMBINED_CLEAN)
load_clean_data(TableName.LP_CLEAN, clean_lp)

Tables created successfully.
Read from LP Contact Data
Write to LP complete!
Written 165268 records to LP
Read LP table from database!
Table Normalisation Complete!
Table Regex-Cleaning Complete!
Removed 18 bad investor rows!
Table Standardisation Complete!
87 invalid email lengths found!
73142 missing emails!
31 invalid email formats found!
396 invalid LinkedIns found!
47184 missing LinkedIns!
2181 invalid firm lengths found!
0 missing firms!
8897 invalid investor lengths found!
0 missing investors!
22564 missing linkedin and email pairs!
Removed 30 emails with invalid local-part characters.
Wrote 92078 rows to LP_CLEAN
Also wrote 92078 rows to COMBINED_CLEAN


Now to run our pattern mining.

In [3]:
# Fixed seed so the down-sampling below is reproducible
RANDOM_SEED = 42

# Share of the cleaned rows to keep; the full set takes too long to train on
SAMPLE_FRACTION = 0.25

# Mine patterns over the cleaned data and get the processed frame back
clean_data = _process_and_load_cleaned_data()

# Keep a reproducible random subset of the rows
clean_data = clean_data.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)

# Overwrite COMBINED_CLEAN so later cells read only the sampled rows.
# NOTE(review): the mining call above reads COMBINED_CLEAN (see run log), so
# re-running this cell on its own would presumably sample the sample — confirm.
load_clean_data(TableName.COMBINED_CLEAN, clean_data, replace=True)

Read COMBINED_CLEAN table from database!
Email Token Encoder Initialised!
Tokenised 5000 out of 92078
Tokenised 10000 out of 92078
Tokenised 15000 out of 92078
Tokenised 20000 out of 92078
Tokenised 25000 out of 92078
Tokenised 30000 out of 92078
Tokenised 35000 out of 92078
Tokenised 40000 out of 92078
Tokenised 45000 out of 92078
Tokenised 50000 out of 92078
Tokenised 55000 out of 92078
Tokenised 60000 out of 92078
Tokenised 65000 out of 92078
Tokenised 70000 out of 92078
Tokenised 75000 out of 92078
Tokenised 80000 out of 92078
Tokenised 85000 out of 92078
Tokenised 90000 out of 92078
Finished tokenizing: 83504 valid rows, 0 failures.
8574 unknown sequences out of 92078
297 unique templates!
>/home/d_bowman/documents/IRP/irp-db1724/spmf.jar
Sequential rules count: 12
Total time : 2317 ms
Max memory (mb)424.197998046875

12 rules mined!
[{'lhs_tokens': ['first_original_0'], 'rhs_tokens': ['.'], 'support': 33360, 'confidence': 0.7062410026251165}, {'lhs_tokens': ['first_original_0'], 

Now we will run a modest padding campaign.

In [4]:
# Pull the three inputs the padding step needs from the database
clean_data = read_table(TableName.COMBINED_CLEAN)
cand_temps = read_table(TableName.CANDIDATE_TEMPLATES)
firm_template_map = read_table(TableName.FIRM_TEMPLATE_MAP)

# Firms with fewer than this many investors are selected for padding
LOW_INVESTOR_THRESHOLD = 5
is_low_investor = firm_template_map["num_investors"] < LOW_INVESTOR_THRESHOLD
low_investor_firms = firm_template_map.loc[is_low_investor, "firm"].tolist()

# Build firm profiles and pad the selected firms with synthetic investors.
# NOTE(review): the trailing 5 is presumably the per-firm padding target —
# confirm against _build_profile_and_pad before tying it to the threshold above.
augmented = _build_profile_and_pad(
    clean_data,
    firm_template_map,
    cand_temps,
    low_investor_firms,
    5,
)
print(f"Generated {len(augmented) - len(clean_data)} new investors!")


Read COMBINED_CLEAN table from database!
Read CANDIDATE_TEMPLATES table from database!
Read FIRM_TEMPLATE_MAP table from database!
Generated 35700 new investors!


Next we will randomly select existing investors to duplicate and generate a randomly padded set of the same size.

In [None]:
# Match the synthetic campaign's size: one placebo row per generated investor
n_length_to_pad = len(augmented) - len(clean_data)

# Dedicated generator for the random firm assignment below
rng = np.random.default_rng(RANDOM_SEED)

# Pool of distinct firms the placebo rows can be assigned to
firms = clean_data["firm"].unique()

# Draw existing investors with replacement, then overwrite each duplicate's firm
# with one picked at random from the global pool, so the placebo rows carry no
# genuine firm-level structure.
# NOTE(review): the duplicates appear to keep their original `id` values —
# confirm that downstream id-based splits tolerate repeated ids.
placebo_rows = (
    clean_data
    .sample(n=n_length_to_pad, replace=True, random_state=RANDOM_SEED)
    .assign(firm=lambda duplicated: rng.choice(firms, size=len(duplicated)))
)

# Original rows first, placebo duplicates appended, with a fresh 0..n-1 index
random_augment = pd.concat([clean_data, placebo_rows], ignore_index=True)

print(f"Generated {len(random_augment) - len(clean_data)} random investors!")

Generated 35700 random investors!


Now we can generate our feature matrices. We will reuse the complex feature-matrix table (`FEATURE_MATRIX_COMPLEX`) for the randomly augmented set so we don't have to define another table.

In [6]:
# Build one feature matrix per investor set and store each in its own table:
# the synthetic-padded set goes to FEATURE_MATRIX, the placebo (randomly padded)
# set borrows FEATURE_MATRIX_COMPLEX.
for investor_set, target_table in (
    (augmented, TableName.FEATURE_MATRIX),
    (random_augment, TableName.FEATURE_MATRIX_COMPLEX),
):
    build_feature_matrix(
        investor_set,
        cand_temps,
        firm_template_map,
        table=target_table,
    )

0 features built out of 56544
5000 features built out of 56544
10000 features built out of 56544
15000 features built out of 56544
20000 features built out of 56544
25000 features built out of 56544
30000 features built out of 56544
35000 features built out of 56544
40000 features built out of 56544
45000 features built out of 56544
50000 features built out of 56544
55000 features built out of 56544
0 features built out of 56544
5000 features built out of 56544
10000 features built out of 56544
15000 features built out of 56544
20000 features built out of 56544
25000 features built out of 56544
30000 features built out of 56544
35000 features built out of 56544
40000 features built out of 56544
45000 features built out of 56544
50000 features built out of 56544
55000 features built out of 56544


OK, now we will define our splits. Let's start with a simple firm-level split. We'll build a test set of roughly 5000 investors by adding whole firms (in random order) until at least 5000 rows are reached. We then isolate those firms and draw the training/validation set from the remaining investors, none of whom belong to those firms.

In [33]:
# Firm-held-out split: whole firms are assigned to the test set, and train/val
# rows are drawn only from the firms that were not picked.

# Load clean data
clean_data = read_table(TableName.COMBINED_CLEAN)

# Seed random
rng = np.random.default_rng(RANDOM_SEED)

TEST_ROWS_N = 5000        # minimum number of rows in the firm-held test set
TRAIN_VAL_ROWS_N = 15000  # rows drawn for train + validation combined
TRAIN_FRACTION = 0.8      # share of the train/val draw used for training

# Count rows per firm once, instead of re-filtering the frame for every firm
firm_sizes = clean_data["firm"].value_counts()

# Pick test set: add whole firms in shuffled order until the target is reached
shuffled_firms = rng.permutation(clean_data["firm"].unique())
test_firms, test_rows = [], 0
for firm in shuffled_firms:
    test_rows += int(firm_sizes.get(firm, 0))
    test_firms.append(firm)
    if test_rows >= TEST_ROWS_N:
        break

# Remaining firms (set membership keeps this linear in the number of firms)
test_firm_set = set(test_firms)
remaining_firms = [f for f in clean_data["firm"].unique() if f not in test_firm_set]
remaining_df = clean_data[clean_data["firm"].isin(remaining_firms)]

# Sample train/val rows from remaining firms; cap at the pool size so a small
# pool does not make DataFrame.sample raise
n_train_val = min(TRAIN_VAL_ROWS_N, len(remaining_df))
sampled = remaining_df.sample(n=n_train_val, random_state=RANDOM_SEED)

# Shuffle the ids, then cut 80/20 with positional slicing. This replaces
# np.split on a Series, which emitted a FutureWarning.
shuffled_ids = sampled["id"].sample(frac=1, random_state=RANDOM_SEED)
n_train = int(TRAIN_FRACTION * len(shuffled_ids))
firm_held_train_ids = shuffled_ids.iloc[:n_train]
firm_held_val_ids = shuffled_ids.iloc[n_train:]

# Test ids: every row belonging to one of the held-out firms
firm_held_test_ids = clean_data[clean_data["firm"].isin(test_firms)]["id"]

# Save to csv
pd.DataFrame({"train_ids": firm_held_train_ids}).to_csv("splits/firm_held_train_ids.csv", index=False)
pd.DataFrame({"val_ids": firm_held_val_ids}).to_csv("splits/firm_held_val_ids.csv", index=False)
pd.DataFrame({"test_ids": firm_held_test_ids}).to_csv("splits/firm_held_test_ids.csv", index=False)

Read COMBINED_CLEAN table from database!


  return bound(*args, **kwds)


Next we will isolate a complex and a standard test set. We will put 2000 investors of each kind in the test set and leave a healthy mix of both in the training and validation sets.

In [34]:
# Flag columns; an investor whose flags sum to more than zero counts as "complex"
FLAGS = [
    "has_nfkd_normalized",
    "has_multiple_last_names",
    "has_german_char",
    "has_nickname",
    "has_multiple_first_names",
    "has_middle_name",
]

# Target test-set size for each group
N_COMPLEX = 2000
N_STANDARD = 2000

# One row per investor id, carrying only the id and its flag columns
flag_view = clean_data.loc[:, ["id", *FLAGS]].drop_duplicates(subset="id")

# Partition the ids into complex and standard investors
complex_mask = flag_view[FLAGS].sum(axis=1).gt(0)
complex_ids_all = flag_view["id"][complex_mask].to_numpy()
standard_ids_all = flag_view["id"][~complex_mask].to_numpy()

# Draw the two test sets without replacement (complex first, then standard, so
# the generator is consumed in a fixed order); each is capped at its group size
rng = np.random.default_rng(RANDOM_SEED)
n_complex_test = min(N_COMPLEX, len(complex_ids_all))
n_standard_test = min(N_STANDARD, len(standard_ids_all))
complex_sample = rng.choice(complex_ids_all, size=n_complex_test, replace=False)
standard_sample = rng.choice(standard_ids_all, size=n_standard_test, replace=False)
test_ids = np.concatenate((complex_sample, standard_sample))

# Everything not held out for test is shuffled and cut 80/20 into train/val
in_test = flag_view["id"].isin(test_ids)
pool = flag_view.loc[~in_test, "id"].sample(frac=1.0, random_state=RANDOM_SEED)
n_val = int(0.2 * len(pool))
val_ids = pool.iloc[:n_val].to_frame(name="val_ids")
train_ids = pool.iloc[n_val:].to_frame(name="train_ids")

# Persist the cultural-bias split: the two test sets, then train, then val
cult_bias_outputs = [
    (pd.DataFrame({"test_ids_complex": complex_sample}), "splits/test_ids_complex_only.csv"),
    (pd.DataFrame({"test_ids_standard": standard_sample}), "splits/test_ids_standard_only.csv"),
    (train_ids, "splits/train_ids_cult_bias.csv"),
    (val_ids, "splits/val_ids_cult_bias.csv"),
]
for frame, csv_path in cult_bias_outputs:
    frame.to_csv(csv_path, index=False)

Finally we need a simple split for the placebo testing.

In [35]:
# Simple random split over the unique investor ids, used for the placebo test
all_ids = clean_data["id"].astype(str).unique()

# 70/15/15 split: hold out 15% for test, then take 0.1765 (~0.15 / 0.85) of the
# remainder so that validation is also ~15% of the whole.
TEST_FRACTION = 0.15
VAL_FRACTION_OF_REMAINDER = 0.1765

# Use the notebook-wide RANDOM_SEED rather than a separate hard-coded seed so
# every split in this notebook is controlled from one place.
train_ids, test_ids = train_test_split(
    all_ids, test_size=TEST_FRACTION, random_state=RANDOM_SEED
)
train_ids, val_ids = train_test_split(
    train_ids, test_size=VAL_FRACTION_OF_REMAINDER, random_state=RANDOM_SEED
)

# Save to CSVs
pd.DataFrame({"train_ids": train_ids}).to_csv("splits/train_ids.csv", index=False)
pd.DataFrame({"val_ids": val_ids}).to_csv("splits/val_ids.csv", index=False)
pd.DataFrame({"test_ids": test_ids}).to_csv("splits/test_ids.csv", index=False)