In [None]:
import sys
from pathlib import Path
# Make the directory that contains the Matcher package importable.
# Candidates are probed in this order: the working directory, its parent, then
# ./tools. If none of them contains "Matcher", the parent directory is used.
_cwd = Path.cwd()
_candidates = (_cwd, _cwd.parent, _cwd / "tools")
_tools = next(
    (candidate for candidate in _candidates if (candidate / "Matcher").exists()),
    _cwd.parent,
)
_tools_entry = str(_tools)
if _tools_entry not in sys.path:
    # Prepend so this checkout takes precedence over other entries on sys.path.
    sys.path.insert(0, _tools_entry)

In [None]:
%reload_ext autoreload
%autoreload 2

from Matcher.utilities import DataLoader
from Matcher.scorer import ScorerConfig, PreferenceScorer, SimilarityScorer
from Matcher.matcher import Matcher
import numpy as np

In [None]:
ROUND = 2

# NOTE(review): the data directory below hard-codes round 2 independently of
# ROUND; keep the two in sync when switching rounds.
dataloader = DataLoader("./embedded_data_round2/")
config = ScorerConfig(base_preference_score=200, mbti_multiplier=0.6)

# One embedded dataframe per applicant pool, loaded in the order listed.
_POOL_DATASETS = (
    "embedded_heterosexual_female_df",
    "embedded_heterosexual_male_df",
    "embedded_homosexual_female_df",
    "embedded_homosexual_male_df",
)
(
    heterosexual_female_df,
    heterosexual_male_df,
    homosexual_female_df,
    homosexual_male_df,
) = [dataloader.load_data(dataset_name) for dataset_name in _POOL_DATASETS]

In [None]:
import sqlite3
import pandas as pd

# Collect previously discarded pairs and translate applicant ids into row
# positions of the pool dataframes, so they can be used as matrix indices.
#
# NOTE(review): the connection is left open after this cell, as before; close
# it once nothing else needs `db`.
db = sqlite3.connect("../db.sqlite3")
matches = pd.read_sql_query("SELECT * FROM match", db)
discarded_matches = matches[matches["discarded"] == 1][["applicant1_id", "applicant2_id"]]

hetro_disarded_idxs = []
homo_female_disarded_idxs = []
homo_male_disarded_idxs = []

# Id sets for each pool (assumes each df has column "id")
het_female_ids = set(heterosexual_female_df["id"])
het_male_ids = set(heterosexual_male_df["id"])
homo_female_ids = set(homosexual_female_df["id"])
homo_male_ids = set(homosexual_male_df["id"])

# id -> 0-based row position (first occurrence) for each pool. Positions, not
# index labels, are what the score matrices and the later `.iloc` lookups use;
# the two only coincide when the dataframe has a default RangeIndex.
_row_position = {}
for _pool_name, _pool_df in (
    ("het_female", heterosexual_female_df),
    ("het_male", heterosexual_male_df),
    ("homo_female", homosexual_female_df),
    ("homo_male", homosexual_male_df),
):
    _first_seen = {}
    for _pos, _applicant_id in enumerate(_pool_df["id"].tolist()):
        _first_seen.setdefault(_applicant_id, _pos)
    _row_position[_pool_name] = _first_seen

_het_f_pos = _row_position["het_female"]
_het_m_pos = _row_position["het_male"]
_homo_f_pos = _row_position["homo_female"]
_homo_m_pos = _row_position["homo_male"]

for a1, a2 in discarded_matches.itertuples(index=False, name=None):
    if a1 in _het_f_pos and a2 in _het_m_pos:
        hetro_disarded_idxs.append((_het_f_pos[a1], _het_m_pos[a2]))
    elif a1 in _het_m_pos and a2 in _het_f_pos:
        # Stored as (male, female) in the table; normalise to (female, male).
        hetro_disarded_idxs.append((_het_f_pos[a2], _het_m_pos[a1]))
    elif a1 in _homo_f_pos and a2 in _homo_f_pos:
        homo_female_disarded_idxs.append((_homo_f_pos[a1], _homo_f_pos[a2]))
    elif a1 in _homo_m_pos and a2 in _homo_m_pos:
        homo_male_disarded_idxs.append((_homo_m_pos[a1], _homo_m_pos[a2]))
    else:
        # The two ids do not belong to a compatible pair of pools.
        print(f"Invalid match: {a1} {a2}")

In [None]:
# Translate the ids of applicants who did not respond into row positions of
# their pool dataframe. The CSV has no header; ids are read from column 0.
no_response_applicants = pd.read_csv("./match_result/no_response_applicants.csv", header=None)

no_response_het_female_idxs = []
no_response_het_male_idxs = []
no_response_homo_female_idxs = []
no_response_homo_male_idxs = []

# (id -> 0-based row position, output list) per pool, probed in this order.
# Positions rather than index labels are recorded because the results are used
# as NumPy column indices; the two only coincide for a default RangeIndex.
_no_response_pools = []
for _pool_df, _bucket in (
    (heterosexual_female_df, no_response_het_female_idxs),
    (heterosexual_male_df, no_response_het_male_idxs),
    (homosexual_female_df, no_response_homo_female_idxs),
    (homosexual_male_df, no_response_homo_male_idxs),
):
    _first_seen = {}
    for _pos, _pool_id in enumerate(_pool_df["id"].tolist()):
        _first_seen.setdefault(_pool_id, _pos)
    _no_response_pools.append((_first_seen, _bucket))

for applicant_id in no_response_applicants[0].tolist():
    for _lookup, _bucket in _no_response_pools:
        if applicant_id in _lookup:
            _bucket.append(_lookup[applicant_id])
            break
    else:
        # The id was not found in any of the four pools.
        print(f"Invalid applicant: {applicant_id}")

In [None]:
# One PreferenceScorer per ordered pair of pools (first dataframe, second
# dataframe). All scorers are built before any matrix is computed.
_preference_pool_pairs = (
    (heterosexual_female_df, heterosexual_male_df),
    (heterosexual_male_df, heterosexual_female_df),
    (homosexual_male_df, homosexual_male_df),
    (homosexual_female_df, homosexual_female_df),
)
(
    FM_preference_scorer,
    MF_preference_scorer,
    MM_preference_scorer,
    FF_preference_scorer,
) = [
    PreferenceScorer(config, first_df, second_df)
    for first_df, second_df in _preference_pool_pairs
]

(
    FM_preference_res,
    MF_preference_res,
    MM_preference_res,
    FF_preference_res,
) = [
    scorer.calculate_score_matrix()
    for scorer in (
        FM_preference_scorer,
        MF_preference_scorer,
        MM_preference_scorer,
        FF_preference_scorer,
    )
]


In [None]:
# One SimilarityScorer per ordered pair of pools (first dataframe, second
# dataframe). All scorers are built before any matrix is computed.
_similarity_pool_pairs = (
    (heterosexual_female_df, heterosexual_male_df),
    (heterosexual_male_df, heterosexual_female_df),
    (homosexual_male_df, homosexual_male_df),
    (homosexual_female_df, homosexual_female_df),
)
(
    FM_similarity_scorer,
    MF_similarity_scorer,
    MM_similarity_scorer,
    FF_similarity_scorer,
) = [
    SimilarityScorer(config, first_df, second_df)
    for first_df, second_df in _similarity_pool_pairs
]

(
    FM_similarity_res,
    MF_similarity_res,
    MM_similarity_res,
    FF_similarity_res,
) = [
    scorer.calculate_score_matrix()
    for scorer in (
        FM_similarity_scorer,
        MF_similarity_scorer,
        MM_similarity_scorer,
        FF_similarity_scorer,
    )
]

In [None]:
MAX_SCORE = 650            # upper cap applied to every score
MINMAX_RATIO = 0.7         # weight of the lower directed score when the two directions are blended
NO_RESPONSE_PENALTY = 150  # subtracted from scores towards applicants who did not respond

# Sum up preference and similarity for each ordered pair of pools.
total_FM = FM_preference_res + FM_similarity_res
total_MF = MF_preference_res + MF_similarity_res
total_MM = MM_preference_res + MM_similarity_res
total_FF = FF_preference_res + FF_similarity_res

# Cap the scores from above. np.minimum is used instead of np.clip(..., max=...)
# because the `max=` keyword is only accepted by recent NumPy releases.
total_FM = np.minimum(total_FM, MAX_SCORE)
total_MF = np.minimum(total_MF, MAX_SCORE)
total_MM = np.minimum(total_MM, MAX_SCORE)
total_FF = np.minimum(total_FF, MAX_SCORE)

# Remove self-matches: in the same-pool matrices the diagonal pairs an
# applicant with themselves.
np.fill_diagonal(total_MM, -np.inf)
np.fill_diagonal(total_FF, -np.inf)

# Remove discarded matches in both directions. Heterosexual pairs are stored
# as (female position, male position).
for f_pos, m_pos in hetro_disarded_idxs:
    total_FM[f_pos, m_pos] = -np.inf
    total_MF[m_pos, f_pos] = -np.inf
for pos_a, pos_b in homo_female_disarded_idxs:
    total_FF[pos_a, pos_b] = -np.inf
    total_FF[pos_b, pos_a] = -np.inf
for pos_a, pos_b in homo_male_disarded_idxs:
    total_MM[pos_a, pos_b] = -np.inf
    total_MM[pos_b, pos_a] = -np.inf

# Add the no-response penalty: lower the whole column of each applicant who
# did not respond, in the matrix whose columns are that applicant's pool.
for col in no_response_het_female_idxs:
    total_MF[:, col] -= NO_RESPONSE_PENALTY
for col in no_response_het_male_idxs:
    total_FM[:, col] -= NO_RESPONSE_PENALTY
for col in no_response_homo_female_idxs:
    total_FF[:, col] -= NO_RESPONSE_PENALTY
for col in no_response_homo_male_idxs:
    total_MM[:, col] -= NO_RESPONSE_PENALTY
    

In [None]:
def _blend_directed_scores(scores_ab, scores_ba):
    """Combine two same-shaped directed score matrices element-wise.

    The result is MINMAX_RATIO times the smaller of the two scores plus
    (1 - MINMAX_RATIO) times the larger one.
    """
    stacked = np.stack((scores_ab, scores_ba))
    return MINMAX_RATIO * stacked.min(axis=0) + (1 - MINMAX_RATIO) * stacked.max(axis=0)


# Aggregate both directions into one score per pair, then bound it to
# [0, MAX_SCORE]; this also maps the -inf entries to 0.
final_FM = np.clip(_blend_directed_scores(total_FM, total_MF.T), 0, MAX_SCORE)
final_MM = np.clip(_blend_directed_scores(total_MM, total_MM.T), 0, MAX_SCORE)
final_FF = np.clip(_blend_directed_scores(total_FF, total_FF.T), 0, MAX_SCORE)

In [None]:
from matplotlib import pyplot as plt

# Distribution of the final pair scores, one panel per matching pool.
# The histogram range starts at 2, so entries below that (including the ones
# clipped to 0) are not drawn.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
_panels = (
    ("Heterosexual (final_FM)", final_FM),
    ("Male-male (final_MM)", final_MM),
    ("Female-female (final_FF)", final_FF),
)
for ax, (panel_title, scores) in zip(axes, _panels):
    ax.hist(scores.flatten(), range=(2, MAX_SCORE), bins=100)
    ax.set(title=panel_title, xlabel="final score", ylabel="matrix entries")
fig.tight_layout()

plt.show()

In [None]:
# Heterosexual pools: matching on the female x male score matrix via
# Matcher.hungarian().
hetrosexual_matcher = Matcher(final_FM)
hetrosexual_matching_result = hetrosexual_matcher.hungarian()

# Same-pool matchings, both solved with max_weight_matching_same_group().
MM_matcher, FF_matcher = Matcher(final_MM), Matcher(final_FF)
MM_matching_result = MM_matcher.max_weight_matching_same_group()
FF_matching_result = FF_matcher.max_weight_matching_same_group()

In [None]:
# Split the heterosexual matching result into scored pairs and unmatched
# applicants. Pairs are stored as (female position, male position, score).
matched_pairs = []
unmatched_male = []
unmatched_female = []
total_score = 0

len_f, len_m = total_FM.shape
for f_idx, m_idx in hetrosexual_matching_result:
    # Indices at or beyond the matrix shape do not refer to a real applicant
    # (presumably padding added by the matcher - TODO confirm).
    f_is_real = f_idx < len_f
    m_is_real = m_idx < len_m
    if f_is_real and m_is_real:
        score = final_FM[f_idx, m_idx]
        if score > 0:
            matched_pairs.append((f_idx, m_idx, score.item()))
            total_score += score
            continue
    # Not a usable pair: record only the side(s) that are real applicants, so
    # an out-of-range index never ends up in the unmatched lists.
    if m_is_real:
        unmatched_male.append(m_idx)
    if f_is_real:
        unmatched_female.append(f_idx)

matched_pairs.sort(key=lambda pair: pair[2], reverse=True)

# Guard against an empty result, as the same-pool summaries do.
avg = total_score / len(matched_pairs) if matched_pairs else 0
print(f"Total score: {total_score}, average score: {avg}")
print(f"Num pairs: {len(matched_pairs)}, num unmatched: M-{len(unmatched_male)}, F-{len(unmatched_female)}")


In [None]:
# Summarise the male-male matching: keep pairs with a positive final score and
# list every position that did not end up in such a pair.
len_m = total_MM.shape[0]

matched_gay_pairs = []
matched_gay_idx = set()
total_gay_score = 0
for m1_idx, m2_idx in MM_matching_result:
    score = final_MM[m1_idx, m2_idx]
    if not (score > 0):
        # Non-positive score: the pair is not kept.
        continue
    matched_gay_pairs.append((m1_idx, m2_idx, score.item()))
    matched_gay_idx.update((m1_idx, m2_idx))
    total_gay_score += score
unmatched_gay = sorted(set(range(len_m)) - matched_gay_idx)

if matched_gay_pairs:
    avg = total_gay_score / len(matched_gay_pairs)
else:
    avg = 0
print(f"Total score: {total_gay_score}, average score: {avg}")
print(f"Num pairs: {len(matched_gay_pairs)}, num unmatched: {len(unmatched_gay)}")


In [None]:
# Summarise the female-female matching: keep pairs with a positive final score
# and list every position that did not end up in such a pair.
len_f = total_FF.shape[0]

matched_les_pairs = []
matched_les_idx = set()
total_les_score = 0
for f1_idx, f2_idx in FF_matching_result:
    score = final_FF[f1_idx, f2_idx]
    if not (score > 0):
        # Non-positive score: the pair is not kept.
        continue
    matched_les_pairs.append((f1_idx, f2_idx, score.item()))
    matched_les_idx.update((f1_idx, f2_idx))
    total_les_score += score
unmatched_les = sorted(set(range(len_f)) - matched_les_idx)

if matched_les_pairs:
    avg = total_les_score / len(matched_les_pairs)
else:
    avg = 0
print(f"Total score: {total_les_score}, average score: {avg}")
print(f"Num pairs: {len(matched_les_pairs)}, num unmatched: {len(unmatched_les)}")

# Save matched results

In [None]:
import os
import random
from random import sample
import pandas as pd

# Seed with the round number so the applicant order drawn for each saved pair
# is reproducible per round.
random.seed(ROUND)
save_path = "match_result/"
# exist_ok avoids the separate existence check and the race it leaves open.
os.makedirs(save_path, exist_ok=True)

In [None]:
# Translate matched row positions back to applicant ids and write every pair
# to CSV. `sample(..., 2)` shuffles which applicant lands in which column.
all_paired_ids = []

# Read ids from the "id" column by position instead of materialising a whole
# row with `.iloc[pos]`: a row Series mixes the dtypes of all columns, which
# can alter the id's type.
_het_male_id_col = heterosexual_male_df["id"]
_het_female_id_col = heterosexual_female_df["id"]
_homo_male_id_col = homosexual_male_df["id"]
_homo_female_id_col = homosexual_female_df["id"]

# Heterosexual pairs are (female position, male position, score).
for f_pos, m_pos, _score in matched_pairs:
    m = _het_male_id_col.iloc[m_pos]
    f = _het_female_id_col.iloc[f_pos]
    all_paired_ids.append(sample((m, f), 2))

for pos_1, pos_2, _score in matched_gay_pairs:
    m1 = _homo_male_id_col.iloc[pos_1]
    m2 = _homo_male_id_col.iloc[pos_2]
    all_paired_ids.append(sample((m1, m2), 2))

for pos_1, pos_2, _score in matched_les_pairs:
    f1 = _homo_female_id_col.iloc[pos_1]
    f2 = _homo_female_id_col.iloc[pos_2]
    all_paired_ids.append(sample((f1, f2), 2))

all_paired_ids_df = pd.DataFrame(all_paired_ids, columns=["applicant1_id", "applicant2_id"])
all_paired_ids_df.to_csv(os.path.join(save_path, f"matched_pairs_{ROUND}.csv"), index=False)