### Import Libraries

In [43]:
import numpy as np
import pandas as pd

from utils.processing import preprocess
from utils.matching import get_fuzzy_matches_df

In [44]:
# Load the fund match table from CSV.
matches_funds = pd.read_csv(filepath_or_buffer='../data/matches_funds.csv')

In [45]:
# Restrict the fund matches to the identifier, name and confidence columns
# that the rest of the notebook works with.
filter_columns = [
    'form_d_firm_id',
    'cik_no_fund',
    'fund',
    'form_adv_firm_id',
    'firm_id',
    'crd_no_fund',
    'matched_fund',
    'fund_confidence',
]
matches_funds = matches_funds[filter_columns]

### Merge with corresponding `related_partners` and `direct_owners`

In [46]:
# Load the pickled related-partner and direct-owner objects (presumably DataFrames).
# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# come from a trusted source.
related_partners = pd.read_pickle('../storage/related_partners.pkl')
direct_owners = pd.read_pickle('../storage/direct_owners.pkl')

In [47]:
# Run the project's preprocess helper over each table.
# NOTE(review): the keyword flag presumably selects the table-specific cleaning
# path; confirm against utils.processing.preprocess.
related_partners = preprocess(related_partners, related_partners=True)
direct_owners = preprocess(direct_owners, direct_owners=True)

In [48]:
# Build the owner-match table from the fund matches and the two owner tables.
# NOTE(review): the result is expected to carry 'related_partners',
# 'direct_owners_fund' and 'owners_fund_ratio', because those columns are
# selected from the merged frame afterwards; confirm in utils.matching.
matches_owners = get_fuzzy_matches_df(matches_funds, related_partners, direct_owners)

In [49]:
# Inner-join the fund matches with the owner matches. No explicit key is given,
# so pandas joins on every column name the two frames have in common.
match_funds_owners = matches_funds.merge(matches_owners, how='inner')

In [50]:
# Keep the fund-match columns plus the owner columns added by the merge.
filter_columns = [
    'form_d_firm_id',
    'cik_no_fund',
    'fund',
    'form_adv_firm_id',
    'firm_id',
    'crd_no_fund',
    'matched_fund',
    'fund_confidence',
    'related_partners',
    'direct_owners_fund',
    'owners_fund_ratio',
]
match_funds_owners = match_funds_owners[filter_columns]

### Process merged matches

In [51]:
# Order rows by ascending fund_confidence and renumber the index from 0.
match_funds_owners = match_funds_owners.sort_values(
    'fund_confidence',
    ignore_index=True,
)

In [52]:
# Round the fund confidence and the owners ratio to 3 decimal places.
# The previous mapping also listed 'firm_confidence' and 'owners_firm_ratio';
# neither is among the columns selected for this frame, and DataFrame.round
# silently ignores keys that are not columns, so they were dead entries.
match_funds_owners = match_funds_owners.round({
    'fund_confidence': 3,
    'owners_fund_ratio': 3,
})

In [53]:
# Threshold: a row is kept when EITHER its fund_confidence is below
# fund_threshold OR its owners_fund_ratio is above owners_threshold.
# Rows with NaN in both columns fail both comparisons and are dropped.
fund_threshold, owners_threshold = 0.512, 0.555
match_funds_owners = match_funds_owners[
    (match_funds_owners['fund_confidence'] < fund_threshold)
    | (match_funds_owners['owners_fund_ratio'] > owners_threshold)
]

In [54]:
# Drop rows that lack either fund name, then rows whose fund_confidence is
# exactly 1 (fund_confidence is later inverted as 1 - value, so these would
# end up with a score of 0).
# Reassign rather than dropna(inplace=True): the frame at this point is the
# result of a boolean filter, and mutating such a slice in place triggers
# pandas' SettingWithCopyWarning.
match_funds_owners = match_funds_owners.dropna(subset=['fund', 'matched_fund'])
match_funds_owners = match_funds_owners[match_funds_owners['fund_confidence'] != 1]

In [55]:
# For each cik_no_fund keep only the row(s) whose owners_fund_ratio equals the
# group maximum. Rows that tie for the maximum are all retained, so this is
# not a strict de-duplication.
# NOTE(review): rows with a missing cik_no_fund presumably get a NaN group
# maximum and are therefore dropped by the comparison; confirm this is wanted.
# The aggregation is named by the string 'max' rather than the builtin `max`:
# passing the builtin is deprecated in recent pandas and emits a FutureWarning.
match_funds_owners = match_funds_owners[
    match_funds_owners['owners_fund_ratio']
    == match_funds_owners.groupby('cik_no_fund')['owners_fund_ratio'].transform('max')
]

In [56]:
# Invert fund_confidence (new value = 1 - old value).
# assign() returns a new frame with the column replaced in its original
# position; assigning the column directly on this filtered frame would trigger
# pandas' SettingWithCopyWarning.
match_funds_owners = match_funds_owners.assign(
    fund_confidence=1 - match_funds_owners['fund_confidence']
)

In [57]:
# Add cik_no_fund Column
match_funds_owners['cik_no_fund'] = np.nan

In [58]:
# Rename columns
filter_columns = [
    'fund', 'cik_no_fund',
    'matched_fund', 'crd_no_fund', 'cik_no_fund', 'fund_confidence',
    'firm_id', 'form_d_firm_id', 'form_adv_firm_id',
]
match_funds_owners = match_funds_owners[filter_columns]
match_funds_owners.rename({
    'fund': 'entity_name', 'cik_no_fund': 'formd_cik',
    'matched_fund': 'firm_name', 'crd_no_fund': 'firm_crd', 'fund_confidence': 'match_score',
    'form_d_firm_id': 'firm_formd_value_id'
}, inplace=True, axis=1)

In [59]:
# Write the final table to CSV. No `index` argument is passed, so the
# DataFrame index is written as the first (unnamed) column.
match_funds_owners.to_csv(path_or_buf='../output/matches_funds_owners.csv')