### Import Libraries

In [1]:
import pandas as pd

from utils.processing import preprocess
from utils.matching import get_fuzzy_matches_df

In [2]:
# Load the previously saved fund matches (matches_funds.csv) into a DataFrame.
matches_funds = pd.read_csv('../data/matches_funds.csv')

In [3]:
# Trim the fund matches down to the identifier, name and confidence columns
# used by the rest of the notebook (Form D side first, then Form ADV side).
filter_columns = [
    'form_d_fund_id',
    'cik_no_fund',
    'fund',
    'form_adv_fund_id',
    'crd_no_fund',
    'matched_fund',
    'fund_confidence',
]
matches_funds = matches_funds.loc[:, filter_columns]

### Merge with corresponding `related_partners` and `direct_owners`

In [4]:
# Load the pickled related-partner and direct-owner objects from local storage.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
related_partners = pd.read_pickle('../storage/related_partners.pkl')
direct_owners = pd.read_pickle('../storage/direct_owners.pkl')

In [5]:
# Pass each object through the project's `preprocess` helper, replacing the raw
# version. The keyword flag presumably selects the input-specific handling.
# NOTE(review): the actual cleaning steps live in utils.processing and are not
# visible here -- confirm against that module.
related_partners = preprocess(related_partners, related_partners=True)
direct_owners = preprocess(direct_owners, direct_owners=True)

In [6]:
# Build the owner-matching result from the fund matches plus both owner sources.
# NOTE(review): the output schema is defined in utils.matching; the later column
# selection expects it to supply 'related_partners', 'direct_owners_fund' and
# 'owners_fund_ratio' -- confirm against that module.
matches_owners = get_fuzzy_matches_df(matches_funds, related_partners, direct_owners)

In [7]:
# Inner-join the fund matches with their owner matches. No `on` key is given,
# so pandas joins on every column name the two frames have in common.
match_funds_owners = matches_funds.merge(matches_owners, how='inner')

In [9]:
# Fix the column set and order of the merged table.
filter_columns = [
    # Fund-matching columns
    'form_d_fund_id',
    'cik_no_fund',
    'fund',
    'form_adv_fund_id',
    'crd_no_fund',
    'matched_fund',
    'fund_confidence',
    # Owner-matching columns
    'related_partners',
    'direct_owners_fund',
    'owners_fund_ratio',
]
match_funds_owners = match_funds_owners.loc[:, filter_columns]

In [10]:
# Order rows by ascending fund_confidence, then renumber the index from 0.
match_funds_owners = (
    match_funds_owners
    .sort_values(by='fund_confidence')
    .reset_index(drop=True)
)

In [11]:
# Round the confidence and ratio columns to three decimal places.
# DataFrame.round ignores keys that are not columns of the frame, so the
# firm_* entries have no effect once the column selection above has dropped them.
match_funds_owners = match_funds_owners.round(
    dict.fromkeys(
        ('fund_confidence', 'firm_confidence', 'owners_firm_ratio', 'owners_fund_ratio'),
        3,
    )
)

In [12]:
# Persist the merged table. The DataFrame index is written as the first column
# because `index=False` is not passed.
match_funds_owners.to_csv('../output/matches_funds_owners.csv')

In [None]:
# Threshold
fund_threshold = 0.512
owners_threshold = 0.555

# Keep a row when either criterion holds: its fund_confidence is below
# fund_threshold, or its owners_fund_ratio is above owners_threshold.
match_funds_owners_threshold = match_funds_owners[
    (match_funds_owners['fund_confidence'] < fund_threshold)
    | (match_funds_owners['owners_fund_ratio'] > owners_threshold)
]

In [None]:
# First export batch: rows at positions 0-5999 of the merged table.
# The sort returns a new frame instead of sorting the slice in place; an
# in-place sort on an `.iloc` slice mutates a slice of another DataFrame and
# triggers pandas' SettingWithCopyWarning.
matches_frac_1 = (
    match_funds_owners
    .iloc[:6000]
    .sort_values(by='fund_confidence', ignore_index=True)
)
matches_frac_1.to_csv("../output/matches_frac_1.csv")

In [None]:
# Second export batch: rows at positions 6000-11999 of the merged table.
# The sort returns a new frame instead of sorting the slice in place; an
# in-place sort on an `.iloc` slice mutates a slice of another DataFrame and
# triggers pandas' SettingWithCopyWarning.
matches_frac_2 = (
    match_funds_owners
    .iloc[6000:12000]
    .sort_values(by='fund_confidence', ignore_index=True)
)
matches_frac_2.to_csv("../output/matches_frac_2.csv")

In [None]:
# Third export batch: rows at positions 12000-17999 of the merged table.
# The sort returns a new frame instead of sorting the slice in place; an
# in-place sort on an `.iloc` slice mutates a slice of another DataFrame and
# triggers pandas' SettingWithCopyWarning.
# NOTE(review): rows at position 18000 and beyond are not written by this
# cell -- confirm the table has at most 18000 rows or is exported elsewhere.
matches_frac_3 = (
    match_funds_owners
    .iloc[12000:18000]
    .sort_values(by='fund_confidence', ignore_index=True)
)
matches_frac_3.to_csv("../output/matches_frac_3.csv")