In [19]:
import pandas as pd

from mapping.utils.processing import preprocess

In [20]:
matches_firms = pd.read_csv('../data/matches_firms.csv')
matches_funds = pd.read_csv('../data/matches_funds.csv')
matches_owners = pd.read_csv('../data/matches_owners.csv')

In [21]:
matches_firms_funds = pd.merge(matches_firms, matches_funds, how='inner')

In [22]:
# Formatting Data Frame
filter_columns = [
    'form_d_fund_id', 'cik_no_fund', 'fund',
    'form_adv_firm_id', 'crd_no_firm', 'matched_firm', 'firm_confidence',
    'form_adv_fund_id', 'crd_no_fund', 'matched_fund', 'fund_confidence',
]
matches_firms_funds = matches_firms_funds[filter_columns]

In [23]:
matches_firms_funds.head()

Unnamed: 0,form_d_fund_id,cik_no_fund,fund,form_adv_firm_id,crd_no_firm,matched_firm,firm_confidence,form_adv_fund_id,crd_no_fund,matched_fund,fund_confidence
0,54568,1694312,Ajo Emerging Markets Small Cap Fund,,309921,Capitalworks Emerging Markets Advisor,0.985133,602.0,105545,Ajo Emerging Markets Small Cap Fund,0.0
1,60070,1685357,Libra Fossil Fuel Free Fund,,281914,Fund,0.92708,11352.0,294197,Libra Fossil Fuel Free Fund,0.0
2,38363,1679614,Fiam Global Low Volatility Equity Fund,,137649,Global Equity Advisors,0.891444,10035.0,133196,Fiam Global Low Volatility Equity Fund,0.0
3,43377,1753457,Parian Global Us Fund,20874.0,297279,Parian Global Management,0.819455,20874.0,297279,Parian Global Us Fund Ii,0.263488
4,24565,1450552,Davi Luxury Brand Group,142265.0,306533,Daventry Group,0.983191,3170.0,110885,Capital Group Euro Bond Fund Lux,0.97405


Testing

In [24]:
match_df = matches_firms_funds

In [25]:
cols = ['cik_no_fund', 'crd_no_firm', 'crd_no_fund']

match_df = match_df[cols]

In [43]:
from fuzzywuzzy import fuzz

def get_match_df(data_a, data_b):

    left_values = list(set(data_a.related_partners))
    right_values = data_b.direct_owners.to_list()

    matches = []

    for partner in left_values:
        best_owner, best_raio = None, 0
        for owner in right_values:
            ratio = fuzz.SequenceMatcher(None, partner, owner).ratio()
            if ratio > best_raio:
                best_owner = owner
                best_raio = ratio
        matches.append((partner, best_owner, best_raio))

    return pd.DataFrame(matches, columns=['related_partner', 'direct_owners', 'owners_ratio'])

In [58]:
related_partners = pd.read_pickle('../storage/related_partners.pkl')
direct_owners = pd.read_pickle('../storage/direct_owners.pkl')

related_partners = preprocess(related_partners, related_partners=True)
direct_owners = preprocess(direct_owners, direct_owners=True)

result_df = []

for idx, row in match_df.iterrows():

    cik_no_fund, crd_no_firm, crd_no_fund = row.cik_no_fund, row.crd_no_firm, row.crd_no_fund

    table_a = related_partners[related_partners.cik_no_related_partners == cik_no_fund]
    table_b = direct_owners[direct_owners.crd_no_owners == crd_no_firm] 
    table_c = direct_owners[direct_owners.crd_no_owners == crd_no_fund]
   
    match_a = get_match_df(table_a, table_b)
    match_b = get_match_df(table_a,table_c)

    match = pd.concat([match_a, match_b],axis=1)

    match.columns = ['related_partners','direct_owners_firm','owners_firm_ratio','drop','direct_owners_fund','owners_fund_ratio']
    match.drop(columns=['drop'],axis=1,inplace=True)
    match['cik_no_fund'] = cik_no_fund
    match['crd_no_firm'] = crd_no_firm
    match['crd_no_fund'] = crd_no_fund

    result_df.append(match)

  dataFrame.related_partners = dataFrame.related_partners.str.replace('(', '')
  dataFrame.related_partners = dataFrame.related_partners.str.replace(')', '')
  dataFrame.related_partners = dataFrame.related_partners.str.replace('.', '')
  dataFrame.direct_owners = dataFrame.direct_owners.str.replace('(', '')
  dataFrame.direct_owners = dataFrame.direct_owners.str.replace(')', '')
  dataFrame.direct_owners = dataFrame.direct_owners.str.replace('.', '')


In [59]:
stack_result = pd.DataFrame()

for data in result_df:
    if not data.empty:
        if not stack_result.empty:
            stack_result = pd.concat([stack_result, data], join='inner', ignore_index=True)
        else:
            stack_result = data

In [61]:
result_df[6].head()

Unnamed: 0,related_partners,direct_owners_firm,owners_firm_ratio,direct_owners_fund,owners_fund_ratio,cik_no_fund,crd_no_firm,crd_no_fund
0,Michael Pruitt,,0,Bergeron Michael Robert,0.540541,1106838,135387,134320


In [65]:
stack_result[stack_result.direct_owners_firm.notnull()]

Unnamed: 0,related_partners,direct_owners_firm,owners_firm_ratio,direct_owners_fund,owners_fund_ratio,cik_no_fund,crd_no_firm,crd_no_fund
11,Susan Etzel,Sherman Mark Douglas,0.322581,Rosenthal Robert Daniel,0.294118,1136174,108401,167212
12,Steven Kriegsman,Sherman Mark Douglas,0.333333,Juchem Stephen Joseph,0.378378,1136174,108401,167212
13,Terren Peizer,Sherman Mark Douglas,0.303030,Kudu Investment Management,0.307692,1136174,108401,167212
14,Richard Berman,Sherman Mark Douglas,0.352941,Rosenthal Robert Daniel,0.432432,1136174,108401,167212
15,Marvin Ingelman,Sherman Mark Douglas,0.400000,Siegel Bruce Alan,0.375000,1136174,108401,167212
...,...,...,...,...,...,...,...,...
4073,Constantine Patamianos,Sherman Mark Douglas,0.285714,Kudu Investment Us,0.350000,1362190,108401,167212
4074,Edward Odonnell,Sherman Mark Douglas,0.285714,Rosenthal Robert Daniel,0.315789,1362190,108401,167212
4083,David Alexander,Walters Charles,0.266667,Hill Glenn Joseph,0.375000,1139053,169483,110186
4084,N A Pa Capital Advisors,Walters Charles,0.315789,Pantaleo Laura Ann,0.341463,1139053,169483,110186


In [66]:
match_firms_funds_owners = pd.merge(matches_firms_funds, stack_result, how='inner')

In [72]:
match_firms_funds_owners = match_firms_funds_owners.round(
    {"firm_confidence": 3, "fund_confidence": 3, "owners_firm_ratio": 3, 'owners_fund_ratio':3})


In [68]:
match_firms_funds_owners.columns


Index(['form_d_fund_id', 'cik_no_fund', 'fund', 'form_adv_firm_id',
       'crd_no_firm', 'matched_firm', 'firm_confidence', 'form_adv_fund_id',
       'crd_no_fund', 'matched_fund', 'fund_confidence', 'related_partners',
       'direct_owners_firm', 'owners_firm_ratio', 'direct_owners_fund',
       'owners_fund_ratio'],
      dtype='object')

In [73]:
match_firms_funds_owners = match_firms_funds_owners[['form_d_fund_id', 'cik_no_fund', 'fund', 'form_adv_fund_id', 'crd_no_fund', 'matched_fund', 'fund_confidence', 'form_adv_firm_id', 'crd_no_firm', 'matched_firm',
'firm_confidence', 'related_partners', 'direct_owners_firm', 'owners_firm_ratio', 'direct_owners_fund','owners_fund_ratio']]


In [75]:
match_firms_funds_owners = match_firms_funds_owners.sort_values(by='fund_confidence', ignore_index=True)


In [76]:
match_firms_funds_owners.to_csv('../output/matches_firms_funds_owners.csv')