In [None]:
import pandas as pd
import igraph as ig
from tests.VAT import load_firm_data_cw, load_link_data_cw, split_and_aggregate_edgelist
from config import DATA_PATH, FIGURE_PATH

## Banana production network (PN) extraction

### Load the countrywide nodelist and edgelist

In [None]:
# Value forwarded to both loaders as `contr_type`.
# NOTE(review): presumably selects the taxpayer/contributor category — confirm in tests.VAT.
contr_type = 'Sociedades'

# Countrywide firm table (nodes) and firm-to-firm link table (edges), re-indexed from 0.
cw_nodelist = load_firm_data_cw(contr_type=contr_type).reset_index(drop=True)
cw_edgelist = load_link_data_cw(contr_type=contr_type).reset_index(drop=True)

# Lookup tables built from every row of the nodelist (no per-year filtering here).
# When a key appears on several rows, the value from the last such row is kept.
ID_TO_SECTOR = {
    firm_id: sector
    for firm_id, sector in zip(cw_nodelist["firm_id"], cw_nodelist["ISIC4"])
}
SECTOR_TO_DESCR = {
    sector: description
    for sector, description in zip(cw_nodelist["ISIC4"], cw_nodelist["descrip_n4"])
}
# firm_id -> (ADM2, ADM1) tuple
ID_TO_ADM = {
    firm_id: (adm2, adm1)
    for firm_id, adm2, adm1 in zip(cw_nodelist["firm_id"], cw_nodelist["ADM2"], cw_nodelist["ADM1"])
}

### Choose banana PN IDs

In [None]:
# Customers of A0122 in sectors: G4630, G4620, G4772, A0163
def sector_neighbors_in_sector(cw_edgelist_y, focus_sector, neighb_sector, neigh_type):
    """
    Collect the IDs of firms in `neighb_sector` that trade with firms in `focus_sector`.

    Parameters
    ----------
    cw_edgelist_y : DataFrame
        Edgelist with the columns `id_<neigh_type>`, `sector_supplier` and
        `sector_customer`.
    focus_sector : sector code of the firms whose neighbors are looked up.
    neighb_sector : sector code the returned neighbors must belong to.
    neigh_type : "supplier" or "customer"
        Role played by the returned neighbors on the edge; the focus firms
        take the opposite role.

    Returns
    -------
    set
        Unique values of `id_<neigh_type>` over the matching edges.
    """
    # Focus firms sit on the other end of the edge from the neighbors.
    if neigh_type == "supplier":
        focus_role = "customer"
    else:
        focus_role = "supplier"

    neighbor_in_sector = cw_edgelist_y[f"sector_{neigh_type}"] == neighb_sector
    focus_in_sector = cw_edgelist_y[f"sector_{focus_role}"] == focus_sector

    matching_edges = cw_edgelist_y[neighbor_in_sector & focus_in_sector]
    return set(matching_edges[f"id_{neigh_type}"])

# Hex colour per sector code; looked up by ISIC4 code when colouring graph vertices.
# G4630/G4620 share one colour, as do C1702/C1701.
sector_color_map = {
    'A0122': "#ffe135",
    'G4630': "#3bba1e",
    'A0163': "#FFB922",
    'G4620': "#3bba1e",
    'C2220': "#2669AC",
    'G4669': "#303030",
    'G4772': "#158C92",
    'S9609': "#AA29B3",
    'C1702': "#9A7444",
    'C1701': "#9A7444",
    'H4923': "#A0A0A0",
}

In [None]:
# For every year 2007..2015: pick the PN firm IDs, build the directed PN graph,
# and write one GraphML file plus one global edgelist CSV per year.
# Variable names are kept stable because later cells read the state left by the
# last iteration (cw_edgelist_y, cw_nodelist_y, firm_PN_IDs, firm_PN_IDs_dict).
for year in range(2007, 2015 + 1):
    print(year)

    # ---------------- Restrict both tables to the current year
    cw_nodelist_y = cw_nodelist.loc[cw_nodelist['date'] == year].reset_index(drop=True).copy()
    cw_edgelist_y = cw_edgelist.loc[cw_edgelist['date'] == year].reset_index(drop=True).copy()

    # Tag both endpoints of every edge: first with the sector, then with the
    # (ADM2, ADM1) tuple. Column order: sector_supplier, sector_customer,
    # ADM_supplier, ADM_customer.
    # NOTE(review): the lookups come from the full multi-year nodelist, so these
    # labels are not year-specific — confirm that is intended.
    for attr_name, lookup in (('sector', ID_TO_SECTOR), ('ADM', ID_TO_ADM)):
        for role in ('supplier', 'customer'):
            cw_edgelist_y[f'{attr_name}_{role}'] = cw_edgelist_y[f'id_{role}'].map(lookup)

    # ---------------- Collect the firm ID sets that make up the PN
    # Keys are human-readable reasons; values are sets of firm IDs.
    firm_PN_IDs_dict = {
        # Every A0122 firm present in this year's nodelist
        'A0122 sociedades': set(cw_nodelist_y.loc[cw_nodelist_y['ISIC4'] == 'A0122', 'firm_id']),
    }

    # G4630 customers of A0122 firms are included for every year
    A0122customers_inG4630 = sector_neighbors_in_sector(
        cw_edgelist_y, focus_sector='A0122', neighb_sector='G4630', neigh_type='customer')
    firm_PN_IDs_dict['A0122 customers in G4630'] = A0122customers_inG4630

    if year == 2015:
        # Additional neighbour groups that are only added for 2015:
        # (neighbour sector, role of the neighbour relative to A0122 firms)
        extra_neighbor_specs = [
            ('G4620', 'customer'),
            ('G4772', 'customer'),
            ('A0163', 'supplier'),
            ('A0163', 'customer'),
            ('G4669', 'supplier'),
            ('S9609', 'supplier'),
            ('C2220', 'supplier'),
            ('H4923', 'supplier'),
        ]
        for neighb_sector, neigh_type in extra_neighbor_specs:
            firm_PN_IDs_dict[f'A0122 {neigh_type}s in {neighb_sector}'] = sector_neighbors_in_sector(
                cw_edgelist_y, focus_sector='A0122', neighb_sector=neighb_sector, neigh_type=neigh_type)
        # Disabled:
        #firm_PN_IDs_dict['G4630 suppliers in H4923'] = sector_neighbors_in_sector(cw_edgelist_y, focus_sector='G4630',neighb_sector='H4923', neigh_type='supplier')

        # Whole sectors taken from this year's nodelist, regardless of links
        for whole_sector in ('C1702', 'C1701'):
            firm_PN_IDs_dict[f'Firms in {whole_sector}'] = set(
                cw_nodelist_y.loc[cw_nodelist_y['ISIC4'] == whole_sector, 'firm_id'])
        # Disabled:
        #firm_PN_IDs_dict['Firms in H4923'] = set(cw_nodelist_y[(cw_nodelist_y['ISIC4'] == 'H4923')]['firm_id'])

        # G4630 suppliers of the A0122 customers found in G4630.
        # Computed but not added to the PN (the dict entry below is disabled).
        G4630suppliers_of_A0122customers_inG4630 = set(
            cw_edgelist_y.loc[
                cw_edgelist_y['id_customer'].isin(A0122customers_inG4630)
                & (cw_edgelist_y['sector_supplier'] == 'G4630'),
                'id_supplier',
            ]
        )
        #firm_PN_IDs_dict["G4630 suppliers of A0122's customers in G4630"] = G4630suppliers_of_A0122customers_inG4630

    # ---------------- Final count
    # Union of all groups, plus a "reason: size" summary for manual inspection
    firm_PN_IDs = set().union(*firm_PN_IDs_dict.values())

    explain = [f'{reason_key}: {len(id_set)}' for reason_key, id_set in firm_PN_IDs_dict.items()]
    #for subset in explain: print(subset)

    # ---------------- Building network file
    # Only the first two outputs of the split are used inside the loop.
    global_edgelist, pn_edgelist, _, _ = split_and_aggregate_edgelist(cw_edgelist_y, firm_PN_IDs, verbose=False)
    firm_PN_IDs_df = cw_nodelist_y[cw_nodelist_y['firm_id'].isin(firm_PN_IDs)]

    # Vertices: one per row of the filtered nodelist, named by firm_id
    pn_ig = ig.Graph(directed=True)
    pn_ig.add_vertices(firm_PN_IDs_df['firm_id'].tolist())

    # Copy every nodelist column onto the vertices as an attribute
    for col in firm_PN_IDs_df.columns:
        pn_ig.vs[col] = firm_PN_IDs_df[col].values

    #A0122_mask = firm_PN_IDs_df['ISIC4'] == 'A0122'
    #firm_PN_IDs_df.loc[A0122_mask, 'A0122_location'] = firm_PN_IDs_df.loc[A0122_mask, 'ADM1']
    #firm_PN_IDs_df['A0122_location'] = firm_PN_IDs_df['A0122_location'].fillna('other')
    #pn_ig.vs['A0122_location'] = firm_PN_IDs_df['A0122_location'].values

    # Sectors missing from sector_color_map map to NaN
    pn_ig.vs['custom_color'] = firm_PN_IDs_df['ISIC4'].map(sector_color_map).values

    # Edges are (supplier ID, customer ID) pairs.
    # NOTE(review): this relies on igraph resolving the pairs as vertex names,
    # which presumably requires string firm IDs — confirm the firm_id dtype.
    edges = list(zip(pn_edgelist['id_supplier'], pn_edgelist['id_customer']))
    pn_ig.add_edges(edges)

    # Edge attributes, one per PN edgelist column (date is cast to int)
    pn_ig.es['weight'] = pn_edgelist['weight'].values
    pn_ig.es['date'] = pn_edgelist['date'].values.astype(int)
    for edge_col in ('id_supplier', 'id_customer', 'sector_supplier', 'sector_customer'):
        pn_ig.es[edge_col] = pn_edgelist[edge_col].values

    print('N, L:', pn_ig.vcount(), pn_ig.ecount(), '\n')

    # Within-PN weighted strength (pn_s_*) and degree (pn_k_*) per direction
    for mode in ('in', 'out'):
        pn_ig.vs[f'pn_s_{mode}'] = pn_ig.strength(mode=mode, weights=pn_ig.es['weight'])
        pn_ig.vs[f'pn_k_{mode}'] = pn_ig.degree(mode=mode)

    # ---------------- Persist this year's outputs
    # NOTE(review): assumes DATA_PATH / 'firm-level' already exists — confirm.
    graphml_path = DATA_PATH / f'pn{year}.graphml'
    global_csv_path = DATA_PATH / 'firm-level' / f'global_edgelist_{year}.csv'
    ig.write(pn_ig, filename=graphml_path, format='graphml')
    global_edgelist.to_csv(global_csv_path, index=False, sep=',')

In [None]:
# Re-run the split on the state left by the last iteration of the year loop
# (year 2015), this time keeping all four returned edgelists. Unlike the call
# inside the loop, `verbose` is left at its default here.
(
    global_edgelist,
    pn_edgelist,
    pn_and_roe_edgelist,
    roe_edgelist,
) = split_and_aggregate_edgelist(cw_edgelist_y, firm_PN_IDs)

## Debugging

In [None]:
#cw_nodelist_y[cw_nodelist_y['firm_id'].isin(firm_PN_IDs_dict['A0122 customers in G4790'])]
#cw_nodelist_y[cw_nodelist_y['firm_id'].isin(firm_PN_IDs_dict['A0122 customers in G4772'])]

In [None]:
# --- Suppliers: which customer sectors does each supplier sell to?
sup_to_sectors = cw_edgelist_y.groupby("id_supplier")["sector_customer"].unique()
# Keep suppliers that sell to every sector in the required set.
# NOTE(review): "A" / "B" look like placeholder sector codes, and
# target_suppliers is not used further in this cell — confirm before relying on it.
required_customer_sectors = {"A", "B"}
supplier_serves_all = sup_to_sectors.apply(
    lambda sectors: required_customer_sectors.issubset(sectors)
)
target_suppliers = sup_to_sectors[supplier_serves_all].index

# --- Customers: which supplier sectors does each customer buy from?
cust_of_sectors = cw_edgelist_y.groupby("id_customer")["sector_supplier"].unique()
# Keep customers that buy from both A0122 and C1702
required_supplier_sectors = {"A0122", "C1702"}
customer_buys_from_all = cust_of_sectors.apply(
    lambda sectors: required_supplier_sectors.issubset(sectors)
)
target_customers_id = cust_of_sectors[customer_buys_from_all].index

# Nodelist rows of those customers: how many there are, and the five sectors
# with the most of them
target_customers = cw_nodelist_y[cw_nodelist_y['firm_id'].isin(target_customers_id)]
print(len(target_customers))
customers_per_sector = target_customers.groupby('ISIC4').count()
display(customers_per_sector.sort_values(by='firm_id', ascending=False).head())

#new_add = target_customers[target_customers['ISIC4'].isin(set(['G4630', 'G4620', 'C1079', 'A0163']))]['firm_id']
#loaded['data']['G4630 and G4620 customers of A0122, C1702, A0163'] = list(new_add.values)

#firm_PN_IDs = firm_PN_IDs.union(set(new_add))
#global_edgelist, pn_edgelist, pn_and_roe_edgelist, roe_edgelist = split_and_aggregate_edgelist(cw_edgelist_y, firm_PN_IDs)