In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from filter_BEA import filter_by_granularity
from dotenv import dotenv_values, find_dotenv
import os
config = dotenv_values(find_dotenv())
path_cleandata = os.path.abspath(config["CLEANDATA"]) + '\\'

In [12]:
# import data
bea_products = pd.read_pickle(path_cleandata + 'BEA_PCE.pkl')
inputoutput = pd.read_pickle(path_cleandata + 'use.pkl')

# two different versions of the pce index tables:
bea4 = filter_by_granularity(bea_products, target_granularity=4)
bea6 = filter_by_granularity(bea_products, target_granularity=6)

In [13]:
# all the products included in these versions
products_bea4 = list(set(bea4['product']))
products_bea6 = list(set(bea6['product']))
# all the NAICS descriptions (you can use either input or output sides i think)
naicsdescriptions = list(set(inputoutput['desc_I']))

In [14]:
# load the NLP model
bert = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# create the crosswalk
crosswalk = pd.DataFrame(columns=['product', 'NAICS_desc', 'similarity'])
for product in products_bea4:

    # get embeddings for the product category and NAICS sectors
    category_embedding = bert.encode(product, convert_to_tensor=True).reshape(1, -1)
    sector_embeddings = [bert.encode(sector, convert_to_tensor=True).reshape(1, -1) for sector in naicsdescriptions]

    # calculate cosine similarity
    similarities = [cosine_similarity(category_embedding, sector_embedding).item() for sector_embedding in sector_embeddings]

    # filter matches based on the similarity threshold
    # im taking anything with above 0.7 cosine similarity or the highest 3 matches if none above 70 exist
    matching_indices = [i for i, sim in enumerate(similarities) if sim > 0.7]
    if not matching_indices:
        matching_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:3]

    # append the new matches to the dataframe
    rows = pd.DataFrame({'product': [product] * len(matching_indices),
                         'NAICS_desc': [naicsdescriptions[i] for i in matching_indices],
                         'similarity': [similarities[i] for i in matching_indices]})
    crosswalk = pd.concat([crosswalk, rows], ignore_index=True)


  crosswalk = pd.concat([crosswalk, rows], ignore_index=True)


In [15]:
# merging with NAICS I-O table 

# merging with crosswalk
crosswalk_I = crosswalk[['product', 'NAICS_desc']].rename(columns={'product': 'product_I', 'NAICS_desc': 'desc_I'})
crosswalk_O = crosswalk[['product', 'NAICS_desc']].rename(columns={'product': 'product_O', 'NAICS_desc': 'desc_O'})

# merge each side together
add_naics_I = pd.merge(left=crosswalk_I, right=inputoutput, on='desc_I', how='left')
add_naics_O = pd.merge(left=crosswalk_O, right=inputoutput, on='desc_O', how='left')
IO_naics = pd.merge(left=add_naics_I, right=add_naics_O, on=['NAICS_I', 'desc_I', 'NAICS_O', 'desc_O', 'value'], how='inner')

# sum all values in the value column of the I-O matrix with the same product_I and product_O
IO_naics = IO_naics.sort_values(by=['product_I', 'product_O'])
IO_naics = IO_naics[['product_I', 'product_O', 'value']]
IO_naics['value'] = pd.to_numeric(IO_naics['value'])
IO_naics_grouped = IO_naics.groupby(['product_I', 'product_O'], as_index=False)['value'].sum(min_count=1)

# merge with BEA table (I)
IO_naics_I = pd.merge(left=IO_naics_grouped, right=bea4, left_on='product_I', right_on='product', how='inner')
IO_naics_I.drop(columns=['product'], inplace=True)
IO_naics_I.rename(columns={
    'value': 'IO_value',
    'quantityindex': 'quantityindex_I',
    'priceindex': 'priceindex_I'
}, inplace=True)

# merge with BEA table (O)
IO_naics_O = pd.merge(left=IO_naics_grouped, right=bea4, left_on='product_O', right_on='product', how='inner')
IO_naics_O.drop(columns=['product'], inplace=True)
IO_naics_O.rename(columns={
    'value': 'IO_value',
    'quantityindex': 'quantityindex_O',
    'priceindex': 'priceindex_O'
}, inplace=True)

IO_naics = pd.merge(left=IO_naics_I, right=IO_naics_O, on=['product_I', 'product_O', 'IO_value', 'date'], how='inner')

In [19]:
IO_naics

Unnamed: 0,product_I,product_O,IO_value,date,quantityindex_I,priceindex_I,quantityindex_O,priceindex_O
0,Accessories and parts,Accessories and parts,12099.0,1959-01-31,6.375,27.245,6.375,27.245
1,Accessories and parts,Accessories and parts,12099.0,1959-04-30,6.527,27.312,6.527,27.312
2,Accessories and parts,Accessories and parts,12099.0,1959-07-31,6.871,26.977,6.871,26.977
3,Accessories and parts,Accessories and parts,12099.0,1959-10-31,6.457,28.384,6.457,28.384
4,Accessories and parts,Accessories and parts,12099.0,1960-01-31,6.629,28.338,6.629,28.338
...,...,...,...,...,...,...,...,...
6302708,Tobacco (127),Tobacco (127),9614.0,2022-07-31,80.268,133.069,80.268,133.069
6302709,Tobacco (127),Tobacco (127),9614.0,2022-10-31,78.183,134.713,78.183,134.713
6302710,Tobacco (127),Tobacco (127),9614.0,2023-01-31,76.784,137.076,76.784,137.076
6302711,Tobacco (127),Tobacco (127),9614.0,2023-04-30,75.003,138.95,75.003,138.95


In [20]:
# save
IO_naics.to_pickle(path_cleandata + 'BEA4_IOuse_merged.pkl')