# Requirements to collapse requirements tables Note + Making Concordances Concord - Tony Gui

In [None]:
import pandas as pd
import numpy as np
from dotenv import dotenv_values, find_dotenv
from statsmodels.tsa.api import VAR
import os
import re
from data_cleaning_functions import requirements_clean, concordance_PCE_clean, \
    find_intermediate_industries, concordance_PCQ_clean, get_sales_from_make_matrix, clean_make_matrix, \
    get_demand_shock_from_shaipro_output, get_expenditure_weights_from_shapiro_outputs,plot_shapiro_graph_from_shapiro_ouput,clean_bea_PQE_table

from pathlib import Path
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore', category=FutureWarning, message=".*concatenation with empty or all-NA entries is deprecated.*")

# Project root is the parent of the current working directory; every data
# folder below is derived from it as a plain string.
script_dir = str(Path().resolve().parent)
file_path = f"{script_dir}/"
shapiro_file = f"{file_path}Shapiro"
raw_data_path = f"{file_path}raw_bea_data"


# 1. Load in Requirements Matrix 

In [None]:
# Read the '2017' sheet of the requirements workbook and pass it straight to
# the project cleaning helper.
requirements = requirements_clean(
    pd.read_excel(
        os.path.join(raw_data_path, 'IxI_TR_2017_PRO_Det.xlsx'),
        sheet_name='2017',
    )
)
requirements

# 2. Calculate Delta

In [None]:
# Transpose the cleaned table, turn missing cells into zeros, then compute
# delta = I - inv(requirements).
requirements = requirements.T
with pd.option_context("future.no_silent_downcasting", True):
    requirements = requirements.fillna(0).infer_objects(copy=False)
n_industries = len(requirements)
delta = np.eye(n_industries) - np.linalg.inv(requirements)

# 2 (Making Concordances Concord) Adding Scrap, Used and secondhand goods and ROW adjustments to Delta

In [None]:
# These 4 industries are found in the concordance table but not in the
# requirements table, so they are added manually with all-zero rows and columns.
concordance_but_not_requirments = ["Scrap", "Used and secondhand goods", "Rest of the world adjustment", "noncomparable imports"]
extra_labels = pd.Index(concordance_but_not_requirments)
extended_rows = requirements.index.append(extra_labels).str.lower()
extended_columns = requirements.columns.append(extra_labels).str.lower()

# Label the raw array first, then grow it to the extended (lower-cased) labels.
delta = pd.DataFrame(delta, index=requirements.index, columns=requirements.columns)
delta = delta.reindex(index=extended_rows, columns=extended_columns, fill_value=0)

delta

# 2.5b Making Negative Values in Delta Zero

In [None]:
# Share of negative entries in delta before (x) and after (y) clamping them
# to zero. The denominator is the real number of cells in delta; a hard-coded
# 402**2 no longer matches once extra industries have been appended.
total_cells = delta.size
negative_count = (delta < 0).sum().sum()
x = negative_count / total_cells
delta[delta < 0] = 0
negative_count_new = (delta < 0).sum().sum()
y = negative_count_new / total_cells

# 2.5b Force negatives to be zero but adjust row sums to be same as before

In [None]:
# V = delta.sum(axis=0)
# P = delta[delta > 0].sum()
# delta.loc['norm'] = V/P
# last_values = delta.iloc[-1]
# delta[delta < 0] = 0
# delta = delta.iloc[:-1].div(last_values)
# delta

# Dealing with Intermediate Industries - Making Concordances Concord Section 3.3.1. - Operationalizing Industries without Products

# Step 1. Making Concordances Concord

In [None]:
use_table = pd.read_excel(
    os.path.join(raw_data_path, "Use_SUT_Framework_2017_DET.xlsx"),
    sheet_name="2017",
)
# peq_concordance = pd.read_excel(os.path.join(raw_data_path, "PEQBridge_2017_DET.xlsx"), sheet_name="2017")

# Project helper: returns all industries with zero PCE.
intermediate_industries = find_intermediate_industries(use_table)

# peq_concordance = concordance_PCQ_clean(peq_concordance)
# intermediate_industries = intermediate_industries[~intermediate_industries['Industry'].isin(peq_concordance['Industries'])]

# Keep just the first column (still a DataFrame), ordered by industry name.
intermediate_industries = intermediate_industries.iloc[:, [0]].sort_values(by="Industry")
intermediate_industries

# Steps 2 - 5. Making Concordances Concord - Loop over Intermediate Industries and Remove

In [None]:
# requirements = pd.read_excel(os.path.join(raw_data_path, 'IxI_TR_2017_PRO_Det.xlsx'), sheet_name='2017')
# requirements = requirements_clean(requirements, wide=True)

# Remove the intermediate industries one at a time. Each pass builds phi_i: an
# identity matrix over the industries still present, minus the column of the
# industry being removed, whose row is replaced by that industry's normalised
# delta row. big_lambda accumulates the product of all phi_i.
big_lambda = np.eye(len(delta))  # identity matrix that becomes the final output
big_lambda = pd.DataFrame(big_lambda)
big_lambda = big_lambda.set_index(delta.index)
big_lambda.columns = delta.index
delta_industries = delta.index  # industries still present; shrinks by one per pass
removed = []  # every industry removed so far

i = 0
for row in intermediate_industries.iloc[::-1].itertuples():  # looping backwards
    i += 1
    current = row.Industry  # industry removed in this pass

    # Identity over the industries that are still present (i - 1 already removed).
    phi_i = np.eye(len(delta) - (i - 1))
    phi_i = pd.DataFrame(phi_i)
    phi_i = phi_i.set_index(delta_industries)
    phi_i.columns = delta_industries

    # Row of delta for the current industry, without the columns of industries
    # removed earlier and without its own column (so it is excluded from the sum).
    current_intermediate_use_table = delta.copy()
    current_intermediate_use_table.drop(columns=removed, inplace=True)
    current_intermediate_use_table = current_intermediate_use_table.loc[[current]]
    current_intermediate_use_table.drop(columns=[current], inplace=True)
    current_intermediate_use_table = current_intermediate_use_table.astype(float)
    current_industry_sum = current_intermediate_use_table.loc[current].sum()

    # Normalise the row to shares; skipped when the row sums to zero to avoid 0/0.
    if current_industry_sum != 0:
        current_intermediate_use_table.loc[current] = (
            current_intermediate_use_table.loc[current] / current_industry_sum
        )

    # Kept alphabetically sorted as before (the frame stays available for inspection).
    current_intermediate_use_table = current_intermediate_use_table[sorted(current_intermediate_use_table.columns)]

    # remove the current industry from intermediate list
    intermediate_industries = intermediate_industries[intermediate_industries['Industry'] != current]
    # drop the current industry column from phi_i and from the running industry index
    phi_i = phi_i.drop(columns=current)
    delta_industries = delta_industries.drop(current)

    removed.append(current)

    # Write the shares into phi_i's row for the current industry, matched by
    # column LABEL. A positional write of the alphabetically sorted row would
    # misplace shares whenever phi_i's columns are not themselves in
    # alphabetical order (the manually appended industries sit at the end).
    phi_i.loc[current] = (
        current_intermediate_use_table.loc[current]
        .reindex(phi_i.columns)
        .to_numpy(dtype=np.float64)
    )
    big_lambda_old = big_lambda
    big_lambda = big_lambda @ phi_i

# Step 6. Making Concordances Concord - Converting IO matrix from sales shares to dollars

In [None]:
# Build a sales matrix with the same column labels as delta and use it to
# express the IO matrix in dollars.
make_matrix = pd.read_excel(os.path.join(raw_data_path, "Supply_2017_DET.xlsx"), sheet_name="2017")
sales_vector = get_sales_from_make_matrix(make_matrix) # returns the sales for each industry

# these industries are not in the make matrix so i will add them manually with zero entries to preserve df sizes
industries_not_in_make_matrix = ["state and local government passenger transit", "state and local government electric utilities", \
                      "secondary smelting and alloying of aluminum", "federal electric utilities"]
not_in_make_matrix = pd.DataFrame({'Industries': industries_not_in_make_matrix, 'Sales': [0,0,0,0]})
sales_vector = pd.concat([sales_vector, not_in_make_matrix], ignore_index=True)
sales_vector.set_index('Industries', inplace=True)

# repeat the sales vector n times to make a sales matrix
# range(1, len(delta)) produces len(delta) - 1 copies; the 'Sales' column that
# comes in through the concat below brings the total to len(delta) columns.
sales_repeated = pd.DataFrame({f'{i}': sales_vector['Sales'].values for i in range(1, len(delta))})
sales_repeated.set_index(sales_vector.index, inplace=True)
sales_matrix = pd.concat([sales_vector, sales_repeated], axis=1)
# Positional relabel: the k-th column takes the k-th label of delta.columns.
sales_matrix.columns = delta.columns

# Y is IO matrix in dollars 
# Element-wise product; pandas aligns both frames on row and column labels first.
# NOTE(review): rows only line up if sales_vector's industry labels match delta's index — confirm.
Y = delta * sales_matrix 
Y

# Step 7. Making Concordances Concord - Value Added for the Economy

In [None]:
# Row totals of Y as a one-column frame named 'Sales', so the subtraction
# below lines up with sales_vector's 'Sales' column.
sum_Y = Y.sum(axis=1).to_frame(name='Sales')
VA_pre_transformation = (sales_vector - sum_Y).sum()

VA_pre_transformation

# Step 8. Making Concordances Concord - IO Matrix in USD

In [None]:
# Transformed IO matrix and sales vector in USD: both are pre-multiplied by
# big_lambda transposed; the matrix is also post-multiplied by big_lambda.
lambda_transposed = big_lambda.T
Y_tilde = lambda_transposed @ Y @ big_lambda
sales_tilde = lambda_transposed @ sales_vector

# Economy-wide value added after the transformation, for comparison with the
# pre-transformation figure.
sum_Y_tilde = Y_tilde.sum(axis=1).to_frame(name='Sales')
VA_post_transformation = (sales_tilde - sum_Y_tilde).sum()
VA_post_transformation

In [None]:
"""Pre transformation and post transformation are not the same"""

# Gap between post- and pre-transformation economy-wide value added; it would
# be zero if the transformation preserved value added exactly.
differnece = VA_post_transformation - VA_pre_transformation
differnece

# Step 9. Making Concordances Concord - Calculating New Delta

In [None]:
# Convert the dollar IO matrix back to shares by scaling every row of Y_tilde
# with the reciprocal of that row's transformed sales.
# NOTE(review): a zero in sales_tilde gives an infinite reciprocal here — confirm
# that cannot occur or is handled downstream.
recip_sales = 1/sales_tilde
# Same replication as for the sales matrix: len(delta) - 1 copies plus the
# original column gives len(delta) columns.
sales_tilde_repeated = pd.DataFrame({f'{i}': recip_sales['Sales'].values for i in range(1, len(delta))})
sales_tilde_repeated.set_index(recip_sales.index, inplace=True)
sales_tilde_matrix = pd.concat([recip_sales, sales_tilde_repeated], axis=1)
# Positional relabel with delta's column labels.
sales_tilde_matrix.columns = delta.columns

# Element-wise product, aligned on row and column labels.
delta_tilde = Y_tilde * sales_tilde_matrix
# From this point on `delta` names the transformed matrix.
delta = delta_tilde
delta

# End of Making Concordances Concord - Back to Requirements to Collapse Requirements Table

# 3. Merge Concordance with Delta

In [None]:
# Read the '2017' sheet of the PCE bridge workbook and clean it with the
# project helper.
concordance = concordance_PCE_clean(
    pd.read_excel(
        os.path.join(raw_data_path, "PCEBridge_2017_DET.xlsx"),
        sheet_name="2017",
    )
)
concordance

In [None]:
# Some industries in the requirements matrix still have no match in the PCE
# bridge (they match to PEQ instead), so they are dropped here before the
# table is expressed in products. Open question from the author: it is not
# clear whether dropping them affects the maths applied to delta later on.
industries_in_requirments = set(delta.columns)
indsutries_in_concordance = set(concordance["PCE Bridge Industries"])
industires_not_in_concordance = industries_in_requirments.difference(indsutries_in_concordance)
delta = delta.drop(
    index=industires_not_in_concordance,
    columns=industires_not_in_concordance,
    errors='ignore',
)
delta

In [None]:
# Re-express delta in products: first the columns, then the rows.
# Number of concordance rows each industry appears in.
in_num_products = concordance['PCE Bridge Industries'].value_counts()

# Empty frame: rows are delta's industries, columns are the product labels of
# the concordance (one column per concordance row).
delta_product_columns = pd.DataFrame(index=delta.index, columns=concordance['PCE Bridge Products'])

# Fill each product column from the delta column(s) of the industries bridged
# to that product, divided by how many concordance rows those industries have.
# NOTE(review): `industry` is an array of every industry mapped to the product;
# verify the assignment does what is intended when there is more than one.
for column in delta_product_columns: 
    industry = concordance.loc[concordance['PCE Bridge Products'] == column, 'PCE Bridge Industries'].values
    delta_product_columns[column] = delta[industry] / in_num_products[industry]

# Expand the rows: each industry row is divided by its concordance row count
# and emitted once for every product that industry maps to.
delta_product_cr = pd.DataFrame(columns=delta_product_columns.columns)
lst_for_industry = []  # source industry of every emitted product row, in order
for row in delta_product_columns.index:
    final_value = delta_product_columns.loc[row] / in_num_products[row]
    products = concordance[concordance['PCE Bridge Industries'] == row]['PCE Bridge Products'].tolist()
    for product in products: 
        lst_for_industry.append(row)
        final_value_row = pd.DataFrame(final_value).T
        final_value_row.index = [product]
        
        delta_product_cr = pd.concat([delta_product_cr, final_value_row])

# Attach the source industry as an extra 'industry' column (used later as a merge key).
industry = pd.DataFrame(lst_for_industry, columns=['industry'])
industry.index = delta_product_cr.index
delta_product_cr = pd.concat([delta_product_cr, industry], axis = 1)
delta_product_cr

# 4. Collapse columns

In [None]:
# Sum together columns that carry the same label: transpose, group the rows by
# the original column labels, sum, and transpose back.
delta_products = delta_product_cr.T.groupby(delta_product_cr.columns).sum().T
delta_products

# 5. Make Matrix to Sales

In [None]:
# Re-read the supply workbook and derive per-industry sales with the project helper.
supply_workbook = os.path.join(raw_data_path, "Supply_2017_DET.xlsx")
make_matrix = pd.read_excel(supply_workbook, sheet_name='2017')
sales = get_sales_from_make_matrix(make_matrix)
sales

# 6. Merge concordance with sales 

In [None]:
# Attach sales to every concordance row whose industry has a sales figure
# (default inner join), then keep only product, industry and sales.
product_sales = concordance.merge(
    sales,
    left_on='PCE Bridge Industries',
    right_on='Industries',
)[["PCE Bridge Products", "Industries", 'Sales']]
product_sales

# 7. Group sum of Sales for every Product

In [None]:
# How many product categories each industry is spread over.
rows_per_industry = product_sales['Industries'].value_counts()
product_sales['In#Products'] = product_sales['Industries'].map(rows_per_industry)
# Split each industry's sales evenly across its products, then total per product.
product_sales["Ratio_in_Product"] = product_sales['Sales'].div(product_sales['In#Products'])
product_sales['Sales_Sum'] = (
    product_sales
    .groupby('PCE Bridge Products')['Ratio_in_Product']
    .transform('sum')
)
product_sales

# 8. Sales Share 

In [None]:
# An industry's share of its product's total (evenly split) sales.
product_sales['Sale Share'] = product_sales["Ratio_in_Product"].div(product_sales["Sales_Sum"])
# Rename the key columns in one pass so they match delta_products' merge keys.
product_sales = product_sales.rename(
    columns={'Industries': 'industry', 'PCE Bridge Products': 'product'}
)
product_sales

# 9. Merge Sale shares and Delta

In [None]:
# Turn the product labels into a regular 'product' column so they can be a merge key.
df_reset = delta_products.reset_index()
delta_products = df_reset.rename(columns={'index': 'product'})

# Bring in 'Sale Share' for every (industry, product) pair, then discard the
# helper columns that were only needed to compute it.
helper_columns = ['Sales_Sum', "Ratio_in_Product", "In#Products", "Sales", "industry"]
delta_products_saleshare = (
    delta_products
    .merge(product_sales, how='inner', on=['industry', 'product'])
    .drop(columns=helper_columns)
)
# The first remaining column becomes the index again.
first_column = delta_products_saleshare.columns[0]
delta_products_saleshare = delta_products_saleshare.set_index(first_column)
delta_products_saleshare

# 10. weightTimesDeltaValue + 11.

In [None]:
# Collapse the rows to products: weight every entry by its sale share and sum
# the weighted entries per (row product, column product) pair.
# Convert from Wide to Long
delta_final = delta_products_saleshare.reset_index().melt(id_vars=[delta_products_saleshare.index.name, "Sale Share"],
                                var_name="Column Products", value_name="value")

# Just Renaming and Reordering
delta_final = delta_final.rename(columns={delta_products_saleshare.index.name: "Row Products"})
delta_final = delta_final[['Row Products', 'Column Products', 'value', 'Sale Share']]
# Calculating weightTimesDeltaValue
delta_final["weightTimesDeltaValue"] = delta_final["value"] * delta_final["Sale Share"]

# Sum weightTimesDeltaValue grouping by Row Products AND Column Products
# (the other numeric columns are summed too, but only weightTimesDeltaValue is used below)
delta_final = delta_final.groupby(['Row Products', 'Column Products']).sum()
delta_final
# Convert Back to Wide Format 
# Each (row, column) pair is unique after the groupby, so pivot_table's
# aggregation leaves the summed values unchanged.
delta_final = delta_final.pivot_table(values='weightTimesDeltaValue', index='Row Products', columns='Column Products')

# Removes Index and Column Names cuz it looks better. Both Index and Columns are simply Product Categories now
delta_final.columns.name = None 
delta_final.index.name = None 


# These product labels are removed from both axes; errors="ignore" tolerates any that are absent.
labels_to_drop = ["government employees' expenditures abroad", "private employees' expenditures abroad","u.s. travel outside the united states","u.s. student expenditures"]
delta_final = delta_final.drop(index=labels_to_drop, columns=labels_to_drop, errors="ignore")

delta_final

# Fixing Products with no price, quantity or expenditure data 

In [None]:
def _load_bea_monthly_table(file_name, value_name, products):
    """Load one BEA monthly workbook as a cleaned long table.

    Args:
        file_name: Workbook name inside ``raw_data_path``.
        value_name: Name of the value column ("Quantities", "Prices" or
            "Expenditures"); also passed to ``clean_bea_PQE_table``.
        products: Product labels to keep; all other rows are discarded.

    Returns:
        DataFrame with duplicates removed, the '---' placeholder replaced by
        0, the value column cast to float and remaining NaNs filled with 0.
    """
    table = pd.read_excel(os.path.join(raw_data_path, file_name))
    table = clean_bea_PQE_table(table, value_name, long=True)
    table = table[table['products'].isin(products)]
    table = table.drop_duplicates()
    table[value_name] = table[value_name].replace('---', 0).astype(float)
    return table.fillna(0)


# The same load/clean steps apply to all three BEA tables, restricted to the
# products that survive in delta_final.
bea_quantities = _load_bea_monthly_table('BEA Monthly Quantities.xlsx', "Quantities", delta_final.index)
bea_prices = _load_bea_monthly_table('BEA Monthly Prices.xlsx', "Prices", delta_final.index)
bea_expenditures = _load_bea_monthly_table('BEA Monthly Expenditures.xlsx', "Expenditures", delta_final.index)

# One row per (product, date) with all three measures side by side.
bea_PQE_merged = pd.merge(left=bea_quantities, right=bea_prices, on=['products', 'date'], how='outer')
bea_PQE_merged = pd.merge(left=bea_PQE_merged, right=bea_expenditures, on=['products', 'date'], how='outer')

# dropping this because 2024-10 has no data yet
bea_PQE_merged = bea_PQE_merged.dropna(subset=['Expenditures'])

# Dates present in all three tables, in ascending order.
dates = sorted(set(bea_expenditures["date"]) & set(bea_prices["date"]) & set(bea_quantities["date"]))

cols_to_check = ['Quantities', 'Prices', 'Expenditures']

# Rows where at least one of the three measures is missing or zero.
missing_or_zero = bea_PQE_merged[cols_to_check].isnull() | (bea_PQE_merged[cols_to_check] == 0)
products_with_zero_somwhere = bea_PQE_merged[missing_or_zero.any(axis=1)]

# For every date, the products flagged above (an empty list when none are).
# One groupby pass replaces re-filtering the whole frame once per date.
bad_products_by_date = {
    bad_date: bad_products.tolist()
    for bad_date, bad_products in products_with_zero_somwhere.groupby('date')['products']
}
all_bad_products_dict = {date: bad_products_by_date.get(date, []) for date in dates}

# 12. Calculate Gamma

In [None]:
# gamma: the row sums of delta_final as a single numeric 'Row_Sum' column,
# indexed by product.
row_sums = delta_final.sum(axis=1)
gamma = row_sums.to_frame(name='Row_Sum').apply(pd.to_numeric, errors='coerce')
gamma

# 13. Calculate Omega

In [None]:
# omega: delta_final with every row divided by its row sum (gamma).
# Join gamma onto delta_final by product label; the join key comes back as the
# first column and is restored as the index.
delta_gamma = delta_final.merge(gamma, left_on=gamma.index, right_on=delta_final.index)
delta_gamma = delta_gamma.set_index(delta_gamma.columns[0])
# The last column is Row_Sum. Zero sums become NaN so those rows divide to NaN
# rather than inf. replace() returns a new Series, so nothing is written back
# through a slice of delta_gamma (no chained-assignment warning or side effect).
row_sums = delta_gamma.iloc[:, -1].replace(0, np.nan)
omega = delta_gamma.iloc[:, :-1].div(row_sums, axis=0)
omega.columns.name = None
omega.index.name = None
omega = omega.apply(pd.to_numeric, errors='coerce')
omega

# Different gamma and omega for missing products

In [None]:
# Maps each date to [gamma, omega] recomputed with that date's missing
# products switched off.
gamma_omega_yearly = {}

for date, zero_products in all_bad_products_dict.items():
    # zero_products: products without complete time series data on this date
    current_gamma = gamma.copy()
    current_delta = delta_final.copy()

    # A missing product gets a zero row sum in gamma ...
    is_missing = current_gamma.index.isin(zero_products)
    current_gamma.loc[is_missing, 'Row_Sum'] = 0

    # ... and its column in delta is zeroed before the rows are re-normalised.
    for product in zero_products:
        if product in delta_final.columns:
            current_delta[product] = 0

    # Re-normalise every row by its new total.
    current_row_sum = current_delta.sum(axis=1)
    current_omega = current_delta.div(current_row_sum, axis=0)
    current_omega.columns.name = None
    current_omega.index.name = None
    current_omega = current_omega.apply(pd.to_numeric, errors='coerce')

    gamma_omega_yearly[date] = [current_gamma, current_omega]

# Start of Doing our Supply/demand contribution graph properly (Which influencer has the most influence?)

In [None]:
# Load Shapiro's Stata output and the Haver-code concordance, then derive
# expenditure weights and demand shocks with the project helpers.
shapiro_code_output = pd.read_excel(os.path.join(shapiro_file, 'shaprio_stata_output_excel.xlsx'))
haver_code_concordance = pd.read_excel(os.path.join(shapiro_file, 'Haver Codes to Product Names.xlsx'))
# Lookup from 'Shapiro Price Name' to 'PCE Category'.
mapping = dict(zip(haver_code_concordance['Shapiro Price Name'], haver_code_concordance['PCE Category']))
""" I dont know why but in Shapiro's output the expenditure for water transport is titled s"""
# Manual entry for the oddly named series described above.
mapping["s"] = "Water transportation (65)"
expenditure_weights = get_expenditure_weights_from_shapiro_outputs(shapiro_code_output, mapping)
demand_shock = get_demand_shock_from_shaipro_output(shapiro_code_output, mapping)
# shapiro_graph = plot_shapiro_graph_from_shapiro_ouput(shapiro_code_output, "Shapiro Graph")

# Find Sales

In [None]:
# calculate sales in product and time period
# For every date: sales = inv(I - omega.T @ diag(gamma)) @ expenditures.
# Typed empty frame that the per-date results are appended to.
sales = pd.DataFrame({'date': pd.Series(dtype='datetime64[ns]'),
                   'products': pd.Series(dtype='str'),
                   'sales': pd.Series(dtype='float')})

for date in dates:

    # Rebinds the module-level names gamma and omega to this date's versions;
    # later cells read gamma.index after the loop.
    gamma = gamma_omega_yearly.get(date)[0]
    omega = gamma_omega_yearly.get(date)[1]
 
    # filter expenditures for the current date
    expenditures_date = bea_expenditures[bea_expenditures['date'] == date][['products', 'Expenditures']].set_index('products')
    expenditures_date = expenditures_date.sort_index()
    expenditures_date = expenditures_date.apply(pd.to_numeric, errors='coerce')

    gamma_series = gamma["Row_Sum"]
    diag_matrix = np.diag(gamma_series)

    x = np.identity(len(omega)) - (omega.T @ diag_matrix)

    # NOTE(review): the product order of expenditures_date (sorted by label) is
    # matched to the matrix positionally — confirm omega and gamma use the same
    # sorted product order.
    sales_date = np.linalg.inv(x) @ expenditures_date

    sales_date['date'] = date
    sales_date['products'] = expenditures_date.index
    sales_date.rename(columns={'Expenditures': 'sales'}, inplace=True)

    sales = pd.concat([sales, sales_date], ignore_index=True)

# Find Prices of Intermediates

In [None]:
# calculate prices of intermediates (we use cobb douglas production with intermediates)
# For each date and product: exp(omega row of the product @ log prices).
intermediates = pd.DataFrame({'date': pd.Series(dtype='datetime64[ns]'),
                   'products': pd.Series(dtype='str'),
                   'intermediates': pd.Series(dtype='float')})
for date in dates:
    prices_date = bea_prices[bea_prices['date'] == date][['products', 'Prices']].set_index('products')
    prices_date = prices_date.sort_index()

    # Rebinds the module-level names gamma and omega to this date's versions.
    gamma, omega = gamma_omega_yearly.get(date)

    # Log prices depend only on the date, so they are computed once per date
    # instead of once per product. log(0) = -inf is mapped to 0 so a zero
    # price contributes nothing to the weighted sum.
    log_prices = np.log(prices_date['Prices'])
    log_prices.replace(-np.inf, 0, inplace=True)

    for i in gamma.index:
        intermediates.loc[len(intermediates)] = [date, i, np.exp(omega.loc[i] @ log_prices)]

# Find Price of Value Added

In [None]:
# calculate prices of value added
# For each date and product:
#   value_added = exp( (log(price) - gamma * log(intermediates)) / (1 - gamma) )
value_added = pd.DataFrame({'date': pd.Series(dtype='datetime64[ns]'),
                   'products': pd.Series(dtype='str'),
                   'value_added': pd.Series(dtype='float')})
for date in dates:
    # filter prices for the current date
    prices_date = bea_prices[bea_prices['date'] == date][['products', 'Prices']].set_index('products')
    prices_date = prices_date.sort_index()

    # filter intermediates for the current date
    intermediates_date = intermediates[intermediates['date'] == date][['products', 'intermediates']].set_index('products')
    intermediates_date = intermediates_date.sort_index()

    # Rebinds the module-level name gamma to this date's version.
    gamma = gamma_omega_yearly.get(date)[0]    
    gamma_series = gamma["Row_Sum"]

    # Series arithmetic aligns on product labels.
    # NOTE(review): a gamma of exactly 1 divides by zero here — confirm it cannot occur.
    value_added_date = np.exp((1/(1 - gamma_series.sort_index()))*(np.log(prices_date['Prices']) - gamma_series.sort_index() * np.log(intermediates_date['intermediates'])))

    # The unnamed result column (0) becomes 'value_added'; an index column named
    # "index", if reset_index produced one, becomes 'products'.
    value_added_date = value_added_date.reset_index().rename(columns={0: 'value_added'})
    value_added_date['date'] = date
    value_added_date.rename(columns={"index": "products"}, inplace=True)

    value_added = pd.concat([value_added, value_added_date], ignore_index=True)

# Sales and Value Added VAR

In [None]:
# Fit a 12-lag VAR on the log-differences of value added and sales for each
# product and collect the residuals.
lags = 12
residuals_part = []

# The merge does not depend on the product, so it is done once here instead of
# being repeated on every loop iteration.
value_added_and_sales = pd.merge(left=value_added, right=sales, on=['products', 'date'], how='inner')

for product in gamma.index:
    product_bad_dates = products_with_zero_somwhere[products_with_zero_somwhere["products"] == product]
    calculated = value_added_and_sales[value_added_and_sales['products'] == product][['date', 'value_added', 'sales']].sort_values(['date'])
    calculated = calculated.set_index('date')

    if not product_bad_dates.empty:
        # remove the dates on which this product has missing or zero data
        calculated = calculated[~calculated.index.isin(product_bad_dates['date'])]

    # Month-over-month log changes.
    calculated['value_added'] = np.log(calculated['value_added']).diff()
    calculated['sales'] = np.log(calculated['sales']).diff()

    calculated.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

    # Reindex onto a gap-free month-start range.
    # NOTE(review): months removed above come back as NaN rows here — confirm
    # VAR copes with them as intended.
    full_index_calc = pd.date_range(start=calculated.index.min(), end=calculated.index.max(), freq='MS')
    calculated = calculated.reindex(full_index_calc)

    model_calculated = VAR(calculated)
    result_calculated = model_calculated.fit(lags)

    residuals_calculated = result_calculated.resid.reset_index()
    residuals_calculated['products'] = product

    residuals_part.append(residuals_calculated)

# Stack all products, name the residual columns, and keep one row per (date, product).
IO_residuals = pd.concat(residuals_part, ignore_index=True)
IO_residuals.rename(columns={'index': 'date', 'value_added': 'residual_value_added', 'sales': 'residual_sales'}, inplace=True)
IO_residuals = IO_residuals.groupby(['date', 'products']).sum(min_count=1).reset_index()
IO_residuals = IO_residuals.sort_values(['date', 'products'])

# Price and Quantity VAR

In [None]:
# Fit a 12-lag VAR on the log-differences of prices and quantities for each
# product and collect the residuals.
lags = 12
residual_temp = []

for product in gamma.index:

    original = bea_PQE_merged[bea_PQE_merged['products'] == product][['date', 'Prices', 'Quantities']].sort_values(['date'])
    original = original.set_index('date')

    original.dropna(inplace=True)

    # Month-over-month log changes.
    original['Prices'] = np.log(original['Prices']).diff()
    original['Quantities'] = np.log(original['Quantities']).diff()
     

    # Infinite changes (from zero levels) and the first, undefined difference are dropped.
    original.replace([np.inf, -np.inf], np.nan, inplace=True)
    original.dropna(inplace=True)

    # Reindex onto a gap-free month-start range.
    # NOTE(review): any month dropped above comes back as a NaN row here —
    # confirm VAR copes with that as intended.
    full_index = pd.date_range(start=original.index.min(), end=original.index.max(), freq='MS')
    original = original.reindex(full_index)

    model_original = VAR(original)
    result_original = model_original.fit(lags)

    residuals_original = result_original.resid.reset_index()
    residuals_original['products'] = product
    
    residual_temp.append(residuals_original)
    
# Stack all products, name the residual columns, and keep one row per (date, product).
residuals_normal = pd.concat(residual_temp, ignore_index=True)
residuals_normal.rename(columns={'index': 'date', 'Prices': 'residual_prices', 'Quantities': 'residual_quantities'}, inplace=True)
residuals_normal = residuals_normal.groupby(['date', 'products']).sum(min_count=1).reset_index()
residuals_normal = residuals_normal.sort_values(['date', 'products'])

# a) Classification of Price of Value Added and Real Production

In [None]:
# Classification rule: residuals with the same sign (product >= 0) count as
# demand, residuals with opposite signs (product < 0) count as supply.
price_quantity_product = residuals_normal['residual_prices'] * residuals_normal['residual_quantities']
residuals_normal['majority_demand'] = (price_quantity_product >= 0).astype(int)
residuals_normal['majority_supply'] = (price_quantity_product < 0).astype(int)

value_added_sales_product = IO_residuals['residual_value_added'] * IO_residuals['residual_sales']
IO_residuals['majority_demand_a'] = (value_added_sales_product >= 0).astype(int)
IO_residuals['majority_supply_a'] = (value_added_sales_product < 0).astype(int)

In [None]:
# Wide demand indicators: one row per product, one column per date.
# D_IO comes from the value-added/sales residuals, D_shapiro from the
# price/quantity residuals.
D_IO = IO_residuals.pivot(index='products', columns='date', values='majority_demand_a')
D_shapiro = residuals_normal.pivot(index='products', columns='date', values='majority_demand')
D_IO

# b) Calculate the Influence of D_i_t onto Prices

In [None]:
# Maps each date to a product-by-product matrix derived from delta_final with
# that date's missing products zeroed out.
big_o_yearly = {}

for date in gamma_omega_yearly:
    zero_products = all_bad_products_dict.get(date) # products that have no time series data for current month

    current_delta = delta_final.copy()

    # Zero the delta columns of the missing products before inverting.
    for product in zero_products:
        if product in delta_final.columns: 
            current_delta[product] = 0 # rescale omega for each month so it sums to 1 

    current_delta = current_delta.values.astype(float)
    # NOTE(review): the matrix is inverted twice in a row, so the result is
    # I - current_delta (up to rounding) rather than its inverse — confirm this is intended.
    big_o_current = np.linalg.inv(np.identity(len(current_delta)) - current_delta)
    big_o_current = np.linalg.inv(big_o_current)
    big_o_current = pd.DataFrame(big_o_current, index=delta_final.index, columns=delta_final.index)
    big_o_current.index.name = None

    # Zero the same products' columns again in the final matrix.
    for product in zero_products:
        if product in big_o_current.columns: 
            big_o_current[product] = 0 # rescale omega for each month so it sums to 1 

    big_o_yearly[date] = big_o_current      

# D) Calculate theta_t 

In [None]:
# One row per product, one column per date.
value_added_pivot = value_added.pivot(index='products', columns='date', values='value_added')

# Dictionary that stores all the theta for each time period
# (keyed by the string form of the date; the stored value is the normalised matrix `lam`)
all_lambda = {}

for series_name, series in value_added_pivot.items():
    """This function performs the steps outlined in 3d) to 3ef). It loops through P_VA for all time periods 
    and appends the resutling theta in all_thetas. """
    # Zeros become NaN so the log below does not produce -inf.
    series[series == 0] = np.nan 
    
    series = np.log(series)
    
    series = series.to_numpy().reshape(-1, 1)
    
    series = np.where(np.isnan(series), 0, series) # replacing NA with zeros
    
    # Kronecker product with a row of ones repeats the log-price column once
    # per product, giving a square matrix whose columns are all identical.
    ones = np.ones((1, len(series)))
    theta = np.kron(series, ones)

    current_big_o = big_o_yearly.get(series_name)

    """all columns are the same??? """
    theta_t = current_big_o @ theta 

    # Calculates and stores column sums 
    k = theta_t.sum(axis=0).to_frame().T

    k = k.to_numpy().flatten() 

    # Scale row j of theta_t by 1 / (sum of column j).
    # NOTE(review): a zero column sum gives an infinite scale factor — confirm it cannot occur.
    inv_k = 1 / k
    diag = np.diag(inv_k)
    
    lam = diag @ theta_t

    # Label both axes with the product labels of current_big_o.
    lam.set_index(current_big_o.index, inplace=True)
    lam.columns =current_big_o.index

    all_lambda[f'{series_name}'] = lam

# g) + h) Calculate input D 

In [None]:
# For each date, weight the demand indicators by that date's lambda matrix.
D_parts = {}

for date in D_IO.columns:

    # Demand indicators for this date; products without one are left out.
    D_IO_final = D_IO[str(date)]
    missing_products = D_IO_final[D_IO_final.isna()].index
    D_IO_final = D_IO_final.dropna()

    # Drop the matching columns of lambda so the shapes agree.
    lam_t = all_lambda.get(str(date))
    lam_t = lam_t.drop(columns=missing_products)

    # Positional matrix-vector product.
    # NOTE(review): assumes lam_t's columns and D_IO_final are in the same product order — confirm.
    D_t = lam_t.values @ D_IO_final.values
    D_parts[date] = D_t

D_matrix_final = pd.DataFrame(D_parts)

# Label the rows with the product labels. delta_final.index is the index every
# lambda matrix was built with; using it directly avoids depending on a loop
# variable left behind by another cell.
D_matrix_final.set_index(delta_final.index, inplace=True)

# Inflation and Weights for Graphing

In [None]:
def annual_inflation_calculator_by_product(df):
    """Add a 12-period percentage change of 'prices' within each product.

    Args:
        df: Long table with 'products', 'month' and 'prices' columns.

    Returns:
        A copy of ``df`` sorted by product and month with an extra 'inflation'
        column: ``(price / price 12 rows earlier - 1) * 100`` within the same
        product. The first 12 rows of every product are NaN.
    """
    ordered = df.sort_values(by=['products', 'month'])
    # Price 12 rows earlier within the same product.
    year_ago_prices = ordered.groupby('products')['prices'].shift(12)
    ordered['inflation'] = (ordered['prices'] / year_ago_prices - 1) * 100
    return ordered

In [None]:
# Reload the BEA price table, keep the products that appear in D_shapiro,
# and reshape it to long form.
prices = pd.read_excel(os.path.join(raw_data_path, 'BEA Monthly Prices.xlsx'))
prices = clean_bea_PQE_table(prices, "prices")
prices = prices.loc[prices.index.intersection(D_shapiro.index)]
# Keep only the first row for any product label that occurs more than once.
prices = prices[~prices.index.duplicated(keep='first')]
prices = prices.reset_index()

# One row per (product, month); values that cannot be parsed as numbers become NaN.
prices_long = pd.melt(prices, id_vars='products', var_name='month', value_name='prices')
prices_long['prices'] = pd.to_numeric(prices_long['prices'], errors='coerce')

In [None]:
# Year-over-year inflation per product, reshaped to products x months.
inflation = annual_inflation_calculator_by_product(prices_long)
inflation = inflation.pivot(index='products', columns='month', values='inflation')
# Drops the first 13 month columns.
# NOTE(review): only the first 12 are undefined after the 12-period shift — confirm 13 is intended.
inflation = inflation.iloc[:, 13:]

# Rebinds bea_expenditures from the long table to a products x dates table,
# so cells that expect the long form must not be re-run after this one.
bea_expenditures = bea_expenditures.pivot(index='products', columns='date', values='Expenditures')
# Each product's share of total expenditure per date. This replaces the
# expenditure_weights assigned earlier from get_expenditure_weights_from_shapiro_outputs.
expenditure_weights = bea_expenditures.div(bea_expenditures.sum(axis=0), axis=1)
expenditure_weights = expenditure_weights.iloc[:, 13:]
# Label-aligned product.
# NOTE(review): requires the 'month' and 'date' column labels to match — confirm.
weighted_inflation = expenditure_weights * inflation
total_inflation = weighted_inflation.sum(axis=0)
total_inflation

In [None]:
# D_matrix_final_percent = D_matrix_final / 100
# inflation_contribution = {}

# for date in D_matrix_final_percent.iloc[:, 1:]:
#     print(date)
#     current_inflation = total_inflation[total_inflation.index == date]
#     D_matrix_final_percent[]
#     contribution = 
    

#     break
# D_matrix_final_percent['scaled_A'] = D_matrix_final_percent[date] * current_inflation.iloc[0]
# D_matrix_final_percent

In [None]:
# Split each date's weighted inflation into a demand part and a supply part
# using the value-added/sales classification (supply indicator = 1 - demand).
inflation_parts_IO = {}

for date in weighted_inflation:
    date_key = str(date)
    demand_indicator = D_IO[date_key]
    month_inflation = weighted_inflation[date_key]

    inflation_parts_IO[date] = [
        (month_inflation * demand_indicator).sum(),
        (month_inflation * (1 - demand_indicator)).sum(),
    ]

# Dates as rows, the two components as columns.
inflation_IO_final = pd.DataFrame(
    inflation_parts_IO, index=['demand_inflation', 'supply_inflation']
).T

In [None]:
# Split each date's weighted inflation into a demand part and a supply part
# using the price/quantity classification (supply indicator = 1 - demand).
inflation_parts_shapiro = {}

for date in weighted_inflation:
    date_key = str(date)
    demand_indicator = D_shapiro[date_key]
    month_inflation = weighted_inflation[date_key]

    inflation_parts_shapiro[date] = [
        (month_inflation * demand_indicator).sum(),
        (month_inflation * (1 - demand_indicator)).sum(),
    ]

# Dates as rows, the two components as columns.
inflation_shapiro_final = pd.DataFrame(
    inflation_parts_shapiro, index=['demand_inflation', 'supply_inflation']
).T

In [None]:
def plot_graphs(data, plot_title, plot_text):
    """Draw a stacked area chart of demand and supply inflation.

    Positive and negative contributions are stacked separately so that
    negative values extend below zero instead of cancelling positive ones.

    Args:
        data: DataFrame with 'demand_inflation' and 'supply_inflation'
            columns and an index comparable with ``pd.Timestamp``.
        plot_title: Title of the figure.
        plot_text: Text shown in a box in the top-left corner of the axes.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure as the current one.
    """
    # Keep rows from 1970 onwards and leave out the final row.
    data = data.loc[data.index >= pd.Timestamp('1970-01-01')].iloc[:-1]

    demand = data['demand_inflation']
    supply = data['supply_inflation']

    # Split each series into its positive and negative part; everything else
    # (including NaN) becomes 0.
    demand_pos = demand.where(demand > 0, 0)
    supply_pos = supply.where(supply > 0, 0)
    demand_neg = demand.where(demand < 0, 0)
    supply_neg = supply.where(supply < 0, 0)

    plt.figure(figsize=(26, 12))

    colors = ["#008000", "#FF0000"]
    plt.stackplot(data.index, demand_pos, supply_pos, colors=colors, labels=["Demand", "Supply"])
    # The negative stack reuses the colours without labels so the legend has one entry per series.
    plt.stackplot(data.index, demand_neg, supply_neg, colors=colors)
    plt.xlabel('Date')
    plt.ylabel('Inflation Percent')
    plt.title(f'{plot_title}')
    plt.legend()

    plt.text(0.02, 0.95,
         f'{plot_text}',
         transform=plt.gca().transAxes, fontsize=9,
         bbox=dict(facecolor='white', alpha=0.8))

    return plt

In [None]:
# Demand/supply decomposition based on the price and quantity residuals.
plot_graphs(inflation_shapiro_final, "Shapiro Classification", "• Using our 210 products \n• Using Price and Quantities for Residuals and Classification")

In [None]:
# Demand/supply decomposition based on the value-added and sales residuals.
plot_graphs(inflation_IO_final, "ShapirIO Classification", "• Using our 210 products \n• Using Value-Added and Sales for Residuals and Classification")

In [None]:
# Reference chart drawn by the project helper from Shapiro's own Stata output.
shapiro_graph = plot_shapiro_graph_from_shapiro_ouput(shapiro_code_output, "Shapiro Graph Using His 130 Products, Data as Found in His Paper")
shapiro_graph