In [221]:
import os
import numpy as np
import pandas as pd

In [222]:
# Load the dataset
# country_to_load = 'MEX' 
country_to_load = 'BGR' 

dataframe = pd.read_stata('datasets/'+ country_to_load + '.dta')

# Clean data

In [223]:
keep_top = 50
year = 2006

# Filter and clean the dataset
dataframe = dataframe.drop(columns=['q', 'c'])
dataframe = dataframe[(dataframe['y'] == year) & (dataframe['d'] != 'OTH')].drop(columns=['y'])
top_d = dataframe['d'].value_counts().nlargest(keep_top).index
dataframe = dataframe[dataframe['d'].isin(top_d)]


revenue_df = dataframe.pivot_table(index='f', columns='d', values='v', aggfunc='sum', fill_value=0)   

firms = list(revenue_df.index)
destinations = list(revenue_df.columns)

print(f"Number of firms: {len(firms)}")
print(f"Number of destinations: {len(destinations)}")

Number of firms: 13229
Number of destinations: 50


### Poisson regression

In [224]:
[i for i, country in enumerate(destinations) if country == 'USA']

[48]

In [225]:
from scipy.special import logsumexp

def my_poisson(y_i_j):
    num_i, num_j = y_i_j.shape
    alpha_i = np.zeros(num_i)
    beta_j = np.zeros(num_j)

    log_marg_i = np.log(y_i_j.sum(axis=1))
    log_marg_j = np.log(y_i_j.sum(axis=0))

    iters = 0
    while True:
        iters += 1
        alpha_i_new = log_marg_i - logsumexp(beta_j)
        beta_j_new = log_marg_j - logsumexp(alpha_i_new)
        if (np.allclose(alpha_i, alpha_i_new, rtol=1e-13, atol=1e-13) and 
            np.allclose(beta_j, beta_j_new, rtol=1e-13, atol=1e-13)):
            print(f'Converged after {iters} iterations')
            break
        alpha_i, beta_j = alpha_i_new, beta_j_new
        # print(alpha_i_new[0])
    
    return np.exp(alpha_i[:, None] + beta_j[None, :])

revenue_i_j = revenue_df.values
expected_revenue_i_j = my_poisson(revenue_i_j)

Converged after 2 iterations


### Distance matrix

In [226]:
from geopy.distance import geodesic
from itertools import product

countries = ["MEX"] + destinations 

url = 'https://gist.githubusercontent.com/tadast/8827699/raw/countries_codes_and_coordinates.csv'
coordinates_df = pd.read_csv(url)

coordinates_df.columns = coordinates_df.columns.str.strip()
coordinates_df = coordinates_df.astype(str).apply(lambda col: col.str.replace('"', '').str.strip())
coordinates_df = coordinates_df.rename(columns={
    'Alpha-3 code': 'ISO3',
    'Latitude (average)': 'Lat',
    'Longitude (average)': 'Lon'
})

coordinates_df = coordinates_df.drop_duplicates(subset='ISO3')

coords_dict = coordinates_df.set_index('ISO3')[['Lat', 'Lon']].T.to_dict('list')

# Filter countries you already defined
valid_countries = [code for code in countries if code in coords_dict]
missing = [code for code in countries if code not in coords_dict]
print(f"Missing countries: {missing}")


distance_df = pd.DataFrame(index=valid_countries, columns=valid_countries, dtype=float)
for c1, c2 in product(valid_countries, repeat=2):
    distance_df.loc[c1, c2] = geodesic(coords_dict[c1], coords_dict[c2]).kilometers / 1000  # Convert to thousands kilometers

Missing countries: []


In [227]:
print(np.all(distance_df.index[1:] == revenue_df.columns))
print(np.all(distance_df.columns[1:] == revenue_df.columns))

True
True


# Create input data

In [228]:
sample_size = 10000000
num_agents = len(firms)
np.random.seed(49)  
if num_agents < sample_size:
    random_agents = np.arange(num_agents)
else:
    random_agents = np.random.choice(num_agents, sample_size, replace=False)

beta = 3
add_fixed_effects = False

In [229]:
os.makedirs('../input_data', exist_ok=True)


Modular agent specific

In [230]:
exp_revenue_i_j= expected_revenue_i_j/ 1_000_000
# exp_revenue_i_j= exp_revenue_df.to_numpy() /1_000

assert num_agents == exp_revenue_i_j.shape[0]

modular_i_j_k = []
modular_i_j_k.append(exp_revenue_i_j)

if country_to_load == 'MEX':
    exp_revenue_i_j_nafta = exp_revenue_i_j * np.isin(revenue_df.columns, ['USA', 'CAN'])
    modular_i_j_k.append(exp_revenue_i_j_nafta)

    # nafta_FE = np.isin(revenue_df.columns, ['USA', 'CAN'])[None,:].repeat(num_agents, axis=0)
    # modular_i_j_k.append(nafta_FE)


# Stack and save
modular_i_j_k = np.stack(modular_i_j_k, axis=2)
modular_i_j_k = modular_i_j_k[random_agents, :, :]
np.save('../input_data/modular_i_j_k.npy', modular_i_j_k)
print(modular_i_j_k.shape)
print("min:", modular_i_j_k[modular_i_j_k>0].min())
print("max:", modular_i_j_k.max())

(13229, 50, 1)
min: 2.2001476614732414e-11
max: 199.72324863940617


Modular agent independent

In [231]:
distance_c_c = distance_df.to_numpy()
distance_j_j = distance_c_c[1:,1:]
np.save('../input_data/distance_j_j.npy', distance_j_j)
distance_hj = distance_c_c[1:,0]

modular_j_k = []
if add_fixed_effects:
    for j in range(exp_revenue_i_j.shape[1]):
        dummy_j = np.zeros(exp_revenue_i_j.shape[1])
        dummy_j[j] = 1
        modular_j_k.append(dummy_j)
else:
    modular_j_k.append(-np.ones_like(distance_hj))
    modular_j_k.append(-distance_hj)


# Stack and save
modular_j_k = np.stack(modular_j_k, axis=1)
np.save('../input_data/modular_j_k.npy', modular_j_k)
print(modular_j_k.shape)
print("min:", modular_j_k.min())
print("max:", modular_j_k.max())

(50, 2)
min: -16.264341545892467
max: -1.0


Quadratic agent-independant

In [232]:
exp_d_j_j = np.exp(-beta * distance_j_j)
np.fill_diagonal(exp_d_j_j, 0)
quadratic_feat = []
quadratic_feat.append(exp_d_j_j)
quadratic_feat.append(exp_d_j_j * distance_hj[None,:])

# Stack and save
quadratic_j_j_k = np.stack(quadratic_feat, axis=2) 
np.save('../input_data/quadratic_j_j_k.npy', quadratic_j_j_k)
print(quadratic_j_j_k.shape)
print("min:", quadratic_j_j_k[quadratic_j_j_k>0].min())
print("max:", quadratic_j_j_k.max())

(50, 50, 2)
min: 1.5789021724014102e-24
max: 7.5225905629642895


In [233]:
# a = quadratic_j_j_k[:,:, 0].copy()
# b = quadratic_j_j_k[:,:, 1].copy()

# np.fill_diagonal(a, 1)
# np.fill_diagonal(b, 200000)

# (b/a).min()

In [234]:
# np.all((quadratic_j_j_k[:,:, 1] - 1.487189 * quadratic_j_j_k[:,:, 0] )>=0 )

In [235]:
# (quadratic_j_j_k  @ np.array([-1.5,1])).min()

In [236]:
modular_j_k.shape[-1] + modular_i_j_k.shape[-1] + quadratic_j_j_k.shape[-1]

5

Observed bundles


In [237]:
obs_bundles = revenue_i_j > 0
obs_bundles = obs_bundles[random_agents, :]
np.save('../input_data/obs_bundles.npy', obs_bundles)
print(obs_bundles.shape)

(13229, 50)


In [238]:
# import matplotlib.pyplot as plt
# count = obs_bundles.sum(1)
# plt.hist(count, bins=50, alpha=0.5, color='blue', label='observed')

# plt.yscale('log') 
# plt.xlabel("Number of destinations")
# plt.ylabel("Count (log scale)")
# plt.legend()
# plt.grid(True, which="both", linestyle="--", linewidth=0.5)
# plt.tight_layout()
# plt.show()

Errors

In [239]:
rho_0 = .16
rho_d = .82
sigmasq = 18

Covariance = sigmasq * rho_0 * np.exp(- rho_d * distance_j_j)
from scipy.linalg import sqrtm
A_sqrt = sqrtm(Covariance)