In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
### Load the data

# Bidder-level and BTA-level tables.
bidder_data = pd.read_csv('datasets/bidder_data.csv')
bta_data = pd.read_csv('datasets/bta_data.csv')

# BTA adjacency matrix: headerless file whose first column is discarded.
bta_adjacency = pd.read_csv('datasets/btamatrix_merged.csv', header=None).iloc[:, 1:]
bta_adjacency_j_j = bta_adjacency.to_numpy()

# Pairwise distances: space-delimited, headerless file whose last column is discarded
# (presumably an empty column produced by a trailing delimiter -- TODO confirm).
geo_distance = pd.read_csv(
    'datasets/distancesmat_dio_perl_fixed.dat', delimiter=' ', header=None
).iloc[:, :-1]
geo_distance_j_j = geo_distance.to_numpy()

# Travel-survey flows. The file is read with its first line as a header, so the loaded
# block fills rows 1.. of a matrix shaped like geo_distance_j_j. Row 0 (off-diagonal)
# is then copied from the first column of the loaded block.
# NOTE(review): this looks like it relies on the matrix being symmetric -- confirm.
travel_survey = pd.read_csv('datasets/american-travel-survey-1995-zero.csv')
travel_survey_j_j_0 = travel_survey.to_numpy()
travel_survey_j_j = np.zeros_like(geo_distance_j_j)
travel_survey_j_j[0, 1:] = travel_survey_j_j_0[:, 0]
travel_survey_j_j[1:, :] = travel_survey_j_j_0

# Air-travel passenger flows, embedded the same way as the travel survey.
air_travel = pd.read_csv('datasets/air-travel-passengers-bta-year-1994.csv')
air_travel_j_j_0 = air_travel.to_numpy()
air_travel_j_j = np.zeros_like(geo_distance_j_j)
air_travel_j_j[0, 1:] = air_travel_j_j_0[:, 0]
air_travel_j_j[1:, :] = air_travel_j_j_0

In [3]:
# # Flatten the matrix and get the indices of the top 10 largest values
# flat_indices = np.argsort(air_travel_j_j.ravel())[-10:][::-1]

# # Convert flat indices back to row and column indices
# rows, cols = np.unravel_index(flat_indices, air_travel_j_j.shape)

# # Get the corresponding values
# top_10_values = air_travel_j_j[rows, cols]

# # Combine row, column indices and their values
# top_10_entries = list(zip(rows, cols, top_10_values))

# # Display the top 10 entries
# top_10_entries

In [4]:
# # Flatten the matrix and get the indices of the top 10 largest values
# flat_indices = np.argsort(travel_survey_j_j.ravel())[-10:][::-1]

# # Convert flat indices back to row and column indices
# rows, cols = np.unravel_index(flat_indices, travel_survey_j_j.shape)

# # Get the corresponding values
# top_10_values = travel_survey_j_j[rows, cols]

# # Combine row, column indices and their values
# top_10_entries = list(zip(rows, cols, top_10_values))

# # Display the top 10 entries
# top_10_entries

ALBQ: 7, Albany: 5,6, NY: 320, LA: 261, SD: 401, SF: 403, Miami: 292, Chicago: 77

In [5]:
### Weights and Capacities

# Reduce encoding length of weights and capacities
def round_weights(tick):
    """Express BTA weights and bidder capacities in integer multiples of ``tick``.

    Reads the notebook-level frames ``bidder_data`` (column ``'pops_eligible'``)
    and ``bta_data`` (column ``'pop90'``), divides each column by ``tick``,
    rounds with ``np.round`` and casts to ``int``.

    Parameters
    ----------
    tick : number
        Size of one unit after rescaling.

    Returns
    -------
    (weight_j_rounded, capacity_i_rounded) : tuple of integer arrays
        Rescaled ``pop90`` values followed by rescaled ``pops_eligible`` values.
    """
    def _in_ticks(column):
        # Same arithmetic for both columns: scale, round, cast.
        return np.round(column.to_numpy() / tick).astype(int)

    capacity_i_rounded = _in_ticks(bidder_data['pops_eligible'])
    weight_j_rounded = _in_ticks(bta_data['pop90'])
    return weight_j_rounded, capacity_i_rounded

weight_j, capacity_i = round_weights(1000)

### Matching matrix
def generate_matching_matrix():
    """Build the boolean bidder-by-BTA matrix of the observed assignment.

    Uses the notebook-level ``capacity_i`` and ``weight_j`` for the matrix shape
    and ``bta_data['bidder_num_fox']`` (shifted by -1 to become a row index) for
    the row marked in each column.

    Returns
    -------
    matching_i_j : bool array of shape (len(capacity_i), len(weight_j))
        ``matching_i_j[i, j]`` is True where row ``i`` is the shifted
        ``bidder_num_fox`` value of column ``j``.
    """
    num_agents, num_objects = len(capacity_i), len(weight_j)
    matching_i_j = np.zeros((num_agents, num_objects), dtype=bool)

    # Only the first num_objects entries are used, one per column.
    winner_ids = bta_data['bidder_num_fox'].values[:num_objects] - 1
    matching_i_j[winner_ids, np.arange(num_objects)] = True

    return matching_i_j

matching_i_j = generate_matching_matrix()

In [6]:
# Exponent applied to distance in the pairwise term below.
delta = 4

# Pairwise products of BTA weights with a zero diagonal. The products are formed in
# float so they cannot overflow the integer dtype of weight_j.
E_j_j = np.outer(weight_j.astype(float), weight_j.astype(float))
np.fill_diagonal(E_j_j, 0)

# Divide by distance**delta wherever the distance is strictly positive; entries with a
# non-positive distance keep the raw product.
positive_distance = geo_distance_j_j > 0
E_j_j[positive_distance] /= geo_distance_j_j[positive_distance] ** delta

# Row j is row-normalised E scaled by BTA j's share of total weight, so every row sums
# to that share and the whole matrix sums to one (printed below as a check).
pop_centroid_j_j = (weight_j[:, None] / weight_j.sum()) * (E_j_j / E_j_j.sum(1)[:, None])

print(pop_centroid_j_j.sum())

# Rows of the matching matrix that hold at least one BTA.
winning = np.unique(np.where(matching_i_j)[0])

# Sum of pop_centroid_j_j[j, l] over all pairs (j, l) held by bidder i. The einsum
# contraction avoids materialising the dense (bidders, BTAs, BTAs) product array.
pop_centroid_hat_i = np.einsum('jl,ij,il->i', pop_centroid_j_j, matching_i_j, matching_i_j)
print('mean ', pop_centroid_hat_i[winning].mean())
print('std ', pop_centroid_hat_i[winning].std())
print('max ', pop_centroid_hat_i[winning].max())

0.9999999999999999
mean  0.005403853978430972
std  0.02261971252835403
max  0.19410511809941688


In [7]:
# Truncation threshold. With q=0 the percentile is the minimum entry, so only entries
# equal to the minimum are zeroed below; raise q to sparsify the matrix.
percentile = np.percentile(pop_centroid_j_j, 0)
print('percentile:', percentile)

# Keep entries strictly above the threshold and zero the rest.
truncated_pop_centroid_j_j = np.where(pop_centroid_j_j > percentile, pop_centroid_j_j, 0)
print(truncated_pop_centroid_j_j.sum())

# Rows of the matching matrix that hold at least one BTA.
winning = np.unique(np.where(matching_i_j)[0])

# Per-bidder sum over held pairs (j, l). The einsum contraction avoids materialising
# the dense (bidders, BTAs, BTAs) product array.
truncated_pop_centroid_hat_i = np.einsum(
    'jl,ij,il->i', truncated_pop_centroid_j_j, matching_i_j, matching_i_j
)
print('mean ',truncated_pop_centroid_hat_i[winning].mean())
print('std ',truncated_pop_centroid_hat_i[winning].std())
print('max ',truncated_pop_centroid_hat_i[winning].max())
print('density:', np.count_nonzero(truncated_pop_centroid_j_j) / truncated_pop_centroid_j_j.size)

percentile: 0.0
0.9999999999999999
mean  0.005403853978430972
std  0.02261971252835403
max  0.19410511809941688
density: 0.9979716024340771


### Build characteristics matrices

In [8]:
### Modular Characteristics
# Each entry of modular_list is a (bidders, BTAs) array; they are stacked along a
# trailing characteristic axis at the end of this block.
modular_list = []

# eligibility_i * pop_j, both expressed as shares of the total BTA weight
total_weight = weight_j.sum()
eligibility_share_i = capacity_i / total_weight
population_share_j = weight_j / total_weight
modular_list.append(eligibility_share_i[:, None] * population_share_j[None, :])

# 2. geo_distance_i_j (currently disabled)
# Zero-based index built from the bidder's 'bta' column; only the disabled line below uses it.
bidder_bta = bidder_data['bta'].to_numpy() - 1
# modular_list.append( - geo_distance_j_j[bidder_bta] /geo_distance_j_j[bidder_bta].sum(1)[:,None])

# 3. Rural status_i * density_j (currently disabled)
# rural_i = bidder_data['Applicant_Status'].str.contains('Rural Telephone Company', na=False).to_numpy()
# density_j = bta_data['density'].to_numpy()
# modular_list.append(rural_i[:, None] * density_j[None, :])

# Stack
modular_characteristics_i_j_k = np.stack(modular_list, axis=-1)

### Quadratic Characteristics

def _weighted_outflow_shares(flow_j_j, weight_j, epsilon=1e-15):
    """Turn a flow matrix into weight-scaled outflow shares.

    Steps, applied to a fresh array so ``flow_j_j`` is not modified:
      1. add ``epsilon`` to every entry and zero the diagonal;
      2. divide each row with a positive row sum by that row sum;
      3. scale row ``j`` by ``weight_j[j]`` divided by the summed weight of the
         rows that had a positive row sum.

    Parameters
    ----------
    flow_j_j : 2-D square array of flows (row = origin).
    weight_j : 1-D array with one weight per row.
    epsilon : float added to every entry before normalising.

    Returns
    -------
    2-D array with the shape of ``flow_j_j``.
    """
    shares_j_j = flow_j_j + epsilon  # `+` allocates a new array
    np.fill_diagonal(shares_j_j, 0)

    outflow_j = shares_j_j.sum(1)
    has_outflow = outflow_j > 0
    shares_j_j[has_outflow] /= outflow_j[has_outflow][:, None]
    shares_j_j *= weight_j[:, None] / weight_j[has_outflow].sum()
    return shares_j_j


# Each entry of quadratic_list is a (BTAs, BTAs) array; they are stacked along a
# trailing characteristic axis below.
quadratic_list = []

# bta_adjacency_j_j
# quadratic_list.append(bta_adjacency_j_j / bta_adjacency_j_j.sum())

# pop_centroid_j_j
quadratic_list.append(truncated_pop_centroid_j_j)

# travel_survey_j_j
quadratic_travel_j_j = _weighted_outflow_shares(travel_survey_j_j, weight_j)
quadratic_list.append(quadratic_travel_j_j)

# air_travel_j_j
quadratic_air_j_j = _weighted_outflow_shares(air_travel_j_j, weight_j)
quadratic_list.append(quadratic_air_j_j)

# Stack
quadratic_characteristic_j_j_k = np.stack(quadratic_list, axis=2)

# Share of (j, l) pairs with a positive value in at least one characteristic, and the
# total of each characteristic over all pairs.
quadratic_total_j_j = quadratic_characteristic_j_j_k.sum(2)
print('Density of quadratic term: ', (quadratic_total_j_j > 0).sum() / quadratic_total_j_j.size)
print('Sum of quadratic term: ', quadratic_characteristic_j_j_k.sum((0, 1)))

# Compute characteristics at observed matching

# Modular part: for each bidder, sum each modular characteristic over the BTAs it holds.
modular_hat_i_k = (modular_characteristics_i_j_k * matching_i_j[:, :, None]).sum(1)

# Quadratic part: for each bidder, sum each quadratic characteristic over all pairs
# (j, l) of BTAs it holds.
quadratic_hat_i_k = np.einsum(
    'jlk,ij,il->ik', quadratic_characteristic_j_j_k, matching_i_j, matching_i_j
)

# Columns: modular characteristics first, then quadratic ones.
phi_hat_i_k = np.concatenate((modular_hat_i_k, quadratic_hat_i_k), axis=1)
phi_hat = phi_hat_i_k.sum(0)

print('phi_hat: ', phi_hat)
print('std:     ', phi_hat_i_k.std(0))

Density of quadratic term:  0.9979716024340771
Sum of quadratic term:  [1. 1. 1.]
phi_hat:  [0.34392993 0.47553915 0.25142282 0.19437477]
std:      [0.01636621 0.01353402 0.01078501 0.00934324]


In [9]:
# Bidders holding at least one BTA in the observed matching (sorted row indices).
winning = np.flatnonzero(matching_i_j.any(axis=1))

# Share of total BTA weight held by each bidder.
pop_hat = (matching_i_j * weight_j[None, :]).sum(1) / weight_j.sum()

# Row labels, in the column order of phi_hat_i_k.
# columns = ['eligibility_i * pop_j', 'geo_distance_i_j', 'bta_adjacency_j_j', 'pop_centroid_j_j', 'travel_survey_j_j', 'air_travel_j_j']
columns = ['eligibility_i * pop_j',  'pop_centroid_j_j', 'travel_survey_j_j', 'air_travel_j_j']

phi_hat_winning_i_k = phi_hat_i_k[winning, :]
means = phi_hat_winning_i_k.mean(0)
stds = phi_hat_winning_i_k.std(0)
maxs = phi_hat_winning_i_k.max(0)

print('Statistics at winning bids')
# One row per characteristic, one column per statistic.
data = np.column_stack([means, stds, maxs])
df = pd.DataFrame(data, columns=['Mean', 'Std', 'Max'], index=columns)
df.round(4)

Statistics at winning bids


Unnamed: 0,Mean,Std,Max
eligibility_i * pop_j,0.0039,0.0277,0.2589
pop_centroid_j_j,0.0054,0.0226,0.1941
travel_survey_j_j,0.0029,0.0182,0.1686
air_travel_j_j,0.0022,0.0158,0.1462


# Save

In [None]:
def save_instance_data(destination_path, arrays=None):
    """Write the instance arrays to ``.npy`` files under ``destination_path``.

    Each array is saved to ``destination_path + name + '.npy'``, so
    ``destination_path`` is used as a plain string prefix and should end with a
    path separator when it names a directory (e.g. ``'input_data/'``).
    The directory part of the prefix is created if it does not exist yet.

    Parameters
    ----------
    destination_path : str
        Prefix prepended to every file name.
    arrays : dict[str, array-like] or None
        Mapping from file stem to array. When omitted, the notebook-level
        ``matching_i_j``, ``weight_j``, ``capacity_i``,
        ``quadratic_characteristic_j_j_k`` and ``modular_characteristics_i_j_k``
        are saved under their own names.
    """
    import os  # local import so the function does not rely on the notebook's import cell

    if arrays is None:
        arrays = {
            'matching_i_j': matching_i_j,
            'weight_j': weight_j,
            'capacity_i': capacity_i,
            'quadratic_characteristic_j_j_k': quadratic_characteristic_j_j_k,
            'modular_characteristics_i_j_k': modular_characteristics_i_j_k,
        }

    # np.save does not create missing directories, so create the target first.
    target_dir = os.path.dirname(destination_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    for name, array in arrays.items():
        np.save(destination_path + name + '.npy', array)

# Prefix for every saved file; it is concatenated directly with the file names.
destination_path = 'input_data/'

save_instance_data(destination_path)

# Reload the two characteristic arrays and print their shapes as a sanity check.
for saved_name in ('quadratic_characteristic_j_j_k.npy', 'modular_characteristics_i_j_k.npy'):
    print(np.load(destination_path + saved_name).shape)

(493, 493, 3)
(255, 493, 1)
