# Distance matching: City matching example

In [1]:
# `reload` is a builtin only on Python 2; on Python 3.4+ it lives in importlib.
# Import it when available so this cell runs on either interpreter.
try:
    from importlib import reload
except ImportError:
    pass  # Python 2: the builtin `reload` is used

# `pd` is used by the cells below; import it explicitly instead of relying on
# the star import from `metric` to provide it.
import pandas as pd

import metric
metric = reload(metric)  # re-import so edits to the metric module are picked up
from metric import *

### Load example data

In [2]:
# Read the example city table from CSV: the first row supplies the column names
# and the first column becomes the row index.
data = pd.read_csv(
    'example_data.csv',
    index_col=0,
    header=0,
)

In [3]:
# Show the loaded frame: one row per city, one column per city characteristic.
data

Unnamed: 0,city_id,has_eats,has_pool,has_cash,n_trips,rider_maus,driver_maus,percent_surged,avg_driver_age,percent_female,...,rider_spend,driver_spend,pool_ufp_spend,nonpool_ufp_spend,distance_rate_x,time_rate_x,uber_cp_vs_lyft,integration_and_segregation_index,2000_pop_density,walkscore
Atlanta,23,1,1,0,2387878,599479,32309,0.163449,14073.76834,0.401725,...,0.0575,0.3775,0.5925,-0.0925,0.75,0.12,0.748692,-0.145,3161,48.4
Austin,4,1,0,0,38625,40751,2813,0.103223,13775.96007,0.294094,...,0.0,0.0025,0.0,0.0675,1.0,0.12,0.743385,0.006,2610,39.2
Baltimore-Maryland,35,1,0,0,847189,237009,17927,0.172626,14693.18802,0.34246,...,0.015,0.245,0.0,0.055,1.15,0.11,0.638538,-0.111,8058,68.7
Boston,6,1,1,0,4219941,750903,27234,0.345397,14221.67227,0.195451,...,0.015,0.9875,1.055,-0.465,1.24,0.2,0.743538,-0.029,12165,80.7
Charlotte,22,0,0,0,488346,141375,6969,0.077466,14297.44006,0.336363,...,0.0025,0.0375,0.0,0.0225,0.75,0.15,0.791462,-0.007,2232,25.5
Chicago,7,1,1,0,5587356,1069627,50368,0.168873,14229.33312,0.287956,...,0.025,1.0025,0.805,0.04,0.9,0.2,0.717154,-0.186,12749,77.5
Cincinnati,141,1,0,0,287967,94344,4830,0.15342,14443.9392,0.298367,...,0.0075,0.0525,0.0,-0.0075,0.8,0.15,0.856769,-0.06,4249,50.2
Cleveland,142,1,0,0,331988,108497,6035,0.113841,14919.51111,0.332794,...,0.0125,0.045,0.0,-0.005,0.77,0.12,0.831,-0.109,6166,58.9
Columbus,139,1,0,0,398746,123750,5760,0.035017,14165.59137,0.269673,...,0.01,0.005,0.0,0.0025,1.1,0.18,0.855385,-0.018,3383,40.4
Dallas,25,1,0,0,1509712,491010,21993,0.135391,14171.2972,0.282257,...,0.21,0.1975,0.0,-0.065,0.85,0.1,0.760308,-0.07,3470,45.4


### Calculate distance between cities

In [4]:
# Exclude cities which have cash
exclude = ['has_cash']

# Manually exclude the following cities for various policy-related reasons
manual_exclude = ['Las Vegas', 'Denver', 'New Orleans', 'Austin']

# We will turn the following continuous variables into discrete variables by binning them.
# `num_bins` sets two bins for each, i.e. bottom half vs. top half of the distribution.
bin_characteristics = ['n_trips', 'uber_cp_vs_lyft']
num_bins = {'n_trips': 2, 'uber_cp_vs_lyft': 2}

# We want to exactly match on the following characteristics.
# 'n_trips_1' vs. 'n_trips_2' is bottom half vs. top half of the distribution; created via binning.
# The same applies to 'uber_cp_vs_lyft_1' vs. 'uber_cp_vs_lyft_2'.
exact_match_on = ['has_eats', 'has_pool', 
                  'n_trips_1', 'n_trips_2', 
                  'uber_cp_vs_lyft_1', 'uber_cp_vs_lyft_2']

# We will then calculate distance within each exactly-matched partition using the following variables.
# Note that 'n_trips' and 'uber_cp_vs_lyft' appear here in their continuous form as well as
# in binned form in `exact_match_on` above.
dist_match_on = [
    'n_trips',
    'rider_maus',
    'driver_maus',
    'percent_surged',
    'avg_driver_age',
    'percent_female',
    'launch_days_since',
    'hourly_earnings',
    'rider_spend',
    'driver_spend',
    'pool_ufp_spend',
    'nonpool_ufp_spend',
    'distance_rate_x',
    'time_rate_x',
    'uber_cp_vs_lyft',
    'integration_and_segregation_index',
    '2000_pop_density',
    'walkscore'
]

# We'll use the Mahalanobis distance metric
dist_metric = 'mahalanobis'

In [5]:
# Clean the loaded frame using the exclusion and binning settings configured above.
# NOTE(review): this rebinds `data`, so re-running this cell passes the already
# cleaned frame back into clean_data -- re-run the load cell first.
data = clean_data(
    data,
    manual_exclude=manual_exclude,
    exclude=exclude,
    num_bins=num_bins,
    bin_characteristics=bin_characteristics,
)

In [6]:
# Partition based on "exact match" variables.
# The result (displayed in the next cell) is a dict keyed by city name whose
# values are lists of city names.
partitions = partition_universe(data, exact_match_on)

In [7]:
# Inspect the partitions: each city maps to the list of cities in its partition
# (the city itself included).
partitions

{'Atlanta': ['Atlanta',
  'Boston',
  'New Jersey',
  'New York City',
  'Washington D.C.'],
 'Baltimore-Maryland': ['Baltimore-Maryland', 'Orange County', 'Phoenix'],
 'Boston': ['Atlanta',
  'Boston',
  'New Jersey',
  'New York City',
  'Washington D.C.'],
 'Charlotte': ['Charlotte',
  'Hampton Roads',
  'Indianapolis',
  'Jacksonville',
  'Pittsburgh',
  'Raleigh-Durham'],
 'Chicago': ['Chicago',
  'Los Angeles',
  'Miami',
  'Philadelphia',
  'San Diego',
  'San Francisco',
  'Seattle'],
 'Cincinnati': ['Cincinnati', 'Cleveland', 'Columbus', 'San Antonio'],
 'Cleveland': ['Cincinnati', 'Cleveland', 'Columbus', 'San Antonio'],
 'Columbus': ['Cincinnati', 'Cleveland', 'Columbus', 'San Antonio'],
 'Dallas': ['Dallas', 'Houston', 'Orlando', 'Tampa Bay'],
 'Detroit': ['Detroit', 'Honolulu', 'Sacramento', 'Tucson'],
 'Hampton Roads': ['Charlotte',
  'Hampton Roads',
  'Indianapolis',
  'Jacksonville',
  'Pittsburgh',
  'Raleigh-Durham'],
 'Honolulu': ['Detroit', 'Honolulu', 'Sacramento'

In [8]:
# Calculate distance between all cities, using only the `dist_match_on` columns
# and the metric named by `dist_metric`.
dist = get_dist(data[dist_match_on], metric=dist_metric)

In [9]:
# Inspect the pairwise distances: a city-by-city table with no value on the
# diagonal (a city paired with itself).
dist

Unnamed: 0,Atlanta,Baltimore-Maryland,Boston,Charlotte,Chicago,Cincinnati,Cleveland,Columbus,Dallas,Detroit,...,Portland,Raleigh-Durham,Sacramento,San Antonio,San Diego,San Francisco,Seattle,Tampa Bay,Tucson,Washington D.C.
Atlanta,,6.8768,6.6893,5.3924,6.974,6.2295,6.6638,6.5421,7.1464,6.5481,...,7.0733,5.3156,6.1062,7.1237,5.0988,7.5264,6.648,5.9613,7.3026,6.5395
Baltimore-Maryland,6.8768,,7.2604,5.8274,6.7081,6.3979,5.392,6.4406,7.1736,6.4483,...,6.6369,6.1616,5.6715,7.1203,6.5264,7.576,7.3721,5.3918,5.6032,7.2451
Boston,6.6893,7.2604,,6.6873,7.3801,6.1081,6.6427,6.5259,7.5762,6.1856,...,8.0914,6.0308,6.2753,7.0045,6.8233,7.4261,6.5696,6.7574,5.7178,7.1532
Charlotte,5.3924,5.8274,6.6873,,5.391,5.0192,5.5287,4.705,6.7985,5.2751,...,6.4961,4.3306,4.4696,6.4035,5.8609,6.8586,5.8174,3.8095,5.759,6.6611
Chicago,6.974,6.7081,7.3801,5.391,,5.1652,5.8004,6.45,7.1452,5.6728,...,6.7728,5.9547,6.7246,6.5196,6.1446,7.3525,6.3137,5.5618,6.2513,6.9776
Cincinnati,6.2295,6.3979,6.1081,5.0192,5.1652,,2.5083,5.2821,6.2031,4.8929,...,5.8786,3.5207,4.9587,5.9423,5.1498,6.5213,5.1293,4.0562,4.8815,5.6537
Cleveland,6.6638,5.392,6.6427,5.5287,5.8004,2.5083,,5.5787,6.3388,5.3624,...,6.1832,4.2047,5.1277,6.5863,5.9107,6.3845,5.8578,4.2095,5.0856,6.2606
Columbus,6.5421,6.4406,6.5259,4.705,6.45,5.2821,5.5787,,6.5852,6.2084,...,5.7729,4.4582,6.0417,5.1904,6.2652,7.0551,5.2719,5.2565,4.5011,5.9908
Dallas,7.1464,7.1736,7.5762,6.7985,7.1452,6.2031,6.3388,6.5852,,6.1806,...,7.2486,6.2654,6.7829,6.9096,6.6509,7.7623,7.0355,5.6879,6.896,7.1666
Detroit,6.5481,6.4483,6.1856,5.2751,5.6728,4.8929,5.3624,6.2084,6.1806,,...,6.0492,4.1506,4.3817,6.5413,6.2984,7.6015,6.4086,6.0269,4.9604,6.0156


### Example: KNN matching

In [10]:
# Match each city using k=1 neighbours from the distance table.
# NOTE(review): `restrictions=partitions` presumably limits candidates to the
# city's own exact-match partition -- confirm against metric.knn.
knn_result = knn(dist, k=1, restrictions=partitions)

In [11]:
# Inspect the KNN result: a single column (labelled 0) holding each city's match.
knn_result

Unnamed: 0,0
Atlanta,Washington D.C.
Baltimore-Maryland,Phoenix
Boston,Atlanta
Charlotte,Indianapolis
Chicago,Philadelphia
Cincinnati,Cleveland
Cleveland,Cincinnati
Columbus,San Antonio
Dallas,Tampa Bay
Detroit,Sacramento


### Example: Greedy matching

In [12]:
# Run greedy matching on the same distance table and partitions.
greedy_result = greedy_match(dist, partitions)

In [13]:
# Inspect the greedy result: each city alongside its match. Some cities are
# listed against themselves (e.g. Memphis).
greedy_result

Atlanta                          Washington D.C.
Baltimore-Maryland                       Phoenix
Boston                                New Jersey
Charlotte                             Pittsburgh
Chicago                             Philadelphia
Cincinnati                             Cleveland
Cleveland                             Cincinnati
Columbus                             San Antonio
Dallas                                   Orlando
Detroit                               Sacramento
Hampton Roads                     Raleigh-Durham
Honolulu                                  Tucson
Houston                                Tampa Bay
Indianapolis                        Jacksonville
Jacksonville                        Indianapolis
Los Angeles                                Miami
Memphis                                  Memphis
Miami                                Los Angeles
Milwaukee                              Nashville
Minneapolis - St. Paul    Minneapolis - St. Paul
Nashville           

### Example: Optimal matching

In [14]:
# Run optimal matching on the same distance table and partitions.
# NOTE(review): `ell_one` is not defined in this notebook; presumably it comes
# from the star import of `metric` -- confirm.
optimal_result = get_optimal_matches(dist, partitions, func=ell_one)

In [15]:
# Inspect the optimal result: each city alongside its match. Matched pairs are
# listed in both directions (e.g. Milwaukee/Nashville and Nashville/Milwaukee).
optimal_result

Milwaukee                      Nashville
Chicago                     Philadelphia
Los Angeles                        Miami
San Diego                        Seattle
Charlotte                   Indianapolis
Hampton Roads                 Pittsburgh
Jacksonville              Raleigh-Durham
Cincinnati                     Cleveland
Columbus                     San Antonio
Baltimore-Maryland               Phoenix
Atlanta                           Boston
New Jersey               Washington D.C.
Dallas                           Orlando
Houston                        Tampa Bay
Detroit                           Tucson
Honolulu                      Sacramento
Nashville                      Milwaukee
Philadelphia                     Chicago
Miami                        Los Angeles
Seattle                        San Diego
Indianapolis                   Charlotte
Pittsburgh                 Hampton Roads
Raleigh-Durham              Jacksonville
Cleveland                     Cincinnati
San Antonio     

### Comparison: KNN vs. greedy vs. optimal matching

In [16]:
# Put the three matchings side by side, one column per matching method.
comparison = pd.DataFrame({
    'knn': knn_result[0],
    'greedy': greedy_result,
    'optimal': optimal_result,
})

In [17]:
# A NaN or a self-match means that no match could be made for that city
comparison

Unnamed: 0,greedy,knn,optimal
Atlanta,Washington D.C.,Washington D.C.,Boston
Baltimore-Maryland,Phoenix,Phoenix,Phoenix
Boston,New Jersey,Atlanta,Atlanta
Charlotte,Pittsburgh,Indianapolis,Indianapolis
Chicago,Philadelphia,Philadelphia,Philadelphia
Cincinnati,Cleveland,Cleveland,Cleveland
Cleveland,Cincinnati,Cincinnati,Cincinnati
Columbus,San Antonio,San Antonio,San Antonio
Dallas,Orlando,Tampa Bay,Orlando
Detroit,Sacramento,Sacramento,Tucson


# Propensity score matching: [example here]