In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import recordlinkage

In [3]:
%%html
<style>
  table {margin-left: 0 !important;}
</style>

In [4]:
df_sba = pd.read_csv("./data/PPP Data 150k plus.csv")
df_ue = pd.read_csv("./data/new-york-details.csv")
df_sba.head(1)
df_ue.head(1)

Unnamed: 0,LoanRange,BusinessName,Address,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsRetained,DateApproved,Lender,CD
0,a $5-10 million,"ARCTIC SLOPE NATIVE ASSOCIATION, LTD.",7000 Uula St,BARROW,AK,99723.0,813920.0,Non-Profit Organization,Unanswered,Unanswered,Unanswered,Y,295.0,04/14/2020,"National Cooperative Bank, National Association",AK - 00


Unnamed: 0,name,url,city,cuisine,avgRating,numReviews,priceRange,latitude,longitude,telephone,postalCode,streetAddress,addressLocality,addressRegion,openingHoursSpecification,addressString,geoString,ratingString
0,Market Cafe,https://www.ubereats.com/new-york/food-deliver...,new-york,"['Pizza', 'American', 'Italian']",,,$,40.752349,-73.975021,12126820000.0,10017,425 Lexington Ave,New York,NY,"[{'@type': 'OpeningHoursSpecification', 'dayOf...","{'@type': 'PostalAddress', 'addressLocality': ...","{'@type': 'GeoCoordinates', 'latitude': 40.752...",{}


## Pre-processing

In [5]:
# Converting Zipcodes in SBA data from `float` to `int`. Also there are some cases where the leading zeroes may have been lost (due to erronious conversion to `float`).

# | zipcode       | zipcode       |
# | ------------- |-------------  |
# | 11.0          | 00011         |
# | 11013.0       | 11013         |

df_sba['Zip'] = df_sba['Zip'].astype(int).astype(str).str.zfill(5)

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
# The names are all upper case in SBA data. So, doing the same for Uber Eats as well.
df_ue['name'] = df_ue['name'].str.upper()
df_ue['addressLocality'] = df_ue['addressLocality'].str.upper()
df_ue['addressRegion'] = df_ue['addressRegion'].str.upper()

In [None]:
# Taking out the useful columns
df_a = df_sba[['BusinessName', 'Address', 'City', 'State', 'Zip']]
df_b = df_ue[['name', 'streetAddress', 'addressLocality', 'addressRegion', 'postalCode']]

reqd_columns = ['name', 'street', 'city', 'state', 'zip']

df_a.columns = reqd_columns
df_b.columns = reqd_columns

In [None]:
df_a.head(1)
df_b.head(1)

In [None]:
# Counting NaN values
df_a.apply(lambda x: x.isna().sum())
df_b.apply(lambda x: x.isna().sum())

In [None]:
df_a[df_a['name'].isnull()]
df_b[df_b['name'].isnull()]

In [None]:
# Each entry in Uber Eats DF with `NaN` in `name` necessarily has `NaN` in all other columns. Thus, we will drop these rows.
# If the address street matches perfectly there is a chance that the match is true, indeed. Thus, we will simply replace the `name` with an empty string instead of dropping the rows.
df_b = df_b.dropna(subset=['name'])
df_a = df_a.fillna({'name':""})
df_b = df_b.fillna({'street':"", 'city':"", 'state':"", 'zip':""})

In [None]:
# Counting NaN values
df_a.apply(lambda x: x.isna().sum())
df_b.apply(lambda x: x.isna().sum())

In [None]:
# Checking the columns for inconsistencies
df_b['city'].value_counts()
df_b['state'].value_counts()
df_b['zip'].apply(lambda x: len(x)).value_counts()

In [None]:
df_b['city'] = df_b['city'].value_counts().keys()[0]
df_b['state'] = df_b['state'].value_counts().keys()[0]
df_b['zip'] = df_b['zip'].apply(lambda x: "" if len(x) != 5 else x)

In [None]:
# Verifying
df_b['city'].value_counts()
df_b['state'].value_counts()
df_b['zip'].apply(lambda x: len(x)).value_counts()

## Using Recordlinkage

In [None]:
indexer = recordlinkage.Index()
indexer.block('zip')

In [None]:
candidates = indexer.index(df_a, df_b)
print(len(candidates))

In [None]:
compare = recordlinkage.Compare()
# compare.exact('city', 'Provider City', label='City')
compare.string('name',
            'name',
            method='jarowinkler',
            threshold=0.75,
            label='name_low')
compare.string('name',
            'name',
            method='jarowinkler',
            threshold=0.9,
            label='name_high')
compare.string('street',
            'street',
            method='jarowinkler',
            threshold=0.85,
            label='street')
# compare.exact('zip',
#             'zip',
#             label='zip')
features = compare.compute(candidates, df_a, df_b)
features.sum(axis=1).value_counts().sort_index(ascending=False)

In [None]:
potential_matches = features[features.sum(axis=1) > 1].reset_index()
potential_matches['score'] = potential_matches.loc[:, 'name_low':'street'].sum(axis=1)
potential_matches.head(5)

In [None]:
df_a.loc[1]

## Data Wrangling

In [None]:
final_df = potential_matches
final_df['name_a'] = final_df['level_0'].apply(lambda x: df_a.loc[x]['name'])
final_df['name_b'] = final_df['level_1'].apply(lambda x: df_b.loc[x]['name'])

final_df['street_a'] = final_df['level_0'].apply(lambda x: df_a.loc[x]['street'])
final_df['street_b'] = final_df['level_1'].apply(lambda x: df_b.loc[x]['street'])

In [None]:
final_df = final_df.sort_values('score', ascending=False).reset_index()
final_df.head(10)
final_df.shape

In [None]:
final_df.to_csv("./processed-data/new-york-matching.csv", index=False)