Authors: Tim Gorman, Yu Cao, Ling Zhou

---

Data analyzed in this notebook is from [Kaggle's code competition: Foursquare - Location Matching](https://www.kaggle.com/competitions/foursquare-location-matching/data). The data comprises over one-and-a-half million place entries for hundreds of thousands of commercial Points-of-Interest (POIs) around the globe.

In [None]:
import pandas as pd
import numpy as np
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# from googletrans import Translator, constants
import seaborn as sns
import langdetect
from langdetect import detect
import matplotlib.pyplot as plt

from shapely.geometry import Point
from geopandas import GeoDataFrame
import geopandas as gpd

Note: To install geopandas, open a terminal in jupyter and run the following commands

pip install pipwin <br>
pipwin install gdal <br>
pipwin install fiona <br>
pip install geopandas

# Exploring `train.csv`

## Basic information

In [None]:
df_train = pd.read_csv("../../code-2022/final_project/data-foursquare-location-matching/train.csv")
# df_train  = pd.read_csv("../../data_raw/train.csv")

In [None]:
df_train.info()

In [None]:
df_train.head()

In [None]:
df_train.shape

## Missing values
- Every row has the features: `id`, `latitude`, `longitude`, `point_of_interest`
- Other features have missing values
- Features that have fewer missing values: `name`, `country`, `categories`

In [None]:
# Count the distinct place ids and the distinct POI labels once, then report
# the raw counts and the POI-to-id ratio as a percentage.
n_ids = len(set(df_train['id']))
n_pois = len(set(df_train['point_of_interest']))
print('There are', n_ids, 'id, and', n_pois, 'distinct POIs.\n')
print('About', np.round(n_pois/n_ids*100, 2), '% POIs are unique.\n')

In [None]:
df = df_train

# Share of missing entries per feature (in percent), printed from the most
# complete feature to the least complete one.
missing = df.isna().sum()/df.shape[0]*100
print(missing.sort_values())

# Horizontal bar chart of the same percentages, one bar per feature.
import matplotlib.ticker as mtick
tmp = df.isna().sum().sort_values()
y = tmp/len(df)*100

fig, ax = plt.subplots(figsize=(8,6))
ax.barh(tmp.index, y)
ax.axvline(x=60, color='r', linestyle='--')  # dashed reference line at 60 %
ax.set_xlim(0,100)
ax.set_title("Missing values", fontsize=18)
ax.set_xlabel('percentages (%)', fontsize=18)
ax.set_ylabel('features', fontsize=18)
ax.tick_params(axis='both', labelsize=15)
# ax.xaxis.set_major_formatter(mtick.PercentFormatter())

# Annotate every bar with its value; values below 1 % get six decimals so
# that they do not all read "0.0".
for index, value in enumerate(y):
    label = '{:.6f}'.format(value) if value < 1 else '{:.1f}'.format(value)
    ax.text(value, index, label, color='red', fontweight='bold')
plt.show()

## Features in `train.csv`

In [None]:
df.columns

### `id`

In [None]:
# each value in `id` is unique.
print("Each value in `id` is unique:", len(df.id.unique()) == df.shape[0])

### `latitude` and `longitude`

In [None]:
# simple sketch
# fig, ax = plt.subplots(figsize=(40,25))
# ax.scatter(df_train['longitude'],df_train['latitude'])
# ax.set_xlabel('longitude',fontsize=40)
# ax.set_ylabel('latitude',fontsize=40)
# ax.tick_params(axis='both', labelsize=40)
# ax.grid()
# plt.show()

# Map of every training entry drawn on top of the low-resolution world map.
# `points_from_xy` builds the geometry column in one vectorised call instead of
# creating one shapely Point per row in a Python loop. Coordinates and
# attributes are both taken from `df_train`, so they cannot get out of step if
# the shared name `df` has been rebound to a subset.
geometry = gpd.points_from_xy(df_train['longitude'], df_train['latitude'])
gdf = GeoDataFrame(df_train.copy(), geometry=geometry)

# This is a simple map that ships with geopandas.
# NOTE(review): `gpd.datasets` is deprecated in recent geopandas releases (and,
# as far as I know, removed in 1.0); if this line fails, read a local copy of
# the Natural Earth low-resolution countries file instead.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
fig, ax = plt.subplots(figsize=(20, 12))
world.plot(ax=ax)  # the figure size is already fixed by plt.subplots above
gdf.plot(ax=ax, marker='o', color='red', markersize=10)

fig.savefig("./train_coords.png")
plt.show()

### `name`

This column has different languages. 

In [None]:
df['name']

In [None]:
# Takes a few minutes.
# flag = 0
# lang = []
# for row in df['name'].dropna():
#     if flag > 20:
#         break
#     try:
#         lang.append(detect(row))
#     except:
#         flag += 1
#         language = "error"
#         print("This row throws and error:", row)
# print(set(lang))

### How Many Languages are Present?

In [None]:
# Detect the language of a random sample of place names.
# Both sources of randomness are seeded so the cell gives the same result on
# every run: pandas draws the same sample, and langdetect (whose detector is
# randomised internally) returns stable guesses.
langdetect.DetectorFactory.seed = 0

# str.translate deletes every ASCII punctuation character literally. A regex
# character class built from string.punctuation would need escaping and an
# explicit regex=True to behave the same across pandas versions.
punctuation_table = str.maketrans('', '', string.punctuation)
name_data = (df_train['name']
             .dropna()
             .sample(10000, random_state=0)
             .str.translate(punctuation_table)
             .str.lower())

languages = []
for item in name_data:
    if item.strip() == '':
        # Nothing is left once punctuation is removed: no text to classify.
        continue
    try:
        languages.append(detect(item))
    except langdetect.LangDetectException:
        # langdetect could not extract any features from this name; show it and move on.
        print(item)

In [None]:
fig, ax = plt.subplots(figsize=(12,8))
sns.countplot(y = languages, ax=ax, orient='v')
ax.set_xlabel("count", fontsize=18)
ax.set_ylabel("language", fontsize=18)
ax.tick_params(labelsize=10)
plt.show()

In [None]:
type(languages)

In [None]:
languages.count('en')/len(languages)

So roughly 20% of the sampled data is in English.

### `country` 

Data availability


In [None]:
# Percentage of entries per country (value_counts ignores rows without a
# country), keeping the ten largest and sorting ascending so that the biggest
# share is drawn as the top bar.
country_counts = df['country'].value_counts()
country_stats = (country_counts*100/country_counts.sum()).head(10).sort_values()
print(country_stats.shape)

# Every bar is grey except the last one (the largest share), shown in red.
color = ["gray"]*len(country_stats.index)
color[-1] = "red"

fig, ax = plt.subplots(figsize=(8,6))
country_stats.plot.barh(ax = ax, color = color)
ax.set_title("Available Data by Countries (top 10)", fontsize = 18)
ax.set_xlabel('Percentages (%)', fontsize = 18)
ax.set_ylabel('country', fontsize = 18)
plt.show()

### `point_of_interest`

In [None]:
df['point_of_interest'].value_counts()

In [None]:
tmp = df[df['point_of_interest']=='P_399ab9d64f2a2e'].sort_values(by='point_of_interest')
tmp.head()

In [None]:
tmp = df[df['point_of_interest']=='P_ce9291000a8f0b'].sort_values(by='point_of_interest')
tmp.head()

# The US data in `train.csv`

We focus on the US data in `train.csv` to have a smaller data set.

In [None]:
df_us = df[df['country']=='US']

In [None]:
df_us.info()

## Missing values

In [None]:
# Percentage of missing entries per feature in the US subset.
# `df_us` is read directly instead of rebinding the shared name `df`, so cells
# above that use `df` keep operating on the full training set when re-run.
null_counts = df_us.isna().sum()
missing = null_counts/df_us.shape[0]*100
print(missing.sort_values())

# Plot the percentages of missing values for each feature
import matplotlib.ticker as mtick
tmp = null_counts.sort_values()
fig, ax = plt.subplots(figsize=(8,6))

y = tmp/len(df_us)*100
ax.barh(tmp.index, y, color = 'gray')
ax.set_title("Missing values", fontsize=18)
ax.axvline(x=60,color='r',linestyle='--')  # dashed reference line at 60 %
# ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.tick_params(axis='both', labelsize=15 )
ax.set_ylabel('features', fontsize=18)
ax.set_xlabel('percentages (%)', fontsize=18)
ax.set_xlim(0,100)

# Annotate every bar; values below 1 % get six decimals so they stay readable.
for index, value in enumerate(y):
    if value < 1:
        ax.text(value, index, '{:.6f}'.format(value), color = 'blue', fontweight = 'bold')
    else:
        ax.text(value, index, '{:5.1f}'.format(value), color = 'blue', fontweight = 'bold')
plt.show()

The `url` column should be dropped because about 60% of its values are missing.

In [None]:
df_train[df_train['country'].isna()].head(3)

In [None]:
df_train[df_train['name'].isna()].head(3)

## Features in the US set

### `latitude` and `longitude`

In [None]:
fig, ax = plt.subplots(figsize=(40,10))
ax.scatter(df_us['longitude'],df_us['latitude'])
ax.set_xlabel('longitude',fontsize=40)
ax.set_ylabel('latitude',fontsize=40)
ax.tick_params(axis='both', labelsize=40)
ax.set_title('US data',fontsize=40)
ax.grid()
plt.show()

In [None]:
df_us[df_us['longitude']>-50]

### `name`

In [None]:
# # Takes a few minutes.
# df = df_us
# flag = 0
# lang = []
# for row in df['name'].dropna():
#     if flag > 20:
#         break
#     try:
#         lang.append(detect(row))
#     except:
#         flag += 1
#         language = "error"
#         print("This row throws and error:", row)
# print(set(lang))

In [None]:
tmp = df_us['name'].sort_values()
tmp

### `point_of_interest`

Notes on the `point_of_interest` feature:
https://www.kaggle.com/competitions/foursquare-location-matching/discussion/318967#1783581

In [None]:
print('Number of (unique) POIs : {:8d}'.format(len(df_train['point_of_interest'].unique())))
print('Number of (unique) ids  : {:8d}'.format(len(df_train['id'].unique())))

In [None]:
df_us['point_of_interest'].value_counts().head(10)

In [None]:
# Noise: same POI with different physical places; sorted by 'name'.
df_us.loc[df_us['point_of_interest']=='P_399ab9d64f2a2e',
          ['name','latitude','longitude','city','state','zip','categories','point_of_interest']].sort_values(by='name').head(10)

In [None]:
# Noise: same POI with different physical places; sorted by 'state'.
df_us.loc[df_us['point_of_interest']=='P_399ab9d64f2a2e',
          ['name','latitude','longitude','city','state','zip','categories','point_of_interest']].sort_values(by='state').head(10)

In [None]:
# Noise: same POI with different physical places; sorted by 'city'.
df_us.loc[df_us['point_of_interest']=='P_399ab9d64f2a2e',
          ['name','latitude','longitude','city','state','zip','categories','point_of_interest']].sort_values(by='city').head(10)
# df_us[df_us['name']=='CVS'].sort_values(by='point_of_interest').head()

### `state`

In [None]:
df_us['state'].unique()

In [None]:
# Drop these.
filter_set = ['国外','UK','CE','ON/NY','Capital Region of Denmark','BCN','Tamaulipas','NU']
df_us[df_us['state'].isin(filter_set)]

### `categories`

In [None]:
df_us.categories.sample(30)

# Exploring and Manipulating `pairs.csv`

The file `pairs.csv` is a subset of pairs from `train.csv`. The value of `match` is "True" if and only if the pair has the same `point_of_interest` (POI) value in `train.csv`. We will focus on the US pairs. 

## Loading the Pairs Dataset

In [None]:
# # Tim loading
# df_pairs = pd.read_csv(r'C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\data_raw\pairs.csv')
# Yu loading
df_pairs = pd.read_csv('../../code-2022/final_project/data-foursquare-location-matching/pairs.csv')

In [None]:
df_pairs.head()

In [None]:
print('There are', len(df_pairs), 'pairs,', len(set(df_pairs.id_1)), 'id_1s and', len(set(df_pairs.id_2)), 'id_2s.\n')

In [None]:
df_pairs.info()

In [None]:
df_pairs.describe()

## Feature analysis

---

The available pairs of features from `train.csv` include

```
'name', 'latitude', 'longitude', 'address', 'city', 'state',
       'zip', 'country', 'url', 'phone', 'categories'
```

In [None]:
df_pairs.columns

### True vs False in the `match` feature

In [None]:
df_pairs.groupby('match')['match'].count()

### latitude and longitude


---
We compute the difference for (latitude, longitude) of each pair in `pairs.csv`. It turns out 
  * locations that are close (difference less than 0.001) can have different POIs: stores are next to each other in a shopping center. In some extreme cases, places with identical coordinates can have different POIs. For example, one classroom can be located vertically on top of another.  
  * conversely, locations physically far can represent the same POI, such as mountains.

In [None]:
# L-infinity (Chebyshev) difference between the two coordinates of every pair:
# the larger of |latitude_1 - latitude_2| and |longitude_1 - longitude_2|.
lat_gap = (df_pairs['latitude_1'] - df_pairs['latitude_2']).abs()
lon_gap = (df_pairs['longitude_1'] - df_pairs['longitude_2']).abs()
df_pairs['location_diff'] = pd.concat([lat_gap, lon_gap], axis=1).max(axis=1)

df_pairs['location_diff'].describe()

In [None]:
fig, ax = plt.subplots(figsize = (8,6))
sns.stripplot(x="match", y="location_diff", data=df_pairs.loc[df_pairs.location_diff<0.001], ax=ax)
ax.set_ylabel(r"location difference", fontsize=18)
ax.set_xlabel("match", fontsize=18)
ax.tick_params(labelsize=14)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize = (8,6))
sns.stripplot(x="match", y="location_diff", data=df_pairs.loc[df_pairs.location_diff==0],ax=ax)
ax.set_ylabel(r"location difference", fontsize=18)
ax.set_xlabel("match", fontsize=18)
ax.tick_params(labelsize=14)
plt.show()

In [None]:
print('There are',len(df_pairs.loc[(df_pairs.location_diff==0) & (df_pairs.match==False)]),
      'df_pairs with location_diff=0 but different POIs, out of all',len(df_pairs),'df_pairs.')

### name

  * There are some true pairs with slightly different names, e.g. short name v.s. full name.
  * Conversely, there are false pairs with the same name, e.g. chain stores in different cities.

In [None]:
df_pairs.loc[df_pairs.match==True].values[0]

In [None]:
# Example of a *false* pair whose two names are identical (a chain store):
# the filter keeps pairs named "McDonald's" on both sides that are NOT a match.
# head(1).values shows every field of the first hit and stays empty, rather
# than raising IndexError, when no such pair exists.
df_pairs.loc[(df_pairs.name_1==df_pairs.name_2)&(df_pairs.name_1=="McDonald's")&(df_pairs.match==False)].head(1).values

### address

### Filling NAs and  Making Combined Full Addresses

In [None]:
# Missing categories become empty strings so that the later text comparison can
# recognise them with an `== ''` test.
# (Both combined address columns are built together in the next cell.)
df_pairs['categories_1']   = df_pairs['categories_1'].fillna('')
df_pairs['categories_2']   = df_pairs['categories_2'].fillna('')

In [None]:
# Combined address text for each side of the pair: address, city, state, zip
# and country joined by single spaces. Blanks left behind by missing parts are
# collapsed, so an entry with no address information at all yields '' (which
# the later empty-string check looks for) instead of a run of spaces.
address_parts = ['address', 'city', 'state', 'zip', 'country']
for side in ['1', '2']:
    pieces = [df_pairs[part + '_' + side].fillna('').astype(str) for part in address_parts]
    joined = pieces[0].str.cat(pieces[1:], sep=' ')
    df_pairs['full_address_' + side] = joined.str.split().str.join(' ')

### Reducing to only the columns that seem useful (mostly based on the sheer number of NaNs)

In [None]:
column_list = ['id_1','name_1', 'latitude_1', 'longitude_1', 'country_1', 'full_address_1', 'categories_1', 'id_2','name_2', 'latitude_2', 'longitude_2', 'country_2', 'full_address_2', 'categories_2', 'match']

In [None]:
# Take an explicit copy: new columns are added to `pairs_reduced` further down,
# and assigning into a plain column selection of `df_pairs` triggers pandas'
# SettingWithCopyWarning.
pairs_reduced = df_pairs[column_list].copy()

In [None]:
pairs_reduced.head()

In [None]:
pairs_reduced.info()

### Calculating angular difference of latitude and longitude

In [None]:
# pairs_reduced['theta_diff'] = np.arccos(np.sin(np.radians(pairs_reduced['latitude_1']))*np.sin(np.radians(pairs_reduced['latitude_2']))+
#                                        np.cos(np.radians(pairs_reduced['latitude_1']))*np.cos(np.radians(pairs_reduced['latitude_2']))*
#                                         np.cos(np.radians(pairs_reduced['longitude_1']-pairs_reduced['longitude_2']))
#                                        )

In [None]:
# Central angle (in radians) between the two coordinates of every pair,
# computed with the arctangent form of the great-circle formula.
lat_1 = np.radians(pairs_reduced['latitude_1'])
lat_2 = np.radians(pairs_reduced['latitude_2'])
lon_delta = np.radians(pairs_reduced['longitude_1'] - pairs_reduced['longitude_2'])

# Numerator: square root of a sum of squares, hence always >= 0.
cross_term = np.sqrt(
    (np.cos(lat_2)*np.sin(lon_delta))**2 +
    (np.cos(lat_1)*np.sin(lat_2) - np.sin(lat_1)*np.cos(lat_2)*np.cos(lon_delta))**2
)
# Denominator: becomes negative once the two points are more than 90 degrees apart.
dot_term = np.sin(lat_1)*np.sin(lat_2) + np.cos(lat_1)*np.cos(lat_2)*np.cos(lon_delta)

# arctan2 keeps the quadrant and needs no division. abs(arctan(cross/dot))
# would report an angle t above 90 degrees as 180 - t and would divide by zero
# at exactly 90 degrees. With cross_term >= 0 the result lies in [0, pi].
pairs_reduced['theta_diff'] = np.arctan2(cross_term, dot_term)

In [None]:
# pairs_reduced['theta_diff'] = np.sqrt((np.radians(pairs_reduced['longitude_1'])-np.radians(pairs_reduced['longitude_2'])*np.cos(np.radians(pairs_reduced['latitude_1'])))**2 +
#         (np.radians(pairs_reduced['latitude_1'])-np.radians(pairs_reduced['latitude_2']))**2
#        )

In [None]:
# pairs_reduced['theta_diff'] = (np.radians(pairs_reduced['longitude_1'])-np.radians(pairs_reduced['longitude_2']))**2 + (np.radians(pairs_reduced['latitude_1'])-np.radians(pairs_reduced['latitude_2']))**2

In [None]:
pairs_reduced[pairs_reduced['theta_diff'].isna()]

# Reducing `pairs.csv` to only US data

In [None]:
len(pairs_reduced[(pairs_reduced['country_1'] == 'US') & (pairs_reduced['country_2'] == 'US')])

Dropping no longer needed columns

In [None]:
pairs_red_us = pairs_reduced[(pairs_reduced['country_1'] == 'US') & (pairs_reduced['country_2'] == 'US')]

In [None]:
pairs_red_us = pairs_red_us.drop(['country_1','country_2', 'latitude_1', 'longitude_1', 'latitude_2', 'longitude_2'], axis = 1)

In [None]:
pairs_red_us.head()

It's probably in my best interest to lower case all of the strings when doing the string matching aspect

In [None]:
# Normalise the six text columns: missing values become '' first (astype(str)
# on its own would turn NaN into the literal string 'nan', which the later
# `== ''` checks would not recognise as missing), then everything is cast to
# str and lower-cased.
text_columns = ['name_1', 'name_2',
                'full_address_1', 'full_address_2',
                'categories_1', 'categories_2']
for text_column in text_columns:
    pairs_red_us[text_column] = pairs_red_us[text_column].fillna('').astype(str).str.lower()

In [None]:
pairs_red_us.head()

I think I can drop ids as well

In [None]:
pairs_red_us = pairs_red_us.drop(['id_1', 'id_2'], axis = 1)

In [None]:
len(pairs_red_us['name_1'])

In [None]:
len(pairs_red_us['name_1'][pairs_red_us['name_1'].isna()])

In [None]:
pairs_red_us['name_1'].tolist()[1:100]

Now following the article "Calculating String Similarity in Python"

In [None]:
pairs_red_us['name_1']

In [None]:
# Strip ASCII punctuation from the six text columns.
# str.translate deletes the characters literally. A regex character class built
# from string.punctuation would need escaping (its backslash otherwise escapes
# the following ']') and an explicit regex=True, because str.replace treats the
# pattern as a literal string by default in pandas 2.x.
punctuation_table = str.maketrans('', '', string.punctuation)
for text_column in ['name_1', 'name_2',
                    'full_address_1', 'full_address_2',
                    'categories_1', 'categories_2']:
    pairs_red_us[text_column] = pairs_red_us[text_column].str.translate(punctuation_table)

In [None]:
pairs_red_us['name_1'].iloc[0]

In [None]:
pairs_red_us.head()

In [None]:
pairs_red_us = pairs_red_us.reset_index().drop('index', axis = 1)

In [None]:
pairs_red_us.head()

In [None]:
pairs_red_us.to_csv(r'C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_reduced_us.csv')

In [None]:
vectors = CountVectorizer().fit_transform([pairs_red_us['name_1'].iloc[0], pairs_red_us['name_2'].iloc[0]]).toarray()

In [None]:
csim = cosine_similarity(vectors)

In [None]:
csim

In [None]:
csim[0][1]

In [None]:
range(len(pairs_red_us.iloc[:]))

In [None]:
# Word-count cosine similarity between the two names of every pair.
# -1 marks pairs that cannot be compared: an empty name on either side, or no
# token that CountVectorizer keeps in either name (fit_transform raises
# ValueError for an empty vocabulary).
# Iterating over the two columns with zip avoids one `.iloc[i]` lookup per
# value, which is very slow on a large frame.
name_cosines = []
for name_a, name_b in zip(pairs_red_us['name_1'], pairs_red_us['name_2']):
    csim = -1
    if name_a != '' and name_b != '':
        try:
            csim = cosine_similarity(CountVectorizer().fit_transform([name_a, name_b]).toarray())[0][1]
        except ValueError:
            pass  # empty vocabulary: keep the -1 sentinel
    name_cosines.append(csim)

In [None]:
len(name_cosines)

In [None]:
# Assign the list positionally (it has one entry per row). Wrapping it in a
# DataFrame would align on index labels and produce NaN wherever the frame's
# index is not exactly 0..n-1.
pairs_red_us['name_cosines'] = name_cosines

In [None]:
pairs_red_us.info()

In [None]:
pairs_red_us[pairs_red_us['name_cosines'].isna()]

In [None]:
pairs_red_us[pairs_red_us['categories_2'].isna()]

In [None]:
pairs_red_us.head()

In [None]:
# Word-count cosine similarity between the two combined addresses of every pair.
# -1 marks pairs that cannot be compared: an empty address on either side, or no
# token that CountVectorizer keeps in either address (fit_transform raises
# ValueError for an empty vocabulary).
address_cosines = []
for address_a, address_b in zip(pairs_red_us['full_address_1'], pairs_red_us['full_address_2']):
    csim = -1
    if address_a != '' and address_b != '':
        try:
            csim = cosine_similarity(CountVectorizer().fit_transform([address_a, address_b]).toarray())[0][1]
        except ValueError:
            pass  # empty vocabulary: keep the -1 sentinel
    address_cosines.append(csim)

# Positional assignment: one list entry per row, independent of the index labels.
pairs_red_us['full_address_cosines'] = address_cosines

In [None]:
pairs_red_us.head()

In [None]:
# Word-count cosine similarity between the two category strings of every pair.
# -1 marks pairs that cannot be compared: empty categories on either side, or no
# token that CountVectorizer keeps in either string (fit_transform raises
# ValueError for an empty vocabulary).
categories_cosines = []
for categories_a, categories_b in zip(pairs_red_us['categories_1'], pairs_red_us['categories_2']):
    csim = -1
    if categories_a != '' and categories_b != '':
        try:
            csim = cosine_similarity(CountVectorizer().fit_transform([categories_a, categories_b]).toarray())[0][1]
        except ValueError:
            pass  # empty vocabulary: keep the -1 sentinel
    categories_cosines.append(csim)

# Positional assignment: one list entry per row, independent of the index labels.
pairs_red_us['categories_cosines'] = categories_cosines

In [None]:
pairs_red_us.head()

In [None]:
pairs_final_diffed_us = pairs_red_us[['theta_diff', 'name_cosines', 'full_address_cosines', 'categories_cosines', 'match']]

In [None]:
pairs_final_diffed_us

# Exploring the transformed data

In [None]:
pairs_final_diffed_us['theta_diff']

In [None]:
pairs_final_diffed_us.describe()

In [None]:
# import seaborn as sns

In [None]:
pairs_corr = pairs_final_diffed_us.corr()

In [None]:
sns.heatmap(pairs_corr)

In [None]:
pairs_corr

In [None]:
pairs_final_diffed_us['theta_diff'][pairs_final_diffed_us['match']==True].describe()

In [None]:
pairs_final_diffed_us['theta_diff'][pairs_final_diffed_us['match']==False].describe()

In [None]:
pairs_final_diffed_us.describe()

In [None]:
pairs_final_diffed_us.head()

In [None]:
pairs_final_diffed_us.to_csv(r'C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_final_diffed_us.csv')

## Optional Scaling

In [None]:
pairs_final_diffed_us = pd.read_csv(r'C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_final_diffed_us.csv')

In [None]:
pairs_final_diffed_us = pairs_final_diffed_us.drop('Unnamed: 0', axis =1)

In [None]:
pairs_final_diffed_us.head()

In [None]:
pairs_final_diffed_us.describe()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the four numeric feature columns in place. The scaler is fitted
# on these same columns and stays available afterwards as `scaler`.
cols_to_scale = ['theta_diff', 'name_cosines', 'full_address_cosines', 'categories_cosines']
scaler = StandardScaler()
pairs_final_diffed_us[cols_to_scale] = scaler.fit_transform(pairs_final_diffed_us[cols_to_scale])

In [None]:
pairs_final_diffed_us.describe()

In [None]:
pairs_final_diffed_us.to_csv(r'C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_final_diffed_us_scaled.csv', index = False)

## Comparing string features

Most columns in this data are strings. We consider different metrics for string comparison letter by letter, including:

*   `SequenceMatcher`
*   `Levenshtein distance`

or word by word using a `CountVectorizer` and `cosine_similarity`

If one value of a pair of features is missing, we set the distance to be -1.



In [None]:
from difflib import SequenceMatcher
import math
# Reference: https://stackoverflow.com/questions/43916271/python-pandas-sequencematch-columns-for-each-value-and-return-closet-match

def seq_metric(df, col1, col2):
    """Distance between two text fields of one row, based on difflib's SequenceMatcher.

    Parameters
    ----------
    df : mapping-like row
        Indexed with ``col1`` and ``col2`` (e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in).
    col1, col2 : str
        Names of the two fields to compare.

    Returns
    -------
    float or int
        ``1 - SequenceMatcher.ratio()`` of the normalised strings (0 means
        identical), or ``-1`` when either value is not a string (e.g. NaN) or
        is empty after normalisation.
    """
    a = df[col1]
    b = df[col2]

    if not isinstance(a, str) or not isinstance(b, str):
        return -1

    # Compare case-insensitively and ignore spaces and apostrophes.
    a = a.lower().replace(" ", "").replace("'", "")
    b = b.lower().replace(" ", "").replace("'", "")

    # An empty string carries no information. SequenceMatcher gives '' vs '' a
    # ratio of 1.0, which would make two missing values look like a perfect match.
    if not a or not b:
        return -1

    return 1-SequenceMatcher(None, a, b).ratio()

In [None]:
# !pip install python-Levenshtein

from Levenshtein import distance as lev
def lev_metric(df, col1, col2):
    """Normalised Levenshtein distance between two text fields of one row.

    Parameters
    ----------
    df : mapping-like row
        Indexed with ``col1`` and ``col2`` (e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in).
    col1, col2 : str
        Names of the two fields to compare.

    Returns
    -------
    float or int
        Edit distance divided by the length of the longer normalised string
        (0 means identical), or ``-1`` when either value is not a string
        (e.g. NaN) or is empty after normalisation.
    """
    a = df[col1]
    b = df[col2]

    if not isinstance(a, str) or not isinstance(b, str):
        return -1

    # Compare case-insensitively and ignore spaces and apostrophes.
    a = a.lower().replace(" ", "").replace("'", "")
    b = b.lower().replace(" ", "").replace("'", "")

    # Treat an empty string as missing. This also prevents the division by
    # zero that max(len(a), len(b)) == 0 would cause when both are empty.
    if not a or not b:
        return -1

    return lev(a,b)/max(len(a),len(b))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def countvec_metric(df, col1, col2):
    """Word-count cosine similarity between two text fields of one row.

    Unlike ``seq_metric`` and ``lev_metric`` this returns a *similarity*: the
    value comes straight from ``cosine_similarity``, so identical token counts
    give 1 rather than 0.

    Parameters
    ----------
    df : mapping-like row
        Indexed with ``col1`` and ``col2`` (e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in).
    col1, col2 : str
        Names of the two fields to compare.

    Returns
    -------
    float or int
        Cosine similarity of the two CountVectorizer count vectors, or ``-1``
        when either value is not a string (e.g. NaN), is blank after
        normalisation, or when CountVectorizer finds no token in either string.
    """
    a = df[col1]
    b = df[col2]
    if not isinstance(a, str) or not isinstance(b, str):
        return -1

    # Compare case-insensitively and ignore apostrophes.
    a = a.lower().replace("'", "")
    b = b.lower().replace("'", "")

    # A blank string is treated as missing.
    if not a.strip() or not b.strip():
        return -1

    try:
        counts = CountVectorizer().fit_transform([a,b])
    except ValueError:
        # Empty vocabulary: neither string contains a token that the vectorizer keeps.
        return -1
    return cosine_similarity(counts.toarray())[0][1]

In [None]:
# The following function allows us to apply different metric on the columns of pairs.csv to generate distance between features.
def compute_diffs(df, cols, string_metric):
    """Apply a row-wise string metric to paired columns and collect the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairs table. It must contain ``id_1``, ``id_2``, ``match`` and
        ``location_diff``, plus ``<col>_1`` and ``<col>_2`` for every entry of
        ``cols``. It is not modified.
    cols : list of str
        Base names of the paired columns to compare (e.g. ``'name'``).
    string_metric : callable
        Called as ``string_metric(row, col1, col2)`` for every row.

    Returns
    -------
    pandas.DataFrame
        New frame with ``id_1``, ``id_2``, ``match``, ``location_diff`` and one
        ``<col>_diff`` column per entry of ``cols``, in that order.
    """
    # Start from a copy of the identifying columns so the caller's frame is left
    # untouched; successive calls with different metrics then cannot overwrite
    # each other's `_diff` columns inside `df`.
    result = df[['id_1','id_2','match','location_diff']].copy()
    for col in cols:
        result[col+'_diff'] = df.apply(string_metric,
                                       args=(col+'_1', col+'_2'),
                                       axis=1)
    return result

We now construct the new dataframe with the selected features and the chosen method to compute a metric between the features.

In [None]:
# Using SequenceMatcher

columns = ['name', 'address', 'city', 'state',
       'zip', 'url', 'phone', 'categories']
       
df_pairs_seq = compute_diffs(df_pairs, columns, seq_metric)

In [None]:
# Using Levenshtein distance

columns = ['name', 'address', 'city', 'state',
       'zip', 'url', 'phone', 'categories']
       
df_pairs_lev = compute_diffs(df_pairs, columns, lev_metric)

In [None]:
# Using CountVectorizer

columns = ['name']#, 'address', 'city', 'state', 'zip', 'url', 'phone', 'categories']
       
df_pairs_count = compute_diffs(df_pairs, columns, countvec_metric)

## Histogram

In [None]:
## histograms for SequenceMatcher
# One figure per feature, comparing true and false pairs whose coordinates
# differ by less than 1. Both histograms are semi-transparent; drawn opaque,
# the second one would hide the first wherever they overlap.

columns = ['location_diff','name_diff','address_diff','city_diff','zip_diff','url_diff', 'phone_diff', 'categories_diff']

# The row filters do not depend on the plotted column, so build them once.
close_true = (df_pairs_seq.match==True)&(df_pairs_seq.location_diff<1)
close_false = (df_pairs_seq.match==False)&(df_pairs_seq.location_diff<1)

for column in columns:
    plt.hist(df_pairs_seq.loc[close_true, column], color='b', alpha=0.5, label="True")
    plt.hist(df_pairs_seq.loc[close_false, column], color='r', alpha=0.5, label="False")
    plt.legend(fontsize=14)
    plt.title(column,fontsize=16)
    plt.show()

In [None]:
# Histogram for Levenshtein distance
# One figure per feature, comparing true and false pairs whose coordinates
# differ by less than 1. Both histograms are semi-transparent; drawn opaque,
# the second one would hide the first wherever they overlap.

columns = ['location_diff','name_diff','address_diff','city_diff','zip_diff','url_diff', 'phone_diff', 'categories_diff']

# The row filters do not depend on the plotted column, so build them once.
close_true = (df_pairs_lev.match==True)&(df_pairs_lev.location_diff<1)
close_false = (df_pairs_lev.match==False)&(df_pairs_lev.location_diff<1)

for column in columns:
    plt.hist(df_pairs_lev.loc[close_true, column], color='b', alpha=0.5, label="True")
    plt.hist(df_pairs_lev.loc[close_false, column], color='r', alpha=0.5, label="False")
    plt.legend(fontsize=14)
    plt.title(column,fontsize=16)
    plt.show()

# To improve (to be filled)



1.   Generate more pairs: mention how many pairs there are blabla
2.   Consider places in other countries blabla
3.   Filling in missing values: for example, zip codes can be obtained from the coordinates.
4.   Balance the dataset: change the ratio of true/false pairs blabla.

