Authors: Tim Gorman, Yu Cao, Ling Zhou

In [None]:
import pandas as pd
import numpy as np
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# from googletrans import Translator, constants
import seaborn as sns
import langdetect
from langdetect import detect
import matplotlib.pyplot as plt

from shapely.geometry import Point
from geopandas import GeoDataFrame
import geopandas as gpd

Note: To install geopandas on Windows, open a terminal in Jupyter and run the following commands:

pip install pipwin <br>
pipwin install gdal <br>
pipwin install fiona <br>
pip install geopandas

# 1. Exploring train.csv

In [None]:
df_train  = pd.read_csv("../../data_raw/train.csv")

In [None]:
df_train.sample(3)

In [None]:
df_train.info()

### Missing values in the train set
- Every row has the features: `id`, `latitude`, `longitude`, `point_of_interest`
- Other features have missing values
- Features that have fewer missing values: `name`, `country`, `categories`

In [None]:
# Share of missing entries per column, in percent, displayed in ascending order
n_rows = df_train.shape[0]
missing = df_train.isna().sum().div(n_rows).mul(100)
missing.sort_values()

In [None]:
# Horizontal bar chart of the percentage of missing values for each feature.
# The four columns listed in `excluded` are left out of the chart.
import matplotlib.ticker as mtick

excluded = ['id', 'latitude', 'longitude', 'point_of_interest']
tmp = df_train.drop(columns=excluded).isna().sum().sort_values()
missing_pct = tmp / len(df_train) * 100

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(missing_pct.index, missing_pct)
# Render the x ticks as percentages
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_ylabel('features', fontsize=18)
ax.set_title("(Most) missing values (percentages)", fontsize=18)
ax.tick_params(axis='both', labelsize=15)
plt.show()

In [None]:
# Show a few rows whose `country` value is missing
df_train[df_train['country'].isna()].head(3)

In [None]:
# Show every row whose `name` value is missing
df_train[df_train['name'].isna()]

The `id` feature is unique:

In [None]:
# True when no `id` value occurs more than once
df_train['id'].is_unique

In [None]:
# Count distinct POIs and ids (a missing value, if any, counts as one distinct value)
n_pois = df_train['point_of_interest'].nunique(dropna=False)
n_ids = df_train['id'].nunique(dropna=False)
print('Number of (unique) POIs : {:8d}'.format(n_pois))
print('Number of (unique) ids  : {:8d}'.format(n_ids))

#### Notes on the `point_of_interest` feature:

https://www.kaggle.com/competitions/foursquare-location-matching/discussion/318967#1783581

In [None]:
# For US rows, print how many lack a zip, a state, a city, and all three at once
mask = (df_train['country'] == 'US')
zip_missing = df_train['zip'].isna()
state_missing = df_train['state'].isna()
city_missing = df_train['city'].isna()
all_missing = zip_missing & state_missing & city_missing
for condition in (zip_missing, state_missing, city_missing, all_missing):
    print(len(df_train[condition & mask]))

### Latitude and longitude
The data are concentrated in the US and Europe:

In [None]:
# Scatter plot of every training row by longitude (x) and latitude (y)
big_font = 40
fig, ax = plt.subplots(figsize=(40, 25))
ax.scatter(df_train['longitude'], df_train['latitude'])
ax.set_xlabel('longitude', fontsize=big_font)
ax.set_ylabel('latitude', fontsize=big_font)
ax.tick_params(axis='both', labelsize=big_font)
ax.grid()
plt.show()

In [None]:
# Build one point geometry per training row. `points_from_xy` creates the
# geometries in a single vectorised call instead of constructing a Python
# `Point` object per row in a list comprehension.
geometry = gpd.points_from_xy(df_train['longitude'], df_train['latitude'])
gdf = GeoDataFrame(df_train.copy(), geometry=geometry)

# This is a simple world map that ships with geopandas.
# NOTE(review): the bundled `gpd.datasets` module is deprecated in recent
# geopandas releases (and removed in 1.0) -- confirm the installed version
# still provides it.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Draw the training points in red on top of the world map
ax = gdf.plot(ax=world.plot(figsize=(20, 12)), marker='o', color='red', markersize=10)
plt.show()

## How Many Languages are Present?

In [None]:
# Detect the language of a random sample of 10000 venue names.
#
# Punctuation is removed with a translation table instead of
# `str.replace('[...]', '')`: the default of `regex` in `Series.str.replace`
# differs between pandas versions (with regex=False the pattern is searched
# literally and nothing is stripped), and the hand-built character class did
# not match the backslash. `str.translate` removes every character of
# `string.punctuation` regardless of pandas version.
punctuation_table = str.maketrans('', '', string.punctuation)
name_data = (
    df_train['name']
    .sample(10000)
    .fillna('')
    .str.translate(punctuation_table)
    .str.lower()
)

languages = []
for item in name_data:
    # Missing names were filled with '' above; there is nothing to detect
    if item == '':
        continue
    try:
        languages.append(detect(item))
    except langdetect.LangDetectException:
        # Detection failed for this name: show it and move on
        print(item)

plt.figure(figsize=(12,8))
sns.countplot(x = languages)

In [None]:
languages.count('en')/len(languages)

So roughly 20% of the sampled names are detected as English.

# Availability of data by country


In [None]:
# Percentage of rows (with a known country) per country; keep the ten most
# frequent countries, ordered ascending for plotting
country_counts = df_train['country'].value_counts()
country_stats = country_counts * 100 / country_counts.sum()
country_stats = country_stats.head(10).sort_values()
country_stats.shape

In [None]:
# Horizontal bar chart of the values in `country_stats`
fig, ax = plt.subplots(figsize=(8,6))
# All bars are gray except the last entry of `country_stats`, which is red
color = ["gray"]*len(country_stats.index)
color[-1] = "red"
country_stats.plot(kind = 'barh', ax = ax, color = color)

ax.set_title("% Available Data by Countries")
ax.set_ylabel('country')
ax.set_xlabel('Percentage')
plt.show()

## The US data

In [None]:
# Training rows whose country code is 'US'
is_us = df_train['country'].eq('US')
df_us = df_train[is_us]

In [None]:
df_us.info()

In [None]:
# Scatter plot of the US rows by longitude (x) and latitude (y)
big_font = 40
fig, ax = plt.subplots(figsize=(40, 10))
ax.scatter(df_us['longitude'], df_us['latitude'])
ax.set_title('US data', fontsize=big_font)
ax.set_xlabel('longitude', fontsize=big_font)
ax.set_ylabel('latitude', fontsize=big_font)
ax.tick_params(axis='both', labelsize=big_font)
ax.grid()
plt.show()

In [None]:
df_us[df_us['longitude']>-50]

# 2. Exploring and Manipulating pairs.csv

In [None]:
df_train.info()

## Loading the Pairs Dataset

In [None]:
# Load pairs.csv from the same relative `data_raw` directory that train.csv is
# read from, instead of a machine-specific absolute path, so the notebook runs
# on any checkout of the repository.
pairs = pd.read_csv("../../data_raw/pairs.csv")

In [None]:
pairs.head()

In [None]:
pairs.info()

In [None]:
pairs.describe()

## Filling NAs and Making Combined Full Addresses

In [None]:
# Replace missing categories with empty strings.
# `full_address_1` is no longer built here: the next cell computes it with the
# identical expression, together with `full_address_2`.
pairs['categories_1'] = pairs['categories_1'].fillna('')
pairs['categories_2'] = pairs['categories_2'].fillna('')

In [None]:
# For each side of the pair, join address, city, state, zip and country into
# one space-separated string; missing parts contribute an empty string.
for side in ('1', '2'):
    combined = pairs['address_' + side].fillna('')
    for part in ('city', 'state', 'zip', 'country'):
        combined = combined + ' ' + pairs[part + '_' + side].fillna('')
    pairs['full_address_' + side] = combined

## Reducing to only the columns that seem useful (mostly based on the sheer number of NaNs)

In [None]:
column_list = ['id_1','name_1', 'latitude_1', 'longitude_1', 'country_1', 'full_address_1', 'categories_1', 'id_2','name_2', 'latitude_2', 'longitude_2', 'country_2', 'full_address_2', 'categories_2', 'match']

In [None]:
# Keep only the selected columns. An explicit copy is taken because a new
# column (`theta_diff`) is assigned to `pairs_reduced` later; assigning to a
# plain column slice of `pairs` triggers pandas' SettingWithCopyWarning and
# leaves it ambiguous which object is modified.
pairs_reduced = pairs[column_list].copy()

In [None]:
pairs_reduced.head()

In [None]:
pairs_reduced.info()

## Calculating angular difference

In [None]:
# pairs_reduced['theta_diff'] = np.arccos(np.sin(np.radians(pairs_reduced['latitude_1']))*np.sin(np.radians(pairs_reduced['latitude_2']))+
#                                        np.cos(np.radians(pairs_reduced['latitude_1']))*np.cos(np.radians(pairs_reduced['latitude_2']))*
#                                         np.cos(np.radians(pairs_reduced['longitude_1']-pairs_reduced['longitude_2']))
#                                        )

In [None]:
# Central (great-circle) angle, in radians, between the two coordinates of each
# pair, computed from the arctangent form of the great-circle formula:
#
#   numerator   = sqrt((cos(lat2) sin(dlon))^2
#                      + (cos(lat1) sin(lat2) - sin(lat1) cos(lat2) cos(dlon))^2)
#   denominator = sin(lat1) sin(lat2) + cos(lat1) cos(lat2) cos(dlon)
#
# `arctan2(numerator, denominator)` is used instead of
# `abs(arctan(numerator / denominator))`: when the two points are more than
# 90 degrees apart the denominator is negative and the arctan/abs form returns
# pi minus the true angle. arctan2 yields the correct value in [0, pi] and
# avoids dividing by a zero denominator.
lat1 = np.radians(pairs_reduced['latitude_1'])
lat2 = np.radians(pairs_reduced['latitude_2'])
dlon = np.radians(pairs_reduced['longitude_1'] - pairs_reduced['longitude_2'])

numerator = np.sqrt(
    (np.cos(lat2) * np.sin(dlon)) ** 2
    + (np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)) ** 2
)
denominator = np.sin(lat1) * np.sin(lat2) + np.cos(lat1) * np.cos(lat2) * np.cos(dlon)

pairs_reduced['theta_diff'] = np.arctan2(numerator, denominator)

In [None]:
# pairs_reduced['theta_diff'] = np.sqrt((np.radians(pairs_reduced['longitude_1'])-np.radians(pairs_reduced['longitude_2'])*np.cos(np.radians(pairs_reduced['latitude_1'])))**2 +
#         (np.radians(pairs_reduced['latitude_1'])-np.radians(pairs_reduced['latitude_2']))**2
#        )

In [None]:
# pairs_reduced['theta_diff'] = (np.radians(pairs_reduced['longitude_1'])-np.radians(pairs_reduced['longitude_2']))**2 + (np.radians(pairs_reduced['latitude_1'])-np.radians(pairs_reduced['latitude_2']))**2

In [None]:
pairs_reduced[pairs_reduced['theta_diff'].isna()]

## Reducing to only US data

In [None]:
# Number of pairs in which BOTH places are in the US.
# The second condition must test `country_2`; testing `country_1` twice also
# counted pairs whose second place is outside the US.
len(pairs_reduced[(pairs_reduced['country_1'] == 'US') & (pairs_reduced['country_2'] == 'US')])

Dropping no longer needed columns

In [None]:
# Keep only the pairs in which BOTH places are in the US.
# The second condition must test `country_2`; testing `country_1` twice let
# pairs with a non-US second place through.
both_us = (pairs_reduced['country_1'] == 'US') & (pairs_reduced['country_2'] == 'US')
pairs_red_us = pairs_reduced[both_us]

In [None]:
# Remove the country and raw coordinate columns from the US pairs
unneeded_columns = ['country_1', 'country_2', 'latitude_1', 'longitude_1', 'latitude_2', 'longitude_2']
pairs_red_us = pairs_red_us.drop(columns=unneeded_columns)

In [None]:
pairs_red_us.head()

It's probably best to lowercase all of the strings before doing the string matching

In [None]:
# Cast the text columns to str and lowercase them for string matching.
#
# Missing values are replaced with '' BEFORE the cast: `astype(str)` on a
# missing value can produce the literal string 'nan' (depending on the pandas
# version), which the later `== ''` checks would not recognise as missing and
# which would make two missing names look identical.
text_columns = [
    'name_1', 'name_2',
    'full_address_1', 'full_address_2',
    'categories_1', 'categories_2',
]
for column in text_columns:
    pairs_red_us[column] = pairs_red_us[column].fillna('').astype(str).str.lower()

In [None]:
pairs_red_us.head()

I think I can drop the ids as well

In [None]:
# Remove the two id columns from the US pairs
pairs_red_us = pairs_red_us.drop(columns=['id_1', 'id_2'])

In [None]:
len(pairs_red_us['name_1'])

In [None]:
len(pairs_red_us['name_1'][pairs_red_us['name_1'].isna()])

In [None]:
pairs_red_us['name_1'].tolist()[1:100]

Now following the article "Calculating String Similarity in Python"

In [None]:
pairs_red_us['name_1']

In [None]:
# Remove punctuation from every text column.
#
# A translation table is used instead of `str.replace('[...]', '')`: the
# default of `regex` in `Series.str.replace` differs between pandas versions
# (with regex=False the pattern is searched literally and nothing is removed),
# and the hand-built character class did not match the backslash.
# `str.translate` deletes every character of `string.punctuation` regardless
# of pandas version.
punctuation_table = str.maketrans('', '', string.punctuation)
text_columns = [
    'name_1', 'name_2',
    'full_address_1', 'full_address_2',
    'categories_1', 'categories_2',
]
for column in text_columns:
    pairs_red_us[column] = pairs_red_us[column].str.translate(punctuation_table)

In [None]:
pairs_red_us['name_1'].iloc[0]

In [None]:
pairs_red_us.head()

In [None]:
# Renumber the rows 0..n-1, discarding the old index instead of keeping it as a column
pairs_red_us = pairs_red_us.reset_index(drop=True)

In [None]:
pairs_red_us.head()

In [None]:
# Save the cleaned US pairs to disk.
# NOTE(review): this is an absolute, machine-specific path, and the row index is
# written as an extra unnamed first column (no index=False) -- confirm that
# downstream readers expect both.
pairs_red_us.to_csv(r'C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_reduced_us.csv')

In [None]:
vectors = CountVectorizer().fit_transform([pairs_red_us['name_1'].iloc[0], pairs_red_us['name_2'].iloc[0]]).toarray()

In [None]:
csim = cosine_similarity(vectors)

In [None]:
csim

In [None]:
csim[0][1]

In [None]:
range(len(pairs_red_us.iloc[:]))

In [None]:
# Cosine similarity between the bag-of-words vectors of the two names of each
# pair. -1 is the sentinel for "cannot be compared".
#
# The columns are iterated once with zip() rather than looked up row by row
# with `.iloc[i]`, which is much slower on a large frame.
name_cosines = []
for first, second in zip(pairs_red_us['name_1'], pairs_red_us['name_2']):
    if not first.strip() or not second.strip():
        # Empty or whitespace-only text on either side: nothing to compare
        csim = -1
    else:
        try:
            counts = CountVectorizer().fit_transform([first, second])
        except ValueError:
            # CountVectorizer raises ValueError when neither string yields a
            # token (empty vocabulary), e.g. for names made only of single
            # characters; treat the pair as not comparable rather than abort.
            csim = -1
        else:
            csim = cosine_similarity(counts)[0][1]
    name_cosines.append(csim)

In [None]:
len(name_cosines)

In [None]:
# Assign the list positionally. Wrapping it in a DataFrame first makes pandas
# align on the index, which silently yields NaN wherever the index of
# `pairs_red_us` is not exactly 0..n-1.
pairs_red_us['name_cosines'] = name_cosines

In [None]:
pairs_red_us.info()

In [None]:
pairs_red_us[pairs_red_us['name_cosines'].isna()]

In [None]:
pairs_red_us[pairs_red_us['categories_2'].isna()]

In [None]:
pairs_red_us.head()

In [None]:
# Cosine similarity between the bag-of-words vectors of the two full addresses
# of each pair. -1 is the sentinel for "cannot be compared".
#
# The columns are iterated once with zip() rather than looked up row by row
# with `.iloc[i]`, which is much slower on a large frame.
address_cosines = []
for first, second in zip(pairs_red_us['full_address_1'], pairs_red_us['full_address_2']):
    if not first.strip() or not second.strip():
        # Empty or whitespace-only text (all address parts missing) on either side
        csim = -1
    else:
        try:
            counts = CountVectorizer().fit_transform([first, second])
        except ValueError:
            # CountVectorizer raises ValueError when neither string yields a
            # token (empty vocabulary); treat the pair as not comparable
            # rather than abort the whole loop.
            csim = -1
        else:
            csim = cosine_similarity(counts)[0][1]
    address_cosines.append(csim)

# Assign the list positionally; wrapping it in a DataFrame would align on the
# index and yield NaN wherever the index is not exactly 0..n-1.
pairs_red_us['full_address_cosines'] = address_cosines

In [None]:
pairs_red_us.head()

In [None]:
# Cosine similarity between the bag-of-words vectors of the two category
# strings of each pair. -1 is the sentinel for "cannot be compared".
#
# The columns are iterated once with zip() rather than looked up row by row
# with `.iloc[i]`, which is much slower on a large frame.
categories_cosines = []
for first, second in zip(pairs_red_us['categories_1'], pairs_red_us['categories_2']):
    if not first.strip() or not second.strip():
        # Empty or whitespace-only categories on either side
        csim = -1
    else:
        try:
            counts = CountVectorizer().fit_transform([first, second])
        except ValueError:
            # CountVectorizer raises ValueError when neither string yields a
            # token (empty vocabulary); treat the pair as not comparable
            # rather than abort the whole loop.
            csim = -1
        else:
            csim = cosine_similarity(counts)[0][1]
    categories_cosines.append(csim)

# Assign the list positionally; wrapping it in a DataFrame would align on the
# index and yield NaN wherever the index is not exactly 0..n-1.
pairs_red_us['categories_cosines'] = categories_cosines

In [None]:
pairs_red_us.head()

In [None]:
pairs_final_diffed_us = pairs_red_us[['theta_diff', 'name_cosines', 'full_address_cosines', 'categories_cosines', 'match']]

In [None]:
pairs_final_diffed_us

# Exploring the transformed data

In [None]:
pairs_final_diffed_us['theta_diff']

In [None]:
pairs_final_diffed_us.describe()

In [None]:
import seaborn as sns

In [None]:
pairs_corr = pairs_final_diffed_us.corr()

In [None]:
sns.heatmap(pairs_corr)

In [None]:
pairs_corr

In [None]:
pairs_final_diffed_us['theta_diff'][pairs_final_diffed_us['match']==True].describe()

In [None]:
pairs_final_diffed_us['theta_diff'][pairs_final_diffed_us['match']==False].describe()

In [None]:
pairs_final_diffed_us.describe()

In [None]:
pairs_final_diffed_us.head()

In [None]:
pairs_final_diffed_us.to_csv(r'C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_final_diffed_us.csv')

## Optional Scaling

In [None]:
pairs_final_diffed_us = pd.read_csv(r'C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_final_diffed_us.csv')

In [None]:
# Drop the index column that the CSV round trip adds as 'Unnamed: 0'.
# errors='ignore' makes the cell safe to re-run (and safe for a CSV written
# without an index) instead of raising KeyError when the column is absent.
pairs_final_diffed_us = pairs_final_diffed_us.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
pairs_final_diffed_us.head()

In [None]:
pairs_final_diffed_us.describe()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the four feature columns in place; `match` is left untouched.
# The scaler is fitted and applied to the same data in a single step.
cols_to_scale = ['theta_diff', 'name_cosines', 'full_address_cosines', 'categories_cosines']
scaler = StandardScaler()
features = pairs_final_diffed_us[cols_to_scale]
pairs_final_diffed_us[cols_to_scale] = scaler.fit_transform(features)

In [None]:
pairs_final_diffed_us.describe()

In [None]:
pairs_final_diffed_us.to_csv(r'C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_final_diffed_us_scaled.csv', index = False)