In [110]:
import json
import csv
import seaborn as sns
import os
import pandas as pd
from pandas import DataFrame, merge
import Levenshtein as lv
import numpy as np
import sklearn as sk
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, make_scorer
from sklearn import ensemble
from sklearn.model_selection import train_test_split as sksplit 
from sklearn.model_selection import cross_val_score, GridSearchCV
from imblearn.over_sampling import SMOTE
import codecs, difflib, distance

from sklearn.preprocessing import scale, PolynomialFeatures
from sklearn import linear_model as lm
from sklearn import preprocessing
import datetime
import matplotlib.pyplot as plt



In [111]:
# Input locations: labelled training data and the unlabelled competition (hold-out) data.
train_dir = 'train'
holdout_dir = 'online_competition'

locu_train_path = train_dir + '/locu_train.json'
foursquare_train_path = train_dir + '/foursquare_train.json'
matches_train_path = train_dir + '/matches_train.csv'
locu_test_path = holdout_dir + '/locu_test.json'
foursquare_test_path = holdout_dir + '/foursquare_test.json'

In [112]:
#Loading Data
# Context managers close each file handle as soon as its JSON has been parsed
# (json.load(open(...)) left the handles open until garbage collection).
with open(locu_train_path) as locu_train_file:
    locu_train = json.load(locu_train_file)
with open(foursquare_train_path) as foursquare_train_file:
    foursquare_train = json.load(foursquare_train_file)
# Known matching pairs; later cells read its locu_id / foursquare_id columns.
matches_train = pd.read_csv(matches_train_path)

In [113]:
#simultaneously processing test data
# Same loading as for the training files, with the file handles closed explicitly.
with open(locu_test_path) as locu_test_file:
    locu_test = json.load(locu_test_file)
with open(foursquare_test_path) as foursquare_test_file:
    foursquare_test = json.load(foursquare_test_file)

In [114]:
# One DataFrame per training source, built from the parsed JSON.
locu_train_df, fsq_train_df = DataFrame(locu_train), DataFrame(foursquare_train)

In [115]:
# Same construction for the hold-out (competition) sources.
locu_test_df, fsq_test_df = DataFrame(locu_test), DataFrame(foursquare_test)

In [116]:
#Check for feature variance

# Columns judged not to help classification are removed from both training sources:
# country, locality (covered by the postal code) and region.
uninformative_cols = ['country', 'locality', 'region']
for source_df in (locu_train_df, fsq_train_df):
    source_df.drop(uninformative_cols, axis=1, inplace=True)

In [117]:
# Mirror the training-set column removal on the hold-out sources.
for source_df in (locu_test_df, fsq_test_df):
    source_df.drop(['country', 'locality', 'region'], axis=1, inplace=True)

In [118]:
#Handle NAs

In [119]:
fsq_train_df.isnull().any()

id                False
latitude          False
longitude         False
name              False
phone              True
postal_code       False
street_address    False
website           False
dtype: bool

In [120]:
fsq_test_df['phone'].isnull().sum()

181

In [121]:
#remove formatting in fsq to match that in locu
# Strip "(", ")", "-" and any whitespace from the Foursquare numbers in one pass.
# Series.replace(..., regex=True) is used because the default of `regex` in
# Series.str.replace is not the same across pandas versions, which could turn the
# character class into a literal (non-matching) string.
phone_formatting = r"[()\-\s]+"
for fsq_df in (fsq_train_df, fsq_test_df):
    fsq_df['phone'] = fsq_df['phone'].replace(phone_formatting, '', regex=True)

# Encode empty phone strings as missing in all four sources (train and test).
# .loc assigns on the frame itself; the previous df['phone'][mask] = None form was
# chained assignment and triggered SettingWithCopyWarning.
for places_df in (locu_train_df, fsq_train_df, locu_test_df, fsq_test_df):
    places_df.loc[places_df['phone'] == '', 'phone'] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [122]:
locu_train_df.isnull().any()

id                False
latitude           True
longitude          True
name              False
phone              True
postal_code       False
street_address    False
website           False
dtype: bool

In [123]:
locu_train_df['latitude'].isnull().sum()

1

In [124]:
locu_train_df['longitude'].isnull().sum()

1

In [125]:
# NOTE(review): this cell used to be commented "filling NA with -1", but it fills NaN
# with NaN, i.e. missing coordinates stay missing. That behaviour is kept: a NaN
# coordinate gives a NaN haversine distance, and the later `distance < 15` filter
# drops such pairs. Confirm whether a real imputation was intended.
# np.nan replaces np.NaN (alias removed in NumPy 2.0); plain reassignment replaces
# the in-place fillna on a column selection.
locu_train_df['longitude'] = locu_train_df['longitude'].fillna(np.nan)
locu_train_df['latitude'] = locu_train_df['latitude'].fillna(np.nan)

In [126]:
#updating test
# Filling NaN with NaN leaves missing hold-out coordinates missing (effectively a no-op).
# np.nan replaces np.NaN (alias removed in NumPy 2.0); reassignment replaces the
# in-place fillna on a column selection.
locu_test_df['longitude'] = locu_test_df['longitude'].fillna(np.nan)
locu_test_df['latitude'] = locu_test_df['latitude'].fillna(np.nan)

In [127]:
locu_train_df[locu_train_df['website'] == ''].shape

(315, 8)

In [128]:
fsq_train_df[fsq_train_df['website'] == ''].shape

(488, 8)

In [129]:
#'website' is more than 50% sparse > Remove

In [130]:
# 'website' is empty for most rows in both sources, so it is removed.
for source_df in (locu_train_df, fsq_train_df):
    source_df.drop(['website'], axis=1, inplace=True)

In [131]:
# Remove 'website' from the hold-out sources as well, to keep the schemas aligned.
for source_df in (locu_test_df, fsq_test_df):
    source_df.drop(['website'], axis=1, inplace=True)

In [132]:
# Prefix every column with its source so both sides stay distinguishable after the join.
locu_cols = list(locu_train_df.columns)
fsq_cols = list(fsq_train_df.columns)

new_locu_cols = ['locu_{}'.format(col) for col in locu_cols]
new_fsq_cols = ['fsq_{}'.format(col) for col in fsq_cols]

locu_train_df.columns, fsq_train_df.columns = new_locu_cols, new_fsq_cols

In [133]:
# Same source prefixes for the hold-out frames.
locu_cols_test = list(locu_test_df.columns)
fsq_cols_test = list(fsq_test_df.columns)

new_locu_cols_test = ['locu_{}'.format(col) for col in locu_cols_test]
new_fsq_cols_test = ['fsq_{}'.format(col) for col in fsq_cols_test]

locu_test_df.columns, fsq_test_df.columns = new_locu_cols_test, new_fsq_cols_test

In [134]:
# Cartesian product of the two training sources: a constant join key on both sides
# makes the merge pair every Locu row with every Foursquare row.
for side_df in (locu_train_df, fsq_train_df):
    side_df['key'] = 0

# The helper key has no further use once the pairs exist.
locu_fsq_df = merge(locu_train_df, fsq_train_df, on='key').drop(['key'], axis=1)

In [135]:
# Cartesian product of the two hold-out sources, built the same way as for training.
for side_df in (locu_test_df, fsq_test_df):
    side_df['key'] = 0

# The helper key has no further use once the pairs exist.
locu_fsq_df_test = merge(locu_test_df, fsq_test_df, on='key').drop(['key'], axis=1)

In [136]:
locu_fsq_df.shape

(360000, 14)

In [137]:
locu_fsq_df_test.shape

(160000, 14)

In [138]:
# Pair key "<locu_id>_<foursquare_id>", built identically on the candidate pairs and
# on the known matches so the two tables can be compared by key.
locu_fsq_df['combination'] = (
    locu_fsq_df['locu_id'].astype(str).str.cat(locu_fsq_df['fsq_id'].astype(str), sep='_')
)

matches_train['combination'] = (
    matches_train['locu_id'].astype(str).str.cat(matches_train['foursquare_id'].astype(str), sep='_')
)

In [139]:
# Same "<locu_id>_<foursquare_id>" pair key for the hold-out candidate pairs
# (there is no matches table for the hold-out data).
locu_fsq_df_test['combination'] = (
    locu_fsq_df_test['locu_id'].astype(str).str.cat(locu_fsq_df_test['fsq_id'].astype(str), sep='_')
)

In [140]:
def add_label(combi, known_matches=None):
    """Return 1 if the pair key ``combi`` is a known match, otherwise 0.

    Parameters
    ----------
    combi : object
        Pair key; it is converted with ``str()`` before the lookup.
    known_matches : container of str, optional
        Keys of the matching pairs. When omitted, the keys are taken from
        ``matches_train['combination']`` and cached as a set on first use, so
        repeated calls no longer rebuild and scan a list every time.
        Re-running this definition cell resets the cache.

    Returns
    -------
    int
        1 for a known match, 0 otherwise.
    """
    if known_matches is None:
        known_matches = getattr(add_label, '_known_matches', None)
        if known_matches is None:
            # First call: snapshot the notebook-level matches table once.
            known_matches = set(matches_train['combination'])
            add_label._known_matches = known_matches
    return 1 if str(combi) in known_matches else 0

In [141]:
#add training labels from matches_train
# A candidate pair is a positive example (1) when its pair key appears in
# matches_train, otherwise it is a negative (0). The vectorised membership test
# replaces a row-wise apply that rebuilt and scanned a Python list for every pair.
locu_fsq_df['label'] = locu_fsq_df['combination'].isin(matches_train['combination']).astype(int)

In [142]:
#Class imbalance 
locu_fsq_df['label'].value_counts()/locu_fsq_df.shape[0]

0    0.999
1    0.001
Name: label, dtype: float64

In [143]:
def LevenshteinDistance_pkg(fsq,locu):
    """Levenshtein edit distance between two strings, tolerant of missing values.

    Parameters
    ----------
    fsq, locu : str or missing
        The two values to compare. ``None`` and float NaN (the marker pandas uses
        for missing entries) both count as missing.

    Returns
    -------
    int
        ``lv.distance(fsq, locu)``, or 0 when either value is missing.

    NOTE(review): a missing value yields 0, the same value as an exact match, so
    this feature alone cannot tell the two cases apart.
    """
    def _is_missing(value):
        # NaN is the only float that is not equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    if _is_missing(fsq) or _is_missing(locu):
        return 0
    return lv.distance(fsq,locu)

In [145]:
# Similarity features for every training pair. The source columns are walked in
# lockstep, one value per pair; the new columns are created in the same order as before.
fsq_names, locu_names = locu_fsq_df['fsq_name'], locu_fsq_df['locu_name']
fsq_addresses, locu_addresses = locu_fsq_df['fsq_street_address'], locu_fsq_df['locu_street_address']
fsq_phones, locu_phones = locu_fsq_df['fsq_phone'], locu_fsq_df['locu_phone']

# Edit distance between the two names (case-sensitive).
locu_fsq_df['lev_dist_name'] = [
    LevenshteinDistance_pkg(fsq_value, locu_value)
    for fsq_value, locu_value in zip(fsq_names, locu_names)
]
# Edit distance between the two street addresses.
locu_fsq_df['lev_address'] = [
    LevenshteinDistance_pkg(fsq_value, locu_value)
    for fsq_value, locu_value in zip(fsq_addresses, locu_addresses)
]
# Exact phone equality and phone edit distance.
locu_fsq_df['phone_equal'] = (locu_phones == fsq_phones)
locu_fsq_df['phone_lev'] = [
    LevenshteinDistance_pkg(locu_value, fsq_value)
    for locu_value, fsq_value in zip(locu_phones, fsq_phones)
]
# difflib similarity ratio of the lower-cased names.
locu_fsq_df['seq_dist_name'] = [
    difflib.SequenceMatcher(a=fsq_value.lower(), b=locu_value.lower()).ratio()
    for fsq_value, locu_value in zip(fsq_names, locu_names)
]

In [146]:
# Per-class (label 0 / 1) mean of each similarity feature.
metric_cols = ['seq_dist_name', 'lev_dist_name', 'lev_address', 'phone_equal', 'phone_lev']
grouped = locu_fsq_df.groupby('label')
avg_metric = grouped[metric_cols].mean()

In [147]:
avg_metric

Unnamed: 0_level_0,seq_dist_name,lev_dist_name,lev_address,phone_equal,phone_lev
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.235871,18.344589,13.73521,6e-06,2.629602
1,0.9319,2.113889,1.475,0.630556,0.161111


In [148]:
# Per-class maximum of each similarity feature.
max_metric = grouped[
    ['seq_dist_name', 'lev_dist_name', 'lev_address', 'phone_equal', 'phone_lev']
].max()

In [149]:
max_metric

Unnamed: 0_level_0,seq_dist_name,lev_dist_name,lev_address,phone_equal,phone_lev
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.0,79,62,True,11
1,1.0,28,43,True,10


In [150]:
# Per-class minimum of each similarity feature.
min_metric = grouped[
    ['seq_dist_name', 'lev_dist_name', 'lev_address', 'phone_equal', 'phone_lev']
].min()

In [151]:
min_metric

Unnamed: 0_level_0,seq_dist_name,lev_dist_name,lev_address,phone_equal,phone_lev
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0,0,False,0
1,0.3,0,0,False,0


In [152]:
#Update test set
# Same similarity features as for the training pairs, created in the same column order.
fsq_names_test, locu_names_test = locu_fsq_df_test['fsq_name'], locu_fsq_df_test['locu_name']
fsq_addresses_test = locu_fsq_df_test['fsq_street_address']
locu_addresses_test = locu_fsq_df_test['locu_street_address']
fsq_phones_test, locu_phones_test = locu_fsq_df_test['fsq_phone'], locu_fsq_df_test['locu_phone']

locu_fsq_df_test['lev_dist_name'] = [
    LevenshteinDistance_pkg(fsq_value, locu_value)
    for fsq_value, locu_value in zip(fsq_names_test, locu_names_test)
]
locu_fsq_df_test['lev_address'] = [
    LevenshteinDistance_pkg(fsq_value, locu_value)
    for fsq_value, locu_value in zip(fsq_addresses_test, locu_addresses_test)
]
locu_fsq_df_test['phone_equal'] = (locu_phones_test == fsq_phones_test)
locu_fsq_df_test['phone_lev'] = [
    LevenshteinDistance_pkg(locu_value, fsq_value)
    for locu_value, fsq_value in zip(locu_phones_test, fsq_phones_test)
]
locu_fsq_df_test['seq_dist_name'] = [
    difflib.SequenceMatcher(a=fsq_value.lower(), b=locu_value.lower()).ratio()
    for fsq_value, locu_value in zip(fsq_names_test, locu_names_test)
]

In [153]:
# Feature: Distance between locu and foursquare places from lat long values

import math

def distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the origin, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the destination, in degrees.

    Returns
    -------
    float
        Distance in km on a sphere of radius 6371 km. NaN if any input is NaN.

    NOTE: this definition rebinds the name ``distance``, which the notebook's
    import cell also binds to the ``distance`` package.
    """
    radius = 6371 # km

    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    sin_dlat = math.sin(dlat/2)
    sin_dlon = math.sin(dlon/2)
    a = sin_dlat * sin_dlat + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * sin_dlon * sin_dlon
    # Rounding can push `a` marginally above 1 for (near-)antipodal points, which
    # would make sqrt(1-a) raise ValueError. A NaN `a` is left untouched here.
    if a > 1.0:
        a = 1.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c

    return d

# Distance in km between the Locu and Foursquare coordinates of every training pair.
locu_fsq_df['distance'] = [
    distance(locu_lat, locu_lon, fsq_lat, fsq_lon)
    for locu_lat, locu_lon, fsq_lat, fsq_lon in zip(
        locu_fsq_df['locu_latitude'], locu_fsq_df['locu_longitude'],
        locu_fsq_df['fsq_latitude'], locu_fsq_df['fsq_longitude'])
]

In [154]:
# Distance in km between the Locu and Foursquare coordinates of every hold-out pair.
locu_fsq_df_test['distance'] = [
    distance(locu_lat, locu_lon, fsq_lat, fsq_lon)
    for locu_lat, locu_lon, fsq_lat, fsq_lon in zip(
        locu_fsq_df_test['locu_latitude'], locu_fsq_df_test['locu_longitude'],
        locu_fsq_df_test['fsq_latitude'], locu_fsq_df_test['fsq_longitude'])
]

In [155]:
# Per-class mean / max / min of the geographic distance and the name similarity,
# used to choose the candidate-pruning thresholds.
group_dist = locu_fsq_df.groupby(by = 'label')
dist_and_name = group_dist[['distance','seq_dist_name']]

avg_dist = dist_and_name.agg(np.mean)
print(avg_dist)

max_dist = dist_and_name.agg(np.max)
print(max_dist)

min_dist = dist_and_name.agg(np.min)
print(min_dist)

       distance  seq_dist_name
label                         
0      8.897255       0.235871
1      0.195341       0.931900
         distance  seq_dist_name
label                           
0      478.609103            1.0
1       17.807884            1.0
       distance  seq_dist_name
label                         
0      0.001135            0.0
1      0.000000            0.3


In [156]:
group_dist['distance'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,359040.0,8.897255,37.978743,0.001135,2.061748,3.808079,6.666724,478.609103
1,360.0,0.195341,1.2966,0.0,0.0,0.0,0.020045,17.807884


In [157]:
group_dist['seq_dist_name'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,359640.0,0.235871,0.094141,0.0,0.171429,0.230769,0.294118,1.0
1,360.0,0.9319,0.158098,0.3,1.0,1.0,1.0,1.0


In [158]:
# locu_fsq_df_filtered = locu_fsq_df[locu_fsq_df['distance']  < 20]
# locu_fsq_df_filtered = locu_fsq_df_filtered[locu_fsq_df_filtered['seq_dist_name']  > 0.25]

In [159]:
# Candidate pruning: keep only pairs that are less than 15 km apart and whose name
# similarity ratio exceeds 0.3. Pairs with a NaN distance fail the comparison and
# are dropped as well.
is_candidate = (locu_fsq_df['distance'] < 15) & (locu_fsq_df['seq_dist_name'] > 0.3)
# .copy() makes the result an independent frame, so the later column drops and
# additions do not operate on a slice of locu_fsq_df (SettingWithCopyWarning).
locu_fsq_df_filtered = locu_fsq_df[is_candidate].copy()
locu_fsq_df_filtered.shape

(78961, 22)

In [160]:
locu_fsq_df_filtered.label.value_counts()

0    78603
1      358
Name: label, dtype: int64

In [161]:
# #Update Test set
# locu_fsq_df_test = locu_fsq_df_test[locu_fsq_df_test['distance']  < 20]
# locu_fsq_df_test = locu_fsq_df_test[locu_fsq_df_test['seq_dist_name']  > 0.25]

In [162]:
#Update Test set
# Same pruning thresholds as for training (distance < 15 km, name ratio > 0.3).
# Hold-out pairs removed here can no longer be predicted as matches.
is_candidate_test = (locu_fsq_df_test['distance'] < 15) & (locu_fsq_df_test['seq_dist_name'] > 0.3)
# .copy() detaches the result from the unfiltered frame so later in-place edits are safe.
locu_fsq_df_test = locu_fsq_df_test[is_candidate_test].copy()

In [163]:
# The raw id / name / phone columns are no longer needed: the ids live on in
# 'combination', and names and phones are already encoded as similarity features.
# Rebinding the result avoids an in-place drop on a frame obtained by boolean filtering.
locu_fsq_df_filtered = locu_fsq_df_filtered.drop(
    ['locu_id', 'locu_name', 'locu_phone', 'fsq_id', 'fsq_name', 'fsq_phone'], axis=1)

In [164]:
# Drop the raw id / name / phone columns from the hold-out pairs too (the ids remain
# available through 'combination'). Rebinding the result avoids an in-place drop on
# a frame obtained by boolean filtering.
locu_fsq_df_test = locu_fsq_df_test.drop(
    ['locu_id', 'locu_name', 'locu_phone', 'fsq_id', 'fsq_name', 'fsq_phone'], axis=1)

In [165]:
locu_fsq_df_filtered.columns

Index(['locu_latitude', 'locu_longitude', 'locu_postal_code',
       'locu_street_address', 'fsq_latitude', 'fsq_longitude',
       'fsq_postal_code', 'fsq_street_address', 'combination', 'label',
       'lev_dist_name', 'lev_address', 'phone_equal', 'phone_lev',
       'seq_dist_name', 'distance'],
      dtype='object')

In [166]:
#feature: do the first numbers of the two street addresses start with the same digit?
# expand=False makes str.extract return a Series for the single capture group on every
# pandas version; the default changed to a DataFrame in newer releases, on which the
# following .str accessor does not exist.
# NOTE(review): .str[0] keeps only the first character of the extracted number, so
# "12 Main St" and "1 Main St" compare equal. Confirm whether the whole number was meant.
first_fsq = locu_fsq_df_filtered.fsq_street_address.str.extract(r'(\d+)', expand=False).str[0]
first_loc = locu_fsq_df_filtered.locu_street_address.str.extract(r'(\d+)', expand=False).str[0]
# Addresses without any digit give NaN, and NaN == NaN is False, so they count as "no match".
locu_fsq_df_filtered['FirstAdrrDigit'] = (first_fsq==first_loc)

  after removing the cwd from sys.path.
  """


In [167]:
# Same feature for test
# expand=False keeps str.extract returning a Series on every pandas version, so the
# following .str[0] (first character of the first number in the address) keeps working.
first_fsqt = locu_fsq_df_test.fsq_street_address.str.extract(r'(\d+)', expand=False).str[0]
first_loct = locu_fsq_df_test.locu_street_address.str.extract(r'(\d+)', expand=False).str[0]
# Addresses without any digit give NaN, and NaN == NaN is False, so they count as "no match".
locu_fsq_df_test['FirstAdrrDigit'] = (first_fsqt==first_loct)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [168]:
# The raw street-address strings are dropped now that the address features
# (lev_address, FirstAdrrDigit) have been derived from them.
# Rebinding the result avoids an in-place drop on a frame obtained by boolean filtering.
locu_fsq_df_filtered = locu_fsq_df_filtered.drop(['fsq_street_address','locu_street_address'], axis=1)


In [169]:
# updating test

# The raw street-address strings are dropped from the hold-out pairs as well.
# Rebinding the result avoids an in-place drop on a frame obtained by boolean filtering.
locu_fsq_df_test = locu_fsq_df_test.drop(['fsq_street_address','locu_street_address'], axis=1)


In [170]:
locu_fsq_df_filtered.columns

Index(['locu_latitude', 'locu_longitude', 'locu_postal_code', 'fsq_latitude',
       'fsq_longitude', 'fsq_postal_code', 'combination', 'label',
       'lev_dist_name', 'lev_address', 'phone_equal', 'phone_lev',
       'seq_dist_name', 'distance', 'FirstAdrrDigit'],
      dtype='object')

In [171]:
# Postal codes are not used as model inputs; remove them from both pair tables.
# Rebinding the results avoids in-place drops on frames obtained by boolean filtering.
locu_fsq_df_test = locu_fsq_df_test.drop(['locu_postal_code','fsq_postal_code'], axis=1)
locu_fsq_df_filtered = locu_fsq_df_filtered.drop(['locu_postal_code','fsq_postal_code'], axis=1)
# Sanity check: remaining missing values per column in the hold-out pairs.
locu_fsq_df_test.isnull().sum()

locu_latitude     0
locu_longitude    0
fsq_latitude      0
fsq_longitude     0
combination       0
lev_dist_name     0
lev_address       0
phone_equal       0
phone_lev         0
seq_dist_name     0
distance          0
FirstAdrrDigit    0
dtype: int64

## Model Fitting:

### Data Prep:

In [172]:
# Target vector and feature matrix for the training pairs.
y = locu_fsq_df_filtered['label'].values

# Everything except the pair key, the label and the raw coordinates is a model input.
non_feature_cols = ['combination','label','locu_latitude', 'locu_longitude', 'fsq_latitude', 'fsq_longitude']
feature_cols = [col for col in locu_fsq_df_filtered.columns if col not in non_feature_cols]
x = locu_fsq_df_filtered[feature_cols].values

In [173]:
#updating test
# The hold-out matrix must have the same columns in the same order as the training
# matrix, because the model only sees positional arrays. The feature list is therefore
# derived from the training frame and applied to the hold-out frame; a missing column
# raises KeyError instead of silently shifting the features.
non_feature_cols = ['combination','label','locu_latitude', 'locu_longitude', 'fsq_latitude', 'fsq_longitude']
feature_cols = [col for col in locu_fsq_df_filtered.columns if col not in non_feature_cols]
x_hold_out = locu_fsq_df_test[feature_cols].values

In [174]:
# Split the labelled pairs into a fitting part and an evaluation part.
# stratify=y keeps the rare positive class represented in both parts;
# random_state=10 fixes the shuffle so the split is repeatable.
x_train,x_test,y_train,y_test=sksplit(x,y,random_state=10,stratify = y)

In [175]:
#Random Forests: Grid Search CV
# random_state makes the forest, and so the selected parameters, repeatable between runs.
rfc = ensemble.RandomForestClassifier(n_jobs=-1, oob_score=False, random_state=10)
# Model selection is scored with F1 on the positive class.
f1_scorer = make_scorer(f1_score)
# 'auto' was removed from the grid: for a random-forest classifier it meant the same
# as 'sqrt' (a duplicate fit), and newer scikit-learn releases reject the value.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring=f1_scorer)
CV_rfc.fit(x_train, y_train)
print(CV_rfc.best_params_)

{'n_estimators': 50, 'max_features': 'sqrt'}


In [176]:
# Estimator configured with the best parameters found by the grid search.
CV_rfc_best_estimator = CV_rfc.best_estimator_
# Score on the same data the model was fitted on, so this is an optimistic figure.
# NOTE(review): .score() is presumably plain accuracy here, not the F1 used for model
# selection - confirm; accuracy says little on data this imbalanced.
print("Training score with the above params:",CV_rfc_best_estimator.score(x_train, y_train) )

Training score with the above params: 1.0


## Best model: fit on the training split and evaluate on the held-out split of the training data

In [177]:
## Fit the best estimator on the training split and evaluate it on the held-out split
# NOTE(review): presumably GridSearchCV has already refitted best_estimator_ on
# x_train, which would make this fit redundant - confirm.
CV_rfc_best_estimator.fit(x_train, y_train)
best_pred = CV_rfc_best_estimator.predict(x_test)
from sklearn.metrics import precision_recall_fscore_support
# Precision, recall and F-score for the positive (match) class on x_test; the tuple
# is the cell's displayed value.
precision_recall_fscore_support(y_test, best_pred, average='binary')


(1.0, 1.0, 1.0, None)

# Train on the full training set and predict on the hold-out set (competition site)

In [178]:
##train with whole data
# Fit a fresh, unfitted clone with the best parameters. Fitting
# CV_rfc.best_estimator_ directly would overwrite the model evaluated above, because
# CV_rfc_best_estimator refers to that same object.
final_estimator_RF = sk.base.clone(CV_rfc.best_estimator_)
final_estimator_RF.fit(x,y)
# Predicted label (1 = match) for every hold-out candidate pair.
final_pred = final_estimator_RF.predict(x_hold_out)

In [179]:
final_estimator_RF.feature_importances_

array([0.15800717, 0.05429842, 0.18136118, 0.01509045, 0.22434801,
       0.34618314, 0.02071162])

In [180]:
locu_fsq_df_filtered.columns

Index(['locu_latitude', 'locu_longitude', 'fsq_latitude', 'fsq_longitude',
       'combination', 'label', 'lev_dist_name', 'lev_address', 'phone_equal',
       'phone_lev', 'seq_dist_name', 'distance', 'FirstAdrrDigit'],
      dtype='object')

In [181]:
#give matches in the csv format specified
# Recover the two ids from the "<locu_id>_<foursquare_id>" pair key. n=1 splits on the
# first underscore only and is passed by keyword, as newer pandas requires.
id_pairs = locu_fsq_df_test['combination'].str.split('_', n=1).tolist()
pred_df = pd.DataFrame(id_pairs, columns = ['locu_id', 'foursquare_id'])
pred_df['lev_dist_name']=list(locu_fsq_df_test['lev_dist_name'])

#choose only matches
# final_pred is positional and pred_df has a fresh 0..n-1 index, so the mask lines up.
matches_test = pred_df[final_pred==1]

#if multiple matches for same locu_id, choose the one with smallest lev distance between name strings
# idxmin returns exactly one row per locu_id (the first on ties); the previous
# equality filter kept every tied row, leaving several matches for one locu_id.
best_rows = matches_test.groupby('locu_id')['lev_dist_name'].idxmin()
final = matches_test.loc[best_rows, ['locu_id', 'foursquare_id']].reset_index(drop=True)

# Write only the two id columns, without the row index.
# NOTE(review): assumes the expected format mirrors matches_train.csv
# (locu_id, foursquare_id) - confirm against the competition instructions.
final.to_csv('matches_test_new.csv', index=False)