In [3]:
import numpy as np
from scipy.spatial.distance import pdist, jaccard, squareform, cosine
from itertools import combinations
import random
import sklearn.metrics.pairwise as pair
from sklearn.metrics import log_loss
import pandas as pd
from math import radians, cos, sin, asin, sqrt
#from __future__ import divison

xids = pd.read_csv('xdevrecon.csv')


In [4]:

#start with our lat/long data. 
#define the haversine function
def haversine(latlon1, latlon2):
    lon1 = latlon1[1]
    lat1 = latlon1[0]
    lon2 = latlon2[1]
    lat2 = latlon2[0]
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 1.0 
    return c * r
    
xids_la = xids.drop_duplicates('xid')
X = xids_la[['lat','long']]
X
dm=pdist(X.as_matrix(), lambda u, v: haversine(u,v))
dm
q = lambda i,j,n: n*j - j*(j+1)/2 + i - 1 - j
ds = squareform( dm )
for i in xrange( 1, 4967 ):
    for j in xrange( i ):
      assert ds[ i, j ] == dm[ q( i, j, 4967 ) ]



In [9]:
ds

array([[ 0.        ,  0.07174449,  0.29360641, ...,  0.32783175,
         0.06111808,  0.32783175],
       [ 0.07174449,  0.        ,  0.23376126, ...,  0.27137671,
         0.0107293 ,  0.27137671],
       [ 0.29360641,  0.23376126,  0.        , ...,  0.3252554 ,
         0.24305431,  0.3252554 ],
       ..., 
       [ 0.32783175,  0.27137671,  0.3252554 , ...,  0.        ,
         0.27842507,  0.        ],
       [ 0.06111808,  0.0107293 ,  0.24305431, ...,  0.27842507,
         0.        ,  0.27842507],
       [ 0.32783175,  0.27137671,  0.3252554 , ...,  0.        ,
         0.27842507,  0.        ]])

In [10]:

#now let's do our cosine matrix for the categorical data
#cats = xids[:5000]
cats = xids.drop_duplicates('xid')
cats = cats.drop('xid',1)
cats = cats.drop('match',1)
cats = cats.drop('lco',1)
cats = cats.drop('lci',1)
cats = cats.drop('lat',1)
cats = cats.drop('long',1)
cats = pd.get_dummies(cats)
catcosdf = 1- pair.cosine_similarity(cats, dense_output=True)


In [11]:
catcosdf

array([[  0.00000000e+00,   2.71955992e-01,   5.31358471e-02, ...,
          5.17618089e-01,   5.17618089e-01,   5.17618089e-01],
       [  2.71955992e-01,   1.11022302e-16,   2.77814619e-01, ...,
          5.13167019e-02,   5.13167019e-02,   2.09430585e-01],
       [  5.31358471e-02,   2.77814619e-01,  -2.22044605e-16, ...,
          5.01727121e-01,   5.01727121e-01,   5.01727121e-01],
       ..., 
       [  5.17618089e-01,   5.13167019e-02,   5.01727121e-01, ...,
          1.11022302e-16,   1.11022302e-16,   2.00000000e-01],
       [  5.17618089e-01,   5.13167019e-02,   5.01727121e-01, ...,
          1.11022302e-16,   1.11022302e-16,   2.00000000e-01],
       [  5.17618089e-01,   2.09430585e-01,   5.01727121e-01, ...,
          2.00000000e-01,   2.00000000e-01,   0.00000000e+00]])

In [12]:
#now multiply these two matrices to get our total similarity matrix
haversine_dist = ds
cosine_dist=catcosdf
alpha = 0.5
y_pred = alpha*haversine_dist + (1-alpha)*cosine_dist 
y_pred


array([[  0.00000000e+00,   1.71850242e-01,   1.73371126e-01, ...,
          4.22724919e-01,   2.89368085e-01,   4.22724919e-01],
       [  1.71850242e-01,   5.55111512e-17,   2.55787942e-01, ...,
          1.61346704e-01,   3.10229999e-02,   2.40403646e-01],
       [  1.73371126e-01,   2.55787942e-01,  -1.11022302e-16, ...,
          4.13491262e-01,   3.72390716e-01,   4.13491262e-01],
       ..., 
       [  4.22724919e-01,   1.61346704e-01,   4.13491262e-01, ...,
          5.55111512e-17,   1.39212533e-01,   1.00000000e-01],
       [  2.89368085e-01,   3.10229999e-02,   3.72390716e-01, ...,
          1.39212533e-01,   5.55111512e-17,   2.39212533e-01],
       [  4.22724919e-01,   2.40403646e-01,   4.13491262e-01, ...,
          1.00000000e-01,   2.39212533e-01,   0.00000000e+00]])

In [13]:






#now make our score set
#this function creates a dict with the xid and its index position
def createMapping(listuniqueid):
    mapping = {}
    for n,xid in enumerate(listuniqueid):
        mapping[xid] = n
    return mapping
    
#this creates a matrix whose l/w are the lengths of the mapping dict    
def createGTMatrix(mapping, matches):
    m = np.zeros([len(mapping), len(mapping)])
    for xid1, xid2 in matches:
    	try:
        	m[mapping[xid1],mapping[xid2]] = 1
        	m[mapping[xid2],mapping[xid1]] = 1
        except IndexError:
        	continue	
    return m

xids1 = xids
#here are my matched pairs
matches = xids1[['xid', 'match']]
matches = [tuple(x) for x in matches.values]
listuniqueid = xids1['xid']

mapping = createMapping(listuniqueid.unique())    
y_true = createGTMatrix(mapping,matches)

X_i_pred = np.argsort(y_pred)[1:]

#get only the top predictions per id
X_i_preds = X_i_pred[:,:4]

good = 1
ix = np.in1d(y_true.ravel(), good).reshape(x.shape)
true_loc = np.column_stack(np.where(ix))

#now want to iterate through values in pred and check if they match the ones in true loc
rows = []
for i in true_loc:
	for j in X_i_preds:
		if (i[1] == j[1] or i[1] == j[2] or i[1] == j[3]):
			rows.append(1)
		else:
			rows.append(0)

sum(rows)/len(rows)					




ValueError: total size of new array must be unchanged