In [1]:
import pandas as pd
import pylab as pl
import json
%pylab inline
pl.rcParams["font.size"] = 20

  'Matplotlib is building the font cache using fc-list. '


Populating the interactive namespace from numpy and matplotlib


# Categorical distance metrics

### reading in Women Services database from NYC open data

In [2]:
womenserv = pd.read_csv("https://data.cityofnewyork.us/api/views/pqg4-dm6b/rows.csv?accessType=DOWNLOAD")
#NYC_Women_s_Resource_Network_Database.csv")
womenserv.head()

Unnamed: 0,OrganizationName,Brooklyn,Bronx,Manhattan,Queens,Staten Island,Fax,Phone,URL,noURL,...,Location 2,Borough,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,YWCA of the City of New York,Y,N,Y,N,Y,2122236000.0,2127554000.0,http://www.ywcanyc.org,N,...,,,,,,,,,,
1,RightRides for Women's Safety,Y,Y,Y,Y,N,,7185221000.0,http://www.rightrides.org,N,...,,BROOKLYN,40.693244,-73.99066,2.0,33.0,9.0,3002094.0,3002500000.0,Brooklyn Heights-Cobble Hill ...
2,Charles B. Wang Community Health Center,N,N,Y,N,N,,2129660000.0,http://www.cbwchc.org/hcs/wh/wh.html,N,...,,,,,,,,,,
3,American-Italian Cancer Foundation's Mobile Ma...,Y,Y,Y,Y,Y,,2126289000.0,http://www.americanitaliancancer.org,N,...,,MANHATTAN,40.770141,-73.96367,8.0,4.0,128.0,1083890.0,1014058000.0,Upper East Side-Carnegie Hill ...
4,QHC-Diabetes Center,N,N,N,Y,N,7188836000.0,7188833000.0,http://nyc.gov,N,...,,QUEENS,40.71725,-73.803433,8.0,24.0,1267.0,4442431.0,4068580000.0,Pomonok-Flushing Heights-Hillcrest ...


### encoding variable as a binary
(reference on categorical encoding in python: http://pbpython.com/categorical-encoding.html)

In [3]:
Y21N20 = {'Y':1, 'N':0}
    
for br in 'Brooklyn', 'Bronx', 'Manhattan', 'Queens', 'Staten Island':
    #womenserv[br][womenserv[br] == 'Y'] = 1
    #womenserv[br][womenserv[br] == 'N'] = 0
    womenserv[br] = womenserv[br].map(Y21N20)
womenserv.head()

Unnamed: 0,OrganizationName,Brooklyn,Bronx,Manhattan,Queens,Staten Island,Fax,Phone,URL,noURL,...,Location 2,Borough,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,YWCA of the City of New York,1,0,1,0,1,2122236000.0,2127554000.0,http://www.ywcanyc.org,N,...,,,,,,,,,,
1,RightRides for Women's Safety,1,1,1,1,0,,7185221000.0,http://www.rightrides.org,N,...,,BROOKLYN,40.693244,-73.99066,2.0,33.0,9.0,3002094.0,3002500000.0,Brooklyn Heights-Cobble Hill ...
2,Charles B. Wang Community Health Center,0,0,1,0,0,,2129660000.0,http://www.cbwchc.org/hcs/wh/wh.html,N,...,,,,,,,,,,
3,American-Italian Cancer Foundation's Mobile Ma...,1,1,1,1,1,,2126289000.0,http://www.americanitaliancancer.org,N,...,,MANHATTAN,40.770141,-73.96367,8.0,4.0,128.0,1083890.0,1014058000.0,Upper East Side-Carnegie Hill ...
4,QHC-Diabetes Center,0,0,0,1,0,7188836000.0,7188833000.0,http://nyc.gov,N,...,,QUEENS,40.71725,-73.803433,8.0,24.0,1267.0,4442431.0,4068580000.0,Pomonok-Flushing Heights-Hillcrest ...


## define similarity

In [4]:
def SimpleSimilarity(x,y):
    '''Calculates Simple similarity for categorical binary variables
    Arguments: 
    x: series (array) of categorical observation for an feature
    y: series (array) of categorical observation for an feature
    Returns:
    Simple similarity (float)
    '''
    
    coocc = x + y
    m11 = np.sum(coocc==0) # in neither
    m00 = np.sum(coocc==2) # in both
    p = len(coocc)
    return (m11 + m00) * 1.0 / p

In [5]:
def JaccardSimilarity(x,y):
    '''Calculates Jaccard similarity for categorical binary variables
    Arguments: 
    x: series (array) of categorical observation for an feature
    y: series (array) of categorical observation for an feature
    Returns:
    Jaccard similarity (float)
    '''
    
    coocc = x + y
    a = (coocc==2).sum() # in both
    bc = (coocc==1).sum() #in either but not both
    return (a) * 1.0 / bc

In [9]:
print ("How similar is Brooklyn to the Bronx?")
    
print ("Simple Similarity: ", SimpleSimilarity(womenserv.Brooklyn.values, womenserv.Bronx.values), "\n",
      "Jaccard Similarity: ", JaccardSimilarity(womenserv.Brooklyn.values, womenserv.Bronx.values),
      "\n\n")

print ("How similar is Manhattan to the Bronx?")
    
print ("Simple Similarity: ", SimpleSimilarity(womenserv.Manhattan.values, womenserv.Bronx.values), "\n",
      "Jaccard Similarity: ", JaccardSimilarity(womenserv.Manhattan.values, womenserv.Bronx.values),
      "\n\n")

print ("How similar is Manhattan to the Brooklyn?")
    
print ("Simple Similarity: ", SimpleSimilarity(womenserv.Manhattan.values, womenserv.Brooklyn.values), "\n",
      "Jaccard Similarity: ", JaccardSimilarity(womenserv.Manhattan.values, womenserv.Brooklyn.values),
      "\n\n")

print ("How similar is Manhattan to the SI?")
    
print ("Simple Similarity: ", SimpleSimilarity(womenserv.Manhattan.values, womenserv["Staten Island"].values), "\n",
      "Jaccard Similarity: ", JaccardSimilarity(womenserv.Manhattan.values, womenserv["Staten Island"].values),
      "\n\n")


How similar is Brooklyn to the Bronx?
Simple Similarity:  0.770706190061 
 Jaccard Similarity:  0.733840304183 


How similar is Manhattan to the Bronx?
Simple Similarity:  0.502179598954 
 Jaccard Similarity:  0.353765323993 


How similar is Manhattan to the Brooklyn?
Simple Similarity:  0.426329555362 
 Jaccard Similarity:  0.313069908815 


How similar is Manhattan to the SI?
Simple Similarity:  0.531822144725 
 Jaccard Similarity:  0.329608938547 


