In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
import matplotlib.pyplot as plt
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to every plot created afterwards.
plt.style.use('ggplot')

In [2]:
# Read the Yelp table from disk and show which columns it provides.
yelp_csv = 'yelp_data.csv'
df = pd.read_csv(yelp_csv)
df.columns

Index([u'Unnamed: 0', u'restaurants', u'bars', u'coffee & tea',
       u'health & medical', u'arts & entertainment', u'fitness & instruction',
       u'grocery', u'education', u'haircut', u'boutique', u'PjAreaCode'],
      dtype='object')

In [3]:
# The unnamed first column is given the name 'Neighborhood'.
df = df.rename(columns={'Unnamed: 0': 'Neighborhood'})
# Activity columns are everything between the first and the last column.
activities = df.columns.tolist()[1:-1]
activities

['restaurants',
 'bars',
 'coffee & tea',
 'health & medical',
 'arts & entertainment',
 'fitness & instruction',
 'grocery',
 'education',
 'haircut',
 'boutique']

In [4]:
# source: http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    """Approximate R by P.dot(Q.T) using regularized gradient descent.

    Only cells of R that are strictly greater than 0 are fitted; every other
    cell is skipped by both the update and the error computation.

    Parameters
    ----------
    R : 2-D array-like, indexed as R[i][j]
        Matrix to factorize.
    P : array of shape (len(R), K)
        Initial row factors.
    Q : array of shape (len(R[0]), K)
        Initial column factors.
    K : int
        Number of latent features.
    steps : int
        Maximum number of passes over the fitted cells.
    alpha : float
        Learning rate.
    beta : float
        Regularization weight.

    Returns
    -------
    (P, Q) : tuple of float arrays with the same shapes as the inputs.
        These are new arrays; the P and Q passed in are left unmodified.
    """
    # Work on float copies so the caller's arrays are not overwritten and
    # integer-typed initial factors do not truncate the small updates.
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float).T

    # R is never modified, so the cells to fit are located once up front.
    observed = [(i, j)
                for i in range(len(R))
                for j in range(len(R[i]))
                if R[i][j] > 0]

    for step in range(steps):
        for i, j in observed:
            eij = R[i][j] - np.dot(P[i, :], Q[:, j])
            for k in range(K):
                # As in the referenced tutorial, Q's update uses the value of
                # P[i][k] that was just written on the previous line.
                P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        # Regularized squared error over the fitted cells.
        e = 0
        for i, j in observed:
            e = e + pow(R[i][j] - np.dot(P[i, :], Q[:, j]), 2)
            for k in range(K):
                e = e + (beta / 2.0) * (pow(P[i][k], 2) + pow(Q[k][j], 2))
        if e < 0.001:
            break
    return P, Q.T

In [5]:
# Matrix to factorize: one row per row of df, one column per activity.
R = np.array(df[activities])

N, M = R.shape
K = 2  # number of latent features

# Seed the random initial factors so a fresh "Restart & Run All" reproduces
# the same factorization (and therefore the same downstream tables).
factor_rng = np.random.RandomState(42)
P = factor_rng.rand(N, K)
Q = factor_rng.rand(M, K)

nP, nQ = matrix_factorization(R, P, Q, K)
# Reconstructed matrix from the learned factors.
nR = np.dot(nP, nQ.T)

In [6]:
# Wrap the reconstructed matrix in a frame laid out like df:
# 'Neighborhood' first, the activity columns, then 'PjAreaCode'.
# Building it in one shot keeps the activity columns numeric and pairs
# row k of nR with row k of df regardless of df's index labels.
df_factorized = pd.DataFrame(nR, index=df.index, columns=activities)
df_factorized.insert(0, 'Neighborhood', df['Neighborhood'])
df_factorized['PjAreaCode'] = df['PjAreaCode']
df_factorized

Unnamed: 0,Neighborhood,restaurants,bars,coffee & tea,health & medical,arts & entertainment,fitness & instruction,grocery,education,haircut,boutique,PjAreaCode
0,Upper West Side,3.623001,3.398488,3.686642,3.064728,3.263632,3.003748,3.049628,2.634861,3.703549,2.58884,MN12
1,Stuyvesant Town,3.535625,3.305909,3.615987,2.939197,3.178787,2.934066,2.944678,2.593292,3.56543,2.549056,MN50
2,"SoHo, TriBeCa and Little Italy",3.720195,3.48995,3.785042,3.148362,3.351354,3.084254,3.132302,2.704944,3.804244,2.657669,MN24
3,Clinton,3.894155,3.621124,4.017098,3.1399,3.489562,3.2368,3.184062,2.89771,3.834952,2.850263,MN15
4,Theater District and Garment District,3.826342,3.53001,3.995388,2.948815,3.412581,3.187726,3.045633,2.905322,3.639212,2.860488,MN17
5,East Harlem,2.805529,2.582226,2.93984,2.13281,2.49867,2.338853,2.215275,2.1427,2.640618,2.11021,MN34
6,Washington Heights,2.683195,2.514888,2.73382,2.25986,2.415872,2.225104,2.252546,1.955583,2.733511,1.921629,MN35
7,Turtle Bay,3.768779,3.517744,3.865052,3.103018,3.384844,3.129155,3.120611,2.777078,3.772186,2.730318,MN19
8,Gramercy,3.650928,3.370835,3.807659,2.826538,3.25767,3.040899,2.913862,2.766636,3.484579,2.723688,MN21
9,Upper East Side,3.69838,3.548866,3.62634,3.515852,3.377575,3.045534,3.348727,2.524766,4.146813,2.472715,MN40


In [7]:
# Cosine distance (and 1 - distance as similarity) for every ordered pair of
# distinct rows of df_factorized.
# The first column holds the name; the last column is dropped, exactly as the
# row-wise pop(0)/pop(-1) did, leaving the numeric feature vector.
names = df_factorized.iloc[:, 0].tolist()
vectors = df_factorized.iloc[:, 1:-1].values.astype(float)

pair_rows = []
for a in range(len(names)):
    for b in range(len(names)):
        if a == b:
            continue
        # Computed once and reused for both output columns.
        cos_dist = distance.cosine(vectors[a], vectors[b])
        pair_rows.append([names[a], names[b], cos_dist, 1 - cos_dist])

# Build the frame in one go; the index starts at 1 to match the previous layout.
df_dist = pd.DataFrame(
    pair_rows,
    columns=['Neighborhood1', 'Neighborhood2', 'Cosine_Distance', 'Similarity'],
    index=range(1, len(pair_rows) + 1),
)
df_dist

Unnamed: 0,Neighborhood1,Neighborhood2,Cosine_Distance,Similarity
1,Upper West Side,Stuyvesant Town,3.382848e-05,0.999966
2,Upper West Side,"SoHo, TriBeCa and Little Italy",2.288107e-08,1.000000
3,Upper West Side,Clinton,2.513075e-04,0.999749
4,Upper West Side,Theater District and Garment District,9.198563e-04,0.999080
5,Upper West Side,East Harlem,1.197567e-03,0.998802
6,Upper West Side,Washington Heights,2.141228e-06,0.999998
7,Upper West Side,Turtle Bay,8.103212e-05,0.999919
8,Upper West Side,Gramercy,8.343480e-04,0.999166
9,Upper West Side,Upper East Side,1.655951e-03,0.998344
10,Upper West Side,Greenwich Village,3.549512e-04,0.999645


In [8]:
# normalizing the similarity score
# Min-max scale 'Similarity' within each Neighborhood1 group so the most
# similar partner scores 1 and the least similar scores 0.
similarity = df_dist['Similarity'].astype(float)
by_origin = similarity.groupby(df_dist['Neighborhood1'])
group_min = by_origin.transform('min')
group_span = by_origin.transform('max') - group_min
# A group whose scores are all equal has zero span; dividing would give 0/0.
# Every partner is then equally the best match, so each one scores 1.0.
df_dist['Similarity'] = ((similarity - group_min) / group_span).where(group_span != 0, 1.0)
df_dist

Unnamed: 0,Neighborhood1,Neighborhood2,Cosine_Distance,Similarity
1,Upper West Side,Stuyvesant Town,3.382848e-05,0.992830
2,Upper West Side,"SoHo, TriBeCa and Little Italy",2.288107e-08,1.000000
3,Upper West Side,Clinton,2.513075e-04,0.946700
4,Upper West Side,Theater District and Garment District,9.198563e-04,0.804896
5,Upper West Side,East Harlem,1.197567e-03,0.745991
6,Upper West Side,Washington Heights,2.141228e-06,0.999551
7,Upper West Side,Turtle Bay,8.103212e-05,0.982817
8,Upper West Side,Gramercy,8.343480e-04,0.823033
9,Upper West Side,Upper East Side,1.655951e-03,0.648764
10,Upper West Side,Greenwich Village,3.549512e-04,0.924717


In [9]:
# For each neighborhood, subtract the activity values of its most similar
# neighborhood from its own values (both taken from the original df).

# Most similar partner = row with the highest similarity in each group.
# idxmax avoids relying on an exact floating-point match against 1.0 and, like
# the previous lookup, keeps the first row when several tie.
similarity = df_dist['Similarity'].astype(float)
best_rows = similarity.groupby(df_dist['Neighborhood1'], sort=False).idxmax()
most_similar = df_dist.loc[best_rows.values].set_index('Neighborhood1')['Neighborhood2']

# Activity values keyed by neighborhood name (first occurrence of each name).
values_by_name = (
    df.drop_duplicates('Neighborhood')
      .set_index('Neighborhood')[activities]
      .astype(float)
)
own_values = values_by_name.reindex(df['Neighborhood']).values
# Neighborhoods without a partner map to NaN and therefore yield NaN rows.
peer_values = values_by_name.reindex(df['Neighborhood'].map(most_similar)).values

df_weighted = pd.DataFrame(own_values - peer_values, index=df.index, columns=activities)
df_weighted.insert(0, 'Neighborhood', df['Neighborhood'])
df_weighted['PjAreaCode'] = df['PjAreaCode']
df_weighted

Unnamed: 0,Neighborhood,restaurants,bars,coffee & tea,health & medical,arts & entertainment,fitness & instruction,grocery,education,haircut,boutique,PjAreaCode
0,Upper West Side,-0.0688125,-0.14525,-0.10625,-0.1675,1.08125,-0.11375,-0.68625,-0.18,0.16125,-0.8925,MN12
1,Stuyvesant Town,-0.41325,0.313375,0.445,-0.0775,0.78625,0.1625,0.215,0.055,0.68125,0.07625,MN50
2,"SoHo, TriBeCa and Little Italy",0.0688125,0.14525,0.10625,0.1675,-1.08125,0.11375,0.68625,0.18,-0.16125,0.8925,MN24
3,Clinton,0.4473125,0.7265,0.24375,0.61,0.7475,0.61625,0.1332237,0.56375,0.84875,0.4575,MN15
4,Theater District and Garment District,0.8374375,0.6445,0.29875,0.40875,0.86,0.0675,0.055,0.03375,-0.2175,0.09125,MN17
5,East Harlem,-0.35825,-0.257375,-1.0225,-0.51625,-0.2325,-0.38875,-0.715,-0.7725,-1.0775,-0.465625,MN34
6,Washington Heights,-0.8895,-1.246875,-1.81,-0.74125,-0.7165789,-0.4225,-1.24375,-0.44125,-2.11625,-0.955,MN35
7,Turtle Bay,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MN19
8,Gramercy,-0.5890625,-0.51475,0.0275,-0.1925,-0.7475,0.03125,0.57125,-0.255,0.05625,0.21,MN21
9,Upper East Side,0.18275,0.29225,0.0,0.5125,0.20625,-0.11375,0.16125,0.03125,0.23,-0.31125,MN40


In [10]:
# Map each neighborhood to the activity holding the smallest value in its
# df_weighted row. When several activities share that minimum, the one that
# appears last in `activities` is kept.
rec_dict = {}
for row_label in df_weighted.index:
    scores = df_weighted.loc[row_label, activities].tolist()
    lowest = min(scores)
    tied = [name for name in activities
            if df_weighted.loc[row_label, name] == lowest]
    if tied:
        rec_dict[df_weighted.loc[row_label, 'Neighborhood']] = tied[-1]
rec_dict

{'Bedford-Stuyvesant': 'coffee & tea',
 'Brooklyn Heights and Cobble Hill': 'haircut',
 'Bushwick': 'haircut',
 'Carroll Gardens and Red Hook': 'grocery',
 'Chelsea, Flatiron District and Union Square': 'health & medical',
 'Chinatown': 'haircut',
 'Clinton': 'grocery',
 'Clinton Hill': 'arts & entertainment',
 'Crown Heights': 'health & medical',
 'Downtown Brooklyn, DUMBO, Vinegar Hill and Boerum Hill': 'arts & entertainment',
 'East Harlem': 'haircut',
 'East Village': 'fitness & instruction',
 'East Williamsburg': 'arts & entertainment',
 'Financial District and Battery Park City': 'boutique',
 'Fort Greene': 'boutique',
 'Gramercy': 'arts & entertainment',
 'Greenpoint': 'haircut',
 'Greenwich Village': 'haircut',
 'Harlem': 'grocery',
 'Lower East Side': 'education',
 'Morningside Heights': 'fitness & instruction',
 'Murray Hill and Kips Bay': 'boutique',
 'Park Slope and Gowanus': 'health & medical',
 'Prospect Heights': 'haircut',
 'SoHo, TriBeCa and Little Italy': 'arts & ente

In [11]:
# Area code / neighborhood lookup taken straight from df.
df_nbhd = df[['PjAreaCode', 'Neighborhood']].copy()
# dict.items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
df_rec = pd.DataFrame(list(rec_dict.items()), columns=['Neighborhood', 'RecBiz'])
# Inner join: only neighborhoods that received a recommendation are written out.
df_out = pd.merge(df_nbhd, df_rec, on=['Neighborhood'])
df_out.to_csv('recbiz_mtxfact.csv')