In [9]:
import pandas as pd
import numpy as np
import os
import json
import pprint
import matplotlib.pylab as pylab
from pylab import plot,show
import numpy as np
import cPickle as pickle
import time
from sklearn import linear_model
from sklearn.svm import SVC


%matplotlib inline

# Limit rows disp# Limit rows displayed in notebook
pd.set_option('display.max_rows', 20)
pd.set_option('display.precision', 2)
pylab.rcParams['figure.figsize'] = 16, 12

Loading business details

In [2]:
pd_lasvegas = pickle.load( open('pd_lasvegas.pkl', 'rb'))
pd_lasvegas.rename(columns={'date_x': 'review_date'}, inplace=True)
pd_lasvegas.rename(columns={'date_y': 'start_date'}, inplace=True)
pd_lasvegas.rename(columns={'stars_y': 'stars'}, inplace=True)
pd_lasvegas.reset_index(level=0, inplace=True)
pd_old_business_vegas = pd_lasvegas[pd_lasvegas['new'] == False]
pd_new_business_vegas = pd_lasvegas[pd_lasvegas['new'] == True]

Creating business vector

In [4]:
''' Business Attributes '''
attrs = [u'attributes_Accepts Credit Cards', 
         u'attributes_Accepts Insurance',
         u'attributes_Ages Allowed', 
         u'attributes_Alcohol',
         u'attributes_Ambience', 
         u'attributes_Attire', 
         u'attributes_BYOB',
         u'attributes_BYOB/Corkage', 
         u'attributes_By Appointment Only',
         u'attributes_Caters', 
         u'attributes_Coat Check',
         u'attributes_Corkage', 
         u'attributes_Delivery',
         u'attributes_Dietary Restrictions', 
         u'attributes_Dogs Allowed',
         u'attributes_Drive-Thru', 
         u'attributes_Good For',
         u'attributes_Good For Dancing', 
         u'attributes_Good For Groups',
         u'attributes_Good For Kids', 
         u'attributes_Good for Kids',
         u'attributes_Hair Types Specialized In', 
         u'attributes_Happy Hour',
         u'attributes_Has TV', 
         u'attributes_Music',
         u'attributes_Noise Level', 
         u'attributes_Open 24 Hours',
         u'attributes_Order at Counter', 
         u'attributes_Outdoor Seating',
         u'attributes_Parking', 
         u'attributes_Payment Types',
         u'attributes_Price Range', 
         u'attributes_Smoking',
         u'attributes_Take-out', 
         u'attributes_Takes Reservations',
         u'attributes_Waiter Service', 
         u'attributes_Wheelchair Accessible',
         u'attributes_Wi-Fi', 
         u'categories',
         u'hours_Friday', 
         u'hours_Monday', 
         u'hours_Saturday',
         u'hours_Sunday', 
         u'hours_Thursday', 
         u'hours_Tuesday',
         u'hours_Wednesday',
         u'open']

def getHourAttr(old, new):
    if old[u'close'] == new[u'close'] and old[u'open'] == new[u'open']:
        return 1
    else:
        return -1

def getParkingAttr(old, new):
    #print "Old: ", old, 
    #print "New: ", new
    parking_keys = [u'garage', u'street', u'validated', u'lot', u'valet']
    for key in parking_keys:
        if old[key]!= new[key]:
            return -1
    return 1

def getCategoryAttr(old, new):
    #print old, new
    oldCategories = set(old.encode('utf8').split(','))
    newCategories = set(new.encode('utf8').split(','))
    if len(oldCategories.intersection(newCategories)) > 0:
        return 1
    else:
        return -1
    
def getVector(old, new):
    res = []
    for attr in attrs:
        #print attr, old[attr], new[attr]
        if pd.isnull(old[attr]) and pd.isnull(new[attr]):
            res.append(1)
            continue
            
        elif pd.isnull(old[attr]) or pd.isnull(new[attr]):
            res.append(-1)
            continue
            
        if 'hours_' in attr:
            res.append(getHourAttr(old[attr], new[attr]))
            continue
            
        if attr == u'categories':
            res.append(getCategoryAttr(old[attr], new[attr]))
            continue
            
        if attr == u'attributes_Parking':
            res.append(getParkingAttr(old[attr], new[attr]))
            continue

        elif old[attr] == new[attr]:
            res.append(1)
        else:
            res.append(-1)
    return res


''' Testing Out Vector Nonsense 
c = 0
pd_business =  pd_lasvegas[pd_lasvegas['cluster'] == c]
pd_old_business = pd_business_cluster[pd_business['new'] == False]
pd_new_business = pd_business_cluster[pd_business['new'] == True]
print getVector(pd_old_business_cluster.iloc[0], pd_new_business_cluster.iloc[0])
'''

" Testing Out Vector Nonsense \nc = 0\npd_business =  pd_lasvegas[pd_lasvegas['cluster'] == c]\npd_old_business = pd_business_cluster[pd_business['new'] == False]\npd_new_business = pd_business_cluster[pd_business['new'] == True]\nprint getVector(pd_old_business_cluster.iloc[0], pd_new_business_cluster.iloc[0])\n"

Calculating Conditional Mean For Each Business

In [11]:
import datetime
''' Helper for Date in Pandas'''
def todate(d):
    return datetime.datetime.strptime(d, '%Y-%m-%d')


''' Calculate mean_after - mean_before : returns +1 if mean improves -1 if mean reduces'''
def calcAverage(df):
    #print "Printing DF", df
    df_before = df[df['before'] == True]
    df_after = df[df['before'] == False]
    
    #print "before", df_before
    #print "after", df_after
    
    l1 = len(df_before['stars'])
    l2 = len(df_after['stars'])

    if l1 == 0 or l2 == 0:
        return -1
    else:
        a = sum(df_before['stars']) / (len(df_before['stars'])*1.0)
        b = sum(df_after['stars']) / (len(df_after['stars'])*1.0)
        if a - b > 0:
            return -1
        else:
            return 1
        


def createTrainingData(c=0, date_range=60, sampleAll=True, maxsamples=0):
    # Use cluster 0,1 for training
    X, Y = [], []

    diff = datetime.timedelta(days=date_range)
    pd_business_cluster =  pd_lasvegas[pd_lasvegas['cluster'] == c]
    pd_old_business = pd_business_cluster[pd_business_cluster['new'] == False]
    pd_old_business_Y = pd_business_cluster[["business_id", "review_date", "stars"]] # This is to simplify Y calculation
    pd_new_business = pd_business_cluster[pd_business_cluster['new'] == True]
    gb_new_business = pd_new_business.groupby('business_id')
    gb_old_business = pd_old_business.groupby('business_id')
    
    print "Calculating vector for cluster ", c
    start_time = time.time()
    for new_business_id, new_business_details in gb_new_business:
        print "@ ", new_business_id 
        start_date = todate(pd_new_business.iloc[0]['start_date'])
        new_business = new_business_details.iloc[0]
        
        ''' Calculating X for Current New Business '''
        temp_x = []
        for old_business_id, old_business_details in gb_old_business:
            old_business = old_business_details.iloc[0]
            temp_x.append(getVector(old_business, new_business))
        
        ''' Calculating Y for Current New Business '''
        # Getting all old business reviews between start_date -diff to start_date + diff
        pd_old_business_tempY = pd_old_business_Y[pd_old_business_Y.review_date.apply(todate) <= start_date + diff]
        pd_old_business_tempY = pd_old_business_tempY[pd_old_business_tempY.review_date.apply(todate) >= start_date - diff]
        
        # Labelling All Reviews Before Start Date as True
        pd_old_business_tempY['before'] = (pd_old_business_tempY['review_date'].apply(todate) < start_date)
        
        #print pd_old_business_tempY.groupby('business_id')
        gb_old_business_tempY = pd_old_business_tempY.groupby(['business_id'])
        temp_y = gb_old_business_tempY.apply(calcAverage).values.tolist()
 
        X.extend(temp_x)
        Y.extend(temp_y)
        
        if sampleAll == False and len(X) > maxsamples:
            break

    end_time = time.time()
    print "Time taken for cluster ", c, " is ", end_time - start_time
    print "Completed calculating vector for cluster ", c
    return X, Y

X, Y = createTrainingData()

Calculating vector for cluster  0
@  -lRpLc285MIpDKHQk3Tqug
@  -tCiAYMimz6yQaaXiK7e6Q
@  0GCpesRgXSVluFoB2otXlA
@  0IV5L36XeQShzWiUhom9bQ
@  0vmWkXuouMhbdTakyvZZ-g
@  0w3SqrrhkxB_7t0Vhr6tQQ
@  1Vaz1OBQGj7xiCxIBuQIvA
@  1ZiGYjB9TMwmmvbJRWrB1g
@  1hY0Q3BYDq6CJDQay2xRQA
@  1onzRxmG-8J5ecUdHgkRhg
@  2eR5ncG5RA2AaREzQRVTxA
@  2lt03ZLGvgPApOKS7eKGOQ
@  2pK1ecVaqpgp1k1OFJQ9Gg
@  2tP7IwNgrcLepEj0wUh-Dw
@  3FhOLs1Sxc89loLnIAQl1g
@  3GTtkOthmbfipurFNwUZ7Q
@  3Jhosy4nKJO3oUgU2ZfMIQ
@  3_H-sx_fq0hJJ8S2W02frA
@  3g0GUgnRE3ZNVXqKXebBAg
@  3l2rx67A8ZuTnFTQrl-75Q
@  3uUYfeMGAywRy1aN01n-Sg
@  4V20KwXSSWz7A2OuiixVww
@  5CuhWcKsiqBxAt_ZHwW8Vw
@  5m2HPI3fp6Eq565mlquL0w
@  5muXkVtzR6oDRW7J7XuS6w
@  6UXw7_U13Th0PZlMXZbjMg
@  6gjfXqaxZgv0C2aHyl4yuQ
@  6xgBVwvBxN0hqOmw0fQckQ
@  79oVSwNtdAgrBmqVD5PUPw
@  7R8LHGueCu1yEtceLh6C5g
@  8FnivFJz2rtxD7F1Z1y38w
@  8MjrBpH29UkcH7kVUvzERw
@  8iD3ajO_AA8F4i56iCfXBQ
@  9eM-JEUMWFuoLUzRZ-15eA
@  9gOBcfarkqeoCl1OBjMpZQ
@  9sNgoMqKIQZ-IYZhJWWtVw
@  AOdQfAlwm4h8Ba3M7yS6yA
@  A

Learning Algorithm: Logistic Regression/SVM

In [17]:
# Change to random sampling
sampling_mask_train =np.random.randint(len(X),size=10000)
sampling_mask_test =np.random.randint(len(X),size=100)

X_np, Y_np = np.array(X), np.array(Y)

X_train, Y_train = X_np[sampling_mask_train,:], Y_np[sampling_mask_train,:]
X_test, Y_test = X_np[sampling_mask_test,:], Y_np[sampling_mask_test,:]

print X_train.shape, Y_train.shape

logistic = linear_model.LogisticRegression()
print('LogisticRegression score: %f'
      % logistic.fit(X_train, Y_train).score(X_test, Y_test))
clf = SVC()
print('SVM score: %f' % clf.fit(X_train, Y_train).score(X_test, Y_test))

IndexError: too many indices for array