In [11]:
%matplotlib inline
import re
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
sns.set(color_codes=True)

from time import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import os; import json; import fileinput
#from numba import jit
from IPython.display import display

In [12]:
st = time()
fileinput.close() # sometimes fileinput is already active

def load_file(filename, transformer=None, max_lines = 50000):
    data_dir = "yelp_dataset_challenge_round9"
    filepath = os.path.join(data_dir, filename)

    data = []
    for line in fileinput.input(filepath):
        d = json.loads(line)
                
        if transformer:
            d = transformer(d)

        data.append(d)
        if len(data) > max_lines: break
            
    fileinput.close()
    return pd.DataFrame(data)

# These functions transform some values for later
def transf_checkin(d):
    d['n_time'] = len(d['time'])
    return d

def transf_review(d):
    d['date'] = datetime.strptime(d['date'], '%Y-%m-%d')
    d['weekday'] = d['date'].weekday()
    return d

def transf_user(d):    
    d['n_friends'] = len(d['friends'])
    d['n_elite'] = len(d['elite'])
    del d['friends']
    del d['elite']
    #del d['compliment_writer'] # are there more than one type?
    return d

df_bus = load_file("yelp_academic_dataset_business.json", None, 900000)
df_checkin = load_file("yelp_academic_dataset_checkin.json", transf_checkin)
df_review = load_file("yelp_academic_dataset_review.json", transf_review, 900000)
df_tip = load_file("yelp_academic_dataset_tip.json")
df_user = load_file("yelp_academic_dataset_user.json", transf_user)

def hours_to_matrix(hours):
    mat = np.zeros((7,), dtype=np.int8)
    if hours is None: 
        return mat
    day = 0
    for h in hours:
        length = 0
        rr = re.findall("[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE]\d+)?", h)
        if len(rr):
            length = int(rr[2]) - int(rr[0])
            if length < 0:
                length += 24
                
        if h.startswith('Mon'): day = 0
        if h.startswith('Tue'): day = 1
        if h.startswith('Wed'): day = 2
        if h.startswith('Thu'): day = 3
        if h.startswith('Fri'): day = 4
        if h.startswith('Sat'): day = 5
        if h.startswith('Sun'): day = 6
        mat[day] = length
    return mat

# add attributes columns
# TODO unfinished for "GoodForMeal", "Ambience"
def add_attribute_cols(attr):
    s = {}
    if attr is not None:
        for a in attr:
            t = a.split(':')
            if "True" in t[1]:
                s["attr_" + t[0].strip()] = 1
            else:
                s["attr_" + t[0].strip()] = 0
            if "RestaurantsPriceRange2" in a:
                s["attr_RestaurantsPriceRange2"] = int(t[1])
            if "WiFi: free" in a:
                s["attr_WiFi"] = 1
            if "BusinessParking" in a: # improvable
                counter = 0
                for b in ["'garage': True", "'street': True", "'lot': True", "'valet': True"]:
                    if b in a:
                        counter += 1
                s["n_parking"] = counter
    return pd.Series(s, dtype=np.int8)

df_bus['opening_hours'] = df_bus['hours'].apply(hours_to_matrix)
df_bus = df_bus.merge(df_bus['attributes'].apply(add_attribute_cols).fillna(0, downcast='infer'), left_index=True, right_index=True)
#df_bus = df_bus.fillna(0, downcast='infer')

df_review = df_review.sort_values(['business_id'])

#df_user['friends']
#df_user['elite']

# Memory optimization
# Technical stuff, contributes nothing to analysis
for df in [df_bus, df_checkin, df_review, df_tip, df_user]:
    conv = df.select_dtypes(include=['int']).apply(pd.to_numeric,downcast='unsigned')
    df[conv.columns] = conv

# Show memory usage
#df_bus.info(memory_usage='deep')

print('Done in {} s'.format(time() - st))

Done in 154.9442162513733 s


In [13]:
# prepare data
reviews = df_review[df_review['date'] > datetime(2015, 1, 1)]
#reviews = df_review
reviews = reviews.drop(['review_id', 'user_id'], axis=1)
reviews = reviews.sort_values(['business_id', 'weekday'])
reviews = reviews.merge(right=df_bus[['business_id', 'opening_hours', 'is_open']], on='business_id')
reviews = reviews[reviews['opening_hours'].notnull()]
reviews['hours_open'] = reviews.apply(lambda x: x['opening_hours'][x['weekday']], axis=1)    
def is_open(hours):
    if hours > 0: return 1
    else: return 0
reviews['open'] = reviews['hours_open'].apply(is_open)
#reviews = reviews[reviews['hours_open']>0]

print(len(reviews))
reviews.head(1)

403444


Unnamed: 0,business_id,cool,date,funny,stars,text,type,useful,weekday,opening_hours,is_open,hours_open,open
0,--6MefnULPED_I942VcFNA,0,2015-09-28,0,3,"The char siu is pretty tasty here, but was lac...",review,1,0,"[0, 0, 0, 0, 0, 0, 0]",1,0,0


In [14]:
# split data
X = reviews['text']
y = reviews['open']

print('Data prepared. Rows: {}'.format(len(X)))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)        

# train model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, roc_auc_score, classification_report, explained_variance_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA

st = time()
no_numbers = lambda x: re.sub(r'(\d[\d\.])+', '', x.lower())
model = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.5, preprocessor=no_numbers)),
   # ('scaler', StandardScaler(with_mean=False)),
#    ('classifier', LogisticRegression(class_weight='balanced'))
    ('reduce_dim', PCA(n_components=100)),
#    ('best', TruncatedSVD(n_components=100)),
    ('classifier', MLPClassifier(max_iter=5, hidden_layer_sizes=(100,)))
#    ('model', Ridge(alpha=10, normalize=False, max_iter=50))   
#    ('model', MLPRegressor(alpha=0.01, max_iter=50, hidden_layer_sizes=(100,50)))
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("R^2: %1.3f" % r2_score(y_test, y_pred))
print("Explained var: {:3f}".format(explained_variance_score(y_test, y_pred)))
if 'classifier' in model.named_steps:
    print(classification_report(y_test, y_pred))
    print("auc: ", roc_auc_score(y_test, y_pred, average='weighted'))
print('Done in {} s'.format(time() - st))

if False:
    from sklearn.model_selection import GridSearchCV
    print('Start cv grid search...')
    params = {
        'classifier__C': [1e-1, 1, 1e1]
     #  'model__alpha': [1e-1, 1, 1e1],
     #   'model__normalize': [True, False],
     #   'model__max_iter': [50, 100, 200]
    }

    grid = GridSearchCV(model, cv=3, param_grid=params)
    grid.fit(X_train, y_train)
    
    print(grid.best_params_)
    
# best results so far
# auc: 0.55
# 


Data prepared. Rows: 403444
R^2: -0.175
Explained var: 0.025447
             precision    recall  f1-score   support

          0       0.78      0.09      0.17     16649
          1       0.81      0.99      0.89     64040

avg / total       0.80      0.81      0.74     80689

auc:  0.543658774084
Done in 118.07643485069275 s


In [15]:
"""
tvec = model.named_steps['vectorizer']
weights = np.asarray(tvec.idf_).ravel().tolist()
weights_df = pd.DataFrame({'term': tvec.get_feature_names(), 'weight': weights})
weights_s = weights_df.sort_values(by='weight', ascending=False)
display(weights_s.head(10))
display(weights_s.tail(10))
"""
#print(list(y_pred))

%bell -n notify print 'hello'

ERROR:root:Line magic function `%bell` not found.
