# Classification modeling - Restaurants

I create two sets of models: a set of binary classifiers (is useful / is not useful) and a set of ordinal classifiers with 3/4 levels of usefulness.

## Import modules and data

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [23]:
# Saved index array; used further down to keep only the matching rows of `rests`.
# Presumably the index labels of English-language reviews, judging by the file name -- TODO confirm.
correct_index = np.load(file='../data/rests_eng_index.npy')

In [24]:
# Read the gzip-compressed restaurant review table.
# NOTE(review): the recorded output of this cell looks like the tail of a warning --
# presumably pandas' mixed-dtype DtypeWarning; if so, consider passing `dtype=` for the
# offending columns -- TODO confirm.
rests = pd.read_csv(
    '../data/restaurants.csv',
    compression='gzip',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [25]:
def useful_mapper(x):
    """Bucket a `useful` vote count into a usefulness label.

    Returns
    -------
    0               when ``x == 0``
    "Undetermined"  when ``x`` equals 1 or 2
    1               when ``x >= 3``
    None            for anything else (negative, fractional below 3, NaN)
    """
    if x == 0:
        return 0
    if x == 1 or x == 2:
        return "Undetermined"
    if x >= 3:
        return 1
    # No branch matched: keep the implicit fall-through result explicit.
    return None

In [26]:
# Multi-level target: apply useful_mapper element-wise to the raw `useful` counts.
rests['Usefulness'] = rests.useful.map(useful_mapper)

In [27]:
# Binary target: 1 when the review has at least one `useful` vote, else 0.
rests['isUseful'] = rests['useful'].gt(0).astype(int)

In [28]:
# Remove the raw target source and the columns that are not used as model features.
# `columns=` replaces the positional axis argument (`drop(labels, 1, ...)`), which
# pandas >= 2.0 rejects with a TypeError.
rests = rests.drop(columns=['useful', 'text', 'cool', 'state'])

In [29]:
# Keep only the rows whose index label appears in `correct_index`.
rests = rests.loc[rests.index.isin(correct_index)]

In [30]:
# Precomputed LSA matrix; stacked next to the tabular features further down.
# Presumably one row per retained review -- TODO confirm against the file that produced it.
lsa_matrix = np.load(file='../data/lsa_matrix.npy')

In [31]:
# Drop the two rows labelled 1841405 and 1841406.
# `index=` replaces the positional axis argument (`drop(labels, 0, ...)`), which
# pandas >= 2.0 rejects with a TypeError.
# NOTE(review): these are index *labels*, whereas the companion np.delete on lsa_matrix
# removes *positions* 1841405/1841406. After the isin() filter the two only line up if
# labels still equal positions -- TODO confirm the rows stay aligned.
rests = rests.drop(index=[1841405, 1841406])

In [32]:
# Remove rows at positions 1841405 and 1841406 so the matrix matches the rows dropped from `rests`.
# NOTE(review): this deletes by position while `rests` drops by label -- verify both refer to the same reviews.
lsa_matrix = np.delete(lsa_matrix, [1841405, 1841406], axis=0)

In [33]:
# Inspect the remaining columns; the last two (Usefulness, isUseful) are the targets,
# everything before them is sliced off as features in the next cell.
rests.columns

Index(['stars', 'funny', 'active_life', 'arts_and_entertainment', 'automotive',
       'beauty_and_spas', 'education', 'event_planning_and_services',
       'financial_services', 'food', 'health_and_medical', 'home_services',
       'hotels_and_travel', 'local_flavor', 'local_services', 'mass_media',
       'nightlife', 'pets', 'professional_services',
       'public_services_and_government', 'religious_organizations',
       'restaurants', 'shopping', 'review_length', 'Usefulness', 'isUseful'],
      dtype='object')

In [34]:
# Tabular feature block: every column except the trailing two target columns.
left_array = rests.iloc[:, :-2].to_numpy()

In [35]:
# Sanity check: (rows, columns) of the tabular feature block before stacking.
left_array.shape

(2984419, 24)

In [36]:
# Full feature matrix: tabular columns on the left, LSA columns appended on the right.
features = np.concatenate((left_array, lsa_matrix), axis=1)

In [21]:
# Free the large intermediates that have already been folded into `features`.
# `rests` is deliberately NOT deleted: the cells below still read rests['isUseful']
# to build the target, so deleting it here makes a top-to-bottom run fail with NameError.
del lsa_matrix, correct_index

## Modeling Pipeline

1. Features for reviews from review dataset
2. Topic weights from topic model
3. Scaler
4. GridSearched models

## Set target and feature vectors, train/test split, normalize

In [37]:
# Design matrix (tabular + LSA columns) and the binary is-useful target.
X, y = features, rests['isUseful']

In [40]:
# Hold out a test set. The split is seeded so that re-running the notebook reproduces
# the same partition (32 is the seed the notebook's models use as well).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32)

In [41]:
# Standardizer; it is fitted on the training split only and then reused on the test split.
ss = StandardScaler()

In [42]:
# Learn the scaling statistics from the training split, then scale that same split.
X_train = ss.fit(X_train).transform(X_train)



In [43]:
# Scale the test split with the statistics learned on the training split (no refit).
X_test = ss.transform(X=X_test)



### Try modeling with only the LSA feature weights

In [None]:
# LSA-only design matrix. The previous `dt_matrix` is not defined anywhere above this
# cell (NameError on a fresh kernel). `features` was built as hstack((left_array, lsa_matrix)),
# so the columns to the right of the tabular block are exactly the LSA columns.
# NOTE(review): assumes `dt_matrix` was meant to be the LSA matrix, per the section title -- confirm.
X = features[:, left_array.shape[1]:]
y = rests['isUseful']

In [None]:
# Seeded split so the LSA-only experiment is reproducible (32 matches the models' seed).
# NOTE(review): this rebinds X_train/X_test/y_train/y_test, and the later
# "LSA feature weights and features" LR cells fit on those same names -- on a
# top-to-bottom run they would therefore see this LSA-only split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=32)

In [None]:
# Fresh scaler fitted on the current training split, which is then scaled with it.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)

In [None]:
# Apply the training-split scaling to the held-out split.
X_test = ss.transform(X=X_test)

In [None]:
# Logistic regression grid search over two regularization strengths.
# random_state is a single-value grid entry, i.e. a fixed seed rather than a searched parameter.
lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid={
        'random_state': [32],
        'C': [1e-4, 1e-3],
    },
)

In [None]:
# Run the grid search on the scaled training split.
lr.fit(X=X_train, y=y_train)

In [None]:
# (train score, test score) of the fitted search, shown as a tuple.
tuple(
    lr.score(X_part, y_part)
    for X_part, y_part in ((X_train, y_train), (X_test, y_test))
)

## LR model with LSA feature weights and features

In [48]:
# Logistic regression grid search over 25 log-spaced regularization strengths.
# np.logspace takes base-10 *exponents*: logspace(1e-5, 1e-1, 25) only spans
# roughly 1.00002 .. 1.26, so the exponents -5 .. -1 are used to search C from 1e-5 to 1e-1.
# random_state is a single-value grid entry, i.e. a fixed seed rather than a searched parameter.
lr = GridSearchCV(LogisticRegression(), param_grid={'random_state': [32], 'C': np.logspace(-5, -1, 25)})

In [None]:
# Run the grid search on whatever X_train / y_train currently hold.
lr.fit(X=X_train, y=y_train)

In [None]:
# (train score, test score) of the fitted search, shown as a tuple.
tuple(
    lr.score(X_part, y_part)
    for X_part, y_part in ((X_train, y_train), (X_test, y_test))
)

## Random Forest

In [None]:
# Random forest grid search over the minimum split / leaf sizes.
# random_state and n_jobs are single-value grid entries, i.e. fixed settings rather
# than searched parameters. The comma after the min_samples_leaf list was missing,
# which made this cell a SyntaxError.
rf = GridSearchCV(RandomForestClassifier(), param_grid={'random_state': [32],
                                                        'min_samples_split': range(5, 10, 1),
                                                        'min_samples_leaf': [2, 3, 4],
                                                        'n_jobs': [-1]})