# Prep data for modeling - Restaurants

## Import modules 

In [1]:
import pandas as pd
import numpy as np

## Load the review metadata set `rests` and the doc-topic matrix `dt_matrix`

In [2]:
# Load the gzip-compressed review metadata table into `rests`.
rests = pd.read_csv(
    '../data/restaurants.csv',
    compression='gzip',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Load the saved doc-topic matrix from disk.
dt_matrix = np.load(file='../data/rests_dt_matrix.npy')

In [4]:
# Dimensions of the metadata frame and the doc-topic matrix, side by side.
tuple(obj.shape for obj in (rests, dt_matrix))

((3055990, 28), (2984421, 200))

To correctly combine our metadata and topic vectors into one feature matrix, we need to ensure they have the same number of rows, in the same order; we can do this by keeping only the rows of `rests` whose index appears in the index array of the English reviews created in [Notebook 2](02_Text_Preprocessing_and_Tokenizing.ipynb).

In [5]:
# Row labels to keep, loaded from the saved index array.
correct_index = np.load('../data/rests_eng_index.npy')

# Boolean mask over the current rows: True where the row label is in the array.
is_kept_row = rests.index.isin(correct_index)
rests = rests.loc[is_kept_row]

In [6]:
# Renumber the remaining rows 0..n-1, discarding the old labels.
rests = rests.reset_index(
    drop=True,
)

In [7]:
# Re-check the dimensions after filtering.
(rests.shape, dt_matrix.shape)

((2984421, 28), (2984421, 200))

## Drop null values

In [8]:
# Show every row that has at least one missing value.
has_null = rests.isna().any(axis='columns')
rests.loc[has_null]

Unnamed: 0,stars,text,useful,funny,cool,state,active_life,arts_and_entertainment,automotive,beauty_and_spas,...,local_services,mass_media,nightlife,pets,professional_services,public_services_and_government,religious_organizations,restaurants,shopping,review_length
1798074,5.0,Best Creole\/Cajun food in Nevada! Order the ...,,,,,,,,,...,,,,,,,,,,
1798075,All the goodness of the deep South! Seasoned a...,1,0.0,1.0,NV,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,36.0,


In [9]:
rests.drop([1798074, 1799075], 0, inplace=True)
rests.shape

(2984419, 28)

In [10]:
right_matrix = np.delete(dt_matrix, [1798074, 1799075], 0)
right_matrix.shape

(2984419, 200)

## Create validation set following usefulness logic

In [11]:
def useful_mapper(x):
    """Map a review's `useful` vote count to a modelling label.

    Returns 0 for a count of zero, the string "Validation" for a count of
    1 or 2, and 1 for a count of 3 or more. Any other value (for example a
    negative number, a fraction below 3, or NaN) falls through to None.
    """
    if x == 0:
        return 0
    if x in (1, 2):
        return "Validation"
    return 1 if x >= 3 else None

In [12]:
# Label each review from its `useful` count, then attach the labels as a new column.
usefulness_labels = rests['useful'].map(useful_mapper)
rests['Usefulness'] = usefulness_labels

In [13]:
# Number of reviews per label.
rests.loc[:, 'Usefulness'].value_counts()

0             1667146
Validation     944735
1              372537
Name: Usefulness, dtype: int64

## Split out validation and train topic vectors

In [14]:
# Rebuild a gap-free 0..n-1 index so that row labels can be used as row
# positions when slicing the topic matrix below.
rests = rests.reset_index(
    drop=True,
)

In [15]:
# Labels of the rows whose label column equals 'Validation'.
is_validation = rests['Usefulness'] == 'Validation'
valid_index = rests.index[is_validation.values]

In [16]:
# Display the row labels selected for the validation split.
valid_index

Int64Index([      1,       3,      10,      12,      19,      20,      29,
                 30,      31,      33,
            ...
            2984391, 2984394, 2984395, 2984396, 2984399, 2984403, 2984404,
            2984408, 2984412, 2984418],
           dtype='int64', length=944735)

In [17]:
# Pull the validation rows out of the topic matrix by position.
right_matrix_valid = np.take(right_matrix, valid_index, axis=0)
right_matrix_valid.shape

(944735, 200)

In [18]:
# Everything that is not a validation row goes to the training matrix:
# start with an all-True mask and switch off the validation positions.
is_train_row = np.ones(right_matrix.shape[0], dtype=bool)
is_train_row[np.asarray(valid_index)] = False
right_matrix_train = right_matrix[is_train_row]
right_matrix_train.shape

(2039684, 200)

## Create validation and train metadata datasets and save to csv

In [None]:
# Metadata rows for the validation split, selected by label.
rests_valid = rests.loc[valid_index]

# Persist the split without writing the index as an extra column.
rests_valid.to_csv(path_or_buf='../data/rests_validation.csv', index=False)

In [None]:
# Metadata rows for the training split: all rows except the validation labels.
rests_train = rests.drop(valid_index, axis='index')

# Persist the split without writing the index as an extra column.
rests_train.to_csv(path_or_buf='../data/rests_train.csv', index=False)

In [None]:
# Compare the dimensions of the two splits. Indexing a DataFrame with `[0]`
# looks up a column labelled 0, which these frames do not have (KeyError),
# so the shapes are read from the frames themselves.
rests_train.shape, rests_valid.shape

## Create train feature matrix

In [None]:
# Remove the raw text and the other columns excluded from the feature matrix.
# `columns=` is used because the positional axis argument (`..., 1`) was
# deprecated and then removed in pandas 2.0. Rebinding instead of
# `inplace=True` also avoids mutating a frame that was sliced out of `rests`.
rests_train = rests_train.drop(columns=['text', 'useful', 'cool', 'state'])

# All columns except the last form the metadata half of the feature matrix.
# The last column is the 'Usefulness' label, which was appended to `rests` last.
left_matrix_train = rests_train.iloc[:, :-1].values

In [None]:
left_matrix_train.shape, right_matrix_train.shape

# Release large intermediates before stacking. The previous `del` statement
# named `bus` and `bus_train`, which are never defined in this notebook, so it
# raised NameError. They are presumably leftovers from a sibling notebook;
# `rests` / `rests_train` are used here as the equivalents (neither is read
# again before being reloaded from CSV). Missing names are skipped so the cell
# can be re-run.
for _name in ('rests', 'rests_train', 'correct_index', 'dt_matrix', 'right_matrix'):
    globals().pop(_name, None)

# Metadata columns on the left, topic-vector columns on the right.
features = np.hstack((left_matrix_train, right_matrix_train))

## Create validation feature matrix

In [None]:
# Reload the validation metadata from the CSV file.
rests_valid = pd.read_csv('../data/rests_validation.csv')

rests_valid.columns

# Remove the raw text and the other columns excluded from the feature matrix.
# `columns=` is used because the positional axis argument (`..., 1`) was
# deprecated and then removed in pandas 2.0.
rests_valid = rests_valid.drop(columns=['text', 'useful', 'cool', 'state'])

# All columns except the last form the metadata half of the feature matrix.
# NOTE(review): the last column is presumably the 'Usefulness' label - confirm
# against the CSV header.
left_matrix_valid = rests_valid.iloc[:, :-1].values

In [None]:
# Dimensions of the two halves of the validation feature matrix.
(left_matrix_valid.shape, right_matrix_valid.shape)

In [None]:
# Release large intermediates before stacking. The previous `del` statement
# named `bus` and `bus_valid`, which are never defined in this notebook, so it
# raised NameError. They are presumably leftovers from a sibling notebook;
# `rests` / `rests_valid` are used here as the equivalents (neither is read
# again below). Names that are already gone are skipped instead of raising.
for _name in ('rests', 'rests_valid', 'correct_index', 'dt_matrix', 'right_matrix'):
    globals().pop(_name, None)

# Metadata columns on the left, topic-vector columns on the right.
valid_features = np.hstack((left_matrix_valid, right_matrix_valid))

np.save('../data/valid_features_rests.npy', valid_features)

## Create train target vector

In [None]:
# Reload the training metadata from the CSV file.
rests_train = pd.read_csv(filepath_or_buffer='../data/rests_train.csv')

In [None]:
# Save the training labels. The label column is created as 'Usefulness'
# (capital U); the lowercase attribute `rests_train.usefulness` does not exist
# and raised AttributeError, so the column is selected by its exact name.
np.save('../data/rests_target.npy', rests_train['Usefulness'].values)