# Prep data for modeling - Restaurants

## Import modules 

In [1]:
import pandas as pd
import numpy as np

## Load the review metadata set `rests` and the doc-topic matrix `dt_matrix`

In [2]:
rests = pd.read_csv('../data/restaurants.csv', compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
dt_matrix = np.load('../data/rests_dt_matrix.npy')

In [4]:
rests.shape, dt_matrix.shape

((3055990, 28), (2984421, 200))

To correctly combine our metadata and topic vectors into one feature matrix, we need to ensure they are the same shape; we can do this using the index array of the English reviews created in [Notebook 2](02_Text_Preprocessing_and_Tokenizing.ipynb).

In [5]:
correct_index = np.load('../data/rests_eng_index.npy')
rests = rests[rests.index.isin(correct_index)]

In [6]:
rests = rests.reset_index(drop=True)

In [7]:
rests.shape, dt_matrix.shape

((2984421, 28), (2984421, 200))

## Create validation set following usefulness logic

In [8]:
def useful_mapper(x):
    if x == 0:
        return 0
    elif x in (1, 2):
        return "Validation"
    elif x >= 3:
        return 1

In [9]:
rests['Usefulness'] = rests['useful'].map(useful_mapper)

In [10]:
rests['Usefulness'].value_counts()

0             1667146
Validation     944736
1              372537
Name: Usefulness, dtype: int64

## Drop null values

In [11]:
rests[rests.isnull().any(axis=1)]

Unnamed: 0,stars,text,useful,funny,cool,state,active_life,arts_and_entertainment,automotive,beauty_and_spas,...,mass_media,nightlife,pets,professional_services,public_services_and_government,religious_organizations,restaurants,shopping,review_length,Usefulness
1798074,5.0,Best Creole\/Cajun food in Nevada! Order the ...,,,,,,,,,...,,,,,,,,,,
1798075,All the goodness of the deep South! Seasoned a...,1,0.0,1.0,NV,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,36.0,,0.0
2560639,5,Cornish Pasty saved our Mothers Day! After hav...,-1.0,0.0,-1,AZ,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,94.0,


In [12]:
rests.drop([1798074, 1798075, 2560639], 0, inplace=True)
rests.shape

(2984418, 29)

In [13]:
right_matrix = np.delete(dt_matrix, [1798074, 1798075, 2560639], 0)
right_matrix.shape

(2984418, 200)

## Split out validation and train topic vectors

In [14]:
rests = rests.reset_index(drop=True)

In [15]:
valid_index = rests[rests['Usefulness'] == 'Validation'].index

In [16]:
valid_index

Int64Index([      1,       3,      10,      12,      19,      20,      29,
                 30,      31,      33,
            ...
            2984390, 2984393, 2984394, 2984395, 2984398, 2984402, 2984403,
            2984407, 2984411, 2984417],
           dtype='int64', length=944736)

In [17]:
right_matrix_valid = right_matrix[valid_index]
right_matrix_valid.shape

(944736, 200)

In [18]:
right_matrix_train = np.delete(right_matrix, valid_index, 0)
right_matrix_train.shape

(2039682, 200)

## Create validation and train metadata datasets and save to csv

In [19]:
rests_valid = rests[rests.index.isin(valid_index)]

rests_valid.to_csv('../data/rests_validation.csv', index=False)

In [20]:
rests_train = rests[~rests.index.isin(valid_index)]

rests_train.to_csv('../data/rests_train.csv', index=False)

In [21]:
rests_train.shape, rests_valid.shape

((2039682, 29), (944736, 29))

## Create train feature matrix

In [22]:
rests_train.drop(['text', 'useful', 'cool', 'state'], 1, inplace=True)

left_matrix_train = rests_train[rests_train.columns[:-1]].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
left_matrix_train.shape, right_matrix_train.shape

((2039682, 24), (2039682, 200))

In [24]:
del rests, rests_train, correct_index, dt_matrix, right_matrix

features = np.hstack((left_matrix_train, right_matrix_train))

In [25]:
del right_matrix_train, left_matrix_train

In [26]:
whos

Variable             Type          Data/Info
--------------------------------------------
features             ndarray       2039682x224: 456888768 elems, type `object`, 3655110144 bytes (3485.78466796875 Mb)
np                   module        <module 'numpy' from '/op<...>kages/numpy/__init__.py'>
pd                   module        <module 'pandas' from '/o<...>ages/pandas/__init__.py'>
rests_valid          DataFrame             stars            <...>944736 rows x 29 columns]
right_matrix_valid   ndarray       944736x200: 188947200 elems, type `float64`, 1511577600 bytes (1441.552734375 Mb)
useful_mapper        function      <function useful_mapper at 0x7fe4a6384b70>
valid_index          Int64Index    Int64Index([      1,     <...>e='int64', length=944736)


In [27]:
np.save('../data/rests_train_features.npy', features)

## Create validation feature matrix

In [28]:
rests_valid.drop(['text', 'useful', 'cool', 'state'], 1, inplace=True)

left_matrix_valid = rests_valid[rests_valid.columns[:-1]].values

In [29]:
left_matrix_valid.shape, right_matrix_valid.shape

((944736, 24), (944736, 200))

In [30]:
valid_features = np.hstack((left_matrix_valid, right_matrix_valid))

np.save('../data/valid_features_rests.npy', valid_features)

## Create train target vector

In [31]:
rests_train = pd.read_csv('../data/rests_train.csv')

In [32]:
rests_train.Usefulness.value_counts()

0    1667145
1     372537
Name: Usefulness, dtype: int64

In [33]:
np.save('../data/rests_target.npy', rests_train.Usefulness.values)