In [23]:
import pandas as pd
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import twokenize
import unidecode

# Load the Santa Barbara reviews; the cells below run them through the NLP pipeline steps.
# NOTE(review): the previews further down show an 'Unnamed: 0' column, which suggests the CSV
# was written together with its index -- confirm before relying on or dropping that column.
reviews = pd.read_csv(filepath_or_buffer='csv_data/santa_barbara_reviews.csv')

In [24]:
# Normalise the text in place: unidecode transliterates each review, then it is lowercased.
reviews['text'] = reviews['text'].map(
    lambda raw_text: unidecode.unidecode(raw_text).lower()
)

# spaCy tokens: call the English() language object on every review and keep each token's text.
spacy_tokenizer = English()
reviews['spacy_token'] = reviews['text'].map(
    lambda review_text: [token.text for token in spacy_tokenizer(review_text)]
)

# twokenize tokens: tokenizeRawTweetText is applied to every (already normalised) review.
reviews['twokenize_token'] = reviews['text'].map(twokenize.tokenizeRawTweetText)

reviews.head()  # preview the two new token columns

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,spacy_token,twokenize_token
0,ZKvDG2sBvHVdF5oBNUOpAQ,wSTuiTk-sKNdcFyprzZAjg,B5XSoSG3SfvQGtKEGQ1tSQ,3.0,1.0,1.0,0.0,this easter instead of going to lopez lake we ...,2016-03-30 22:46:33,"[this, easter, instead, of, going, to, lopez, ...","[this, easter, instead, of, going, to, lopez, ..."
1,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,3.0,0.0,0.0,0.0,had a party of 6 here for hibachi. our waitres...,2016-07-25 07:31:06,"[had, a, party, of, 6, here, for, hibachi, ., ...","[had, a, party, of, 6, here, for, hibachi, ., ..."
2,L0jv8c2FbpWSlfNC6bbUEA,bFPdtzu11Oi0f92EAcjqmg,IDtLPgUrqorrpqSLdfMhZQ,5.0,0.0,0.0,0.0,what a great addition to the funk zone! grab ...,2016-10-13 22:50:47,"[what, a, great, addition, to, the, funk, zone...","[what, a, great, addition, to, the, funk, zone..."
3,2u5Skti5mZam_-XTKPelvA,IMd3NQbclta91pFKk3AJZg,qO9dNNIvNbCBd8ZgjxMxgQ,5.0,0.0,0.0,0.0,"farmhouse, rustic, chic.helpful staff with gre...",2017-04-08 00:33:11,"[farmhouse, ,, rustic, ,, chic.helpful, staff,...","[farmhouse, ,, rustic, ,, chic, ., helpful, st..."
4,4zopEEPqfwm-c_FNpeHZYw,JYYYKt6TdVA4ng9lLcXt_g,SZU9c8V2GuREDN5KgyHFJw,5.0,0.0,0.0,0.0,we were a bit weary about trying the shellfish...,2016-05-31 02:14:54,"[we, were, a, bit, weary, about, trying, the, ...","[we, were, a, bit, weary, about, trying, the, ..."


In [25]:
def remove_stop_words(tokens):
    """Return a new list holding the tokens that are not in spaCy's STOP_WORDS, in order."""
    return [token for token in tokens if token not in STOP_WORDS]


# Apply the same stop-word filter to both token columns, overwriting them in place.
# NOTE(review): only exact STOP_WORDS members are dropped; punctuation and whitespace-only
# tokens stay in the lists -- confirm that this is intended.
for token_column in ('spacy_token', 'twokenize_token'):
    reviews[token_column] = reviews[token_column].map(remove_stop_words)

reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,spacy_token,twokenize_token
0,ZKvDG2sBvHVdF5oBNUOpAQ,wSTuiTk-sKNdcFyprzZAjg,B5XSoSG3SfvQGtKEGQ1tSQ,3.0,1.0,1.0,0.0,this easter instead of going to lopez lake we ...,2016-03-30 22:46:33,"[easter, instead, going, lopez, lake, went, lo...","[easter, instead, going, lopez, lake, went, lo..."
1,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,3.0,0.0,0.0,0.0,had a party of 6 here for hibachi. our waitres...,2016-07-25 07:31:06,"[party, 6, hibachi, ., waitress, brought, sepa...","[party, 6, hibachi, ., waitress, brought, sepa..."
2,L0jv8c2FbpWSlfNC6bbUEA,bFPdtzu11Oi0f92EAcjqmg,IDtLPgUrqorrpqSLdfMhZQ,5.0,0.0,0.0,0.0,what a great addition to the funk zone! grab ...,2016-10-13 22:50:47,"[great, addition, funk, zone, !, , grab, bite...","[great, addition, funk, zone, !, grab, bite, ,..."
3,2u5Skti5mZam_-XTKPelvA,IMd3NQbclta91pFKk3AJZg,qO9dNNIvNbCBd8ZgjxMxgQ,5.0,0.0,0.0,0.0,"farmhouse, rustic, chic.helpful staff with gre...",2017-04-08 00:33:11,"[farmhouse, ,, rustic, ,, chic.helpful, staff,...","[farmhouse, ,, rustic, ,, chic, ., helpful, st..."
4,4zopEEPqfwm-c_FNpeHZYw,JYYYKt6TdVA4ng9lLcXt_g,SZU9c8V2GuREDN5KgyHFJw,5.0,0.0,0.0,0.0,we were a bit weary about trying the shellfish...,2016-05-31 02:14:54,"[bit, weary, trying, shellfish, company, wharf...","[bit, weary, trying, shellfish, company, wharf..."
