In [1]:
import pandas as pd
import spacy 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

# Show full cell contents when displaying frames (review bodies are long).
pd.set_option('display.max_colwidth', None)

# Load the English spaCy pipeline by its package name. The bare 'en' shortcut
# link only exists on spaCy 2.x installs (shortcut links were removed in
# spaCy 3), so it is kept purely as a fallback for older environments.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')

In [2]:
# Single place to change where the Amazon cell-phone review CSVs live.
DATA_DIR = '/mnt/d/data/amazon-review-data'
N_ROWS = 5        # rows read from each CSV (small on purpose for quick iteration)
SAMPLE_SIZE = 5   # rows drawn for the experiments below
SEED = 42         # fixed seed so Restart & Run All reproduces the same sample order

dataset_review_meta = pd.read_csv(DATA_DIR + '/amazon-cell-phone-review-meta.csv', nrows=N_ROWS)
dataset_review = pd.read_csv(DATA_DIR + '/amazon-cell-phone-reviews.csv', nrows=N_ROWS)

# Normalise header names to lower case so columns can be addressed uniformly.
dataset_review.columns = dataset_review.columns.str.lower()

# Only the free-text review body is needed downstream.
data_review_text = dataset_review[['body']]

# Never ask for more rows than were loaded (DataFrame.sample raises otherwise).
sample = data_review_text.sample(min(SAMPLE_SIZE, len(data_review_text)), random_state=SEED)

In [3]:
def preprocess(text):
    '''
    Normalise a raw review string into a space-separated string of lemmas.

    Steps performed:
        - tokenise with gensim's ``simple_preprocess`` (per the gensim docs this
          lower-cases the text and discards tokens that are too short or too long;
          punctuation does not survive tokenisation)
        - lemmatise every token with the module-level spaCy pipeline ``nlp``
        - drop tokens whose lemma is "-PRON-" (presumably spaCy 2.x's pronoun
          placeholder lemma -- confirm against the installed spaCy version)

    Note: stop words are NOT removed; determiners such as "the" are kept.

    Parameters
    ----------
    text : str or bytes
        Raw review text. Any other value (e.g. ``None`` or the float NaN that
        pandas uses for a missing review body) is treated as missing text.

    Returns
    -------
    str
        The cleaned, lemmatised text; an empty string for missing input.
    '''
    # Missing review bodies arrive from pandas as NaN/None; tokenising them
    # would raise, so short-circuit to an empty document instead.
    if not isinstance(text, (str, bytes)):
        return ""

    preprocessed_text = " ".join(simple_preprocess(text))
    preprocessed_text_doc = nlp(preprocessed_text)
    lemmas = [
        token.lemma_.strip().lower()
        for token in preprocessed_text_doc
        if token.lemma_ != "-PRON-"
    ]
    return " ".join(lemmas)

def chunk_noun_phrases(text):
    '''
    Collect the noun chunks found in the cleaned form of ``text``.

    The input is first normalised with ``preprocess`` and the result is parsed
    with the spaCy pipeline ``nlp``; the text of every chunk yielded by
    ``Doc.noun_chunks`` is returned as a list, in iteration order.
    '''
    cleaned = preprocess(text)
    doc = nlp(cleaned)

    phrases = []
    for chunk in doc.noun_chunks:
        phrases.append(chunk.text)
    return phrases

In [4]:
%%time
sample['preprocess_text'] = sample['body'].map(preprocess)
sample['noun_phrase'] = sample['body'].map(chunk_noun_phrases)
sample_noun_phrase = sample['noun_phrase'].values.tolist()
# sample_noun_phrase[0]

CPU times: user 674 ms, sys: 0 ns, total: 674 ms
Wall time: 677 ms


In [5]:
# Dump the extracted noun phrases: one inner list per sampled review.
print(sample_noun_phrase)

[['the phone', 'every purpose offer', 'the day buy', 'the case', 'the case', 'own picture', 'the jaket', 'the store', 'an employee', 'the casing', 'half', 'deal', 'the clear casing', 'case', 'the time'], ['software issue', 'nokia', 'this phone text messaging capability', 'sprint system', 'software patch', 'the next few month', 'at least hour', 'sprint award', 'customer service team', 'someone', 'who', 'the problem', 'that nokia design phone', 'incoming message', 'the way', 'most provider work sprint', 'people', 'inbox compose reply', 'the sprint server', 'innovation', 'money'], ['the phone', 'didn', 'the price', 'the bill', 'one', 'also ve have phone', 'little over two month', 'free accessory', 'the phone', 'the company', 'couple', 'week', 'the phone', 'the phone call'], ['great reliable phone', 'this phone', 'samsung', 'the menu', 'speed dialing', 'around number voice dialing', 'nice feature', 'the only thing', 'the games nokia', 'snake', 'phone', 'skydiving game bowling', 'tennis', '

In [7]:
# Show the cleaned, lemmatised text of every sampled review as a plain list.
sample['preprocess_text'].tolist()

['the phone have be great for every purpose offer except the day buy could not get the case off can take the case off to put own picture in the jaket which be super cool but when take back to the store an employee say be mean to be hard to get off well could barely even get off come close to snap the casing in half be never able to get off after that which be not that big of deal but get dirty under the clear casing and look really dirty now make sure can get case on and off oh and have to charge all off the time',
 'due to software issue between nokia and sprint this phone text messaging capability don work with sprint system and win until software patch come out some time in the next few month will have to spend at least hour with sprint award win customer service team to find someone who will admit this to the problem be that nokia design phone so that incoming message be retrieve quickly and then view offline the way most provider work sprint however like to have people hook up to 