In [119]:
import pandas as pd
import numpy as np
import json
import re
import nltk

from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Read data file into a python array
with open('../data/test_tip.json', 'rb') as f:
    bus_data = f.readlines()

# remove the trailing "\n" from each line
bus_data = map(lambda x: x.rstrip(), bus_data)
# put individual business JSON objects into list
data_json = "[" + ','.join(bus_data) + "]"

# Create pandas df
bus_df = pd.read_json(data_json)

In [125]:
bus_df

Unnamed: 0,business_id,date,likes,text,type,user_id
0,cE27W9VPgO88Qxe4ol6y_g,2013-04-18,0,Don't waste your time.,tip,-6rEfobYjMxpUWLNxszaxQ
1,mVHrayjG3uZ_RLHkLj-AMg,2013-01-06,1,Your GPS will not allow you to find this place...,tip,EZ0r9dKKtEGVx2CdnowPCw
2,KayYbHCt-RkbGcPdGOThNg,2013-12-03,0,Great drink specials!,tip,xb6zEQCw9I-Gl0g06e1KsQ
3,KayYbHCt-RkbGcPdGOThNg,2015-07-08,0,"Friendly staff, good food, great beer selectio...",tip,QawZN4PSW7ng_9SP7pjsVQ
4,1_lU0-eSWJCRvNGk78Zh9Q,2015-10-25,0,Beautiful restoration.,tip,MLQre1nvUtW-RqMTc4iC9A
5,1_lU0-eSWJCRvNGk78Zh9Q,2015-01-06,0,Home to Stage 62 theatre group.,tip,bvu13GyOUwhEjPum2xjiqQ
6,_qopVQ6_Mz6W7-Pmbi56GQ,2013-02-13,0,A God send if you're not a gear head!,tip,bvu13GyOUwhEjPum2xjiqQ
7,_qopVQ6_Mz6W7-Pmbi56GQ,2010-08-27,0,Great people ... great service ... always busy,tip,_QFom7aSHKNCDsNXKd-3xQ
8,wJr6kSA5dchdgOdwH6dZ2w,2013-07-22,0,Sarah rocks! Best waitress here! Be sure to ge...,tip,fvTivrsJoUMYXnOJw9wZfw
9,Cdcus0NADzyY3XiJM2O5Sg,2011-10-12,0,Unleaded 3.42,tip,bvu13GyOUwhEjPum2xjiqQ


In [151]:
# Helper functions for normalising text data

# Convert all words to lowercase, remove punctuation, tokenise and stem
# and remove stopwords, threshold = 10%
def norm_corpus(document_list):
    norm_doc_list = []
    
    # lowercase
    document_list = [word.lower() for word in document_list]

    
    # remove symbols in text
    symbols = ",.?!"
    for sym in symbols:
        document_list = [word.replace(sym,'') for word in document_list]
    
    
    # loop through each string i.e. review in the column
    for doc in document_list:
        doc = nltk.word_tokenize(doc)
        
        # remove stopwords
        doc = [word for word in doc if word not in stopwords.words('english')]
        
        # stem words
        stemmer = SnowballStemmer("english")
        doc = [stemmer.stem(word) for word in doc]
        
        # make tokenised text one string
        norm_doc = " ".join(doc)
        norm_doc_list.append(norm_doc)
    
    return norm_doc_list

print len(norm_corpus(bus_df.text)), ":", norm_corpus(bus_df.text)

10 : [u"n't wast time", u'gps allow find place put rankin polic depart instead direct across street', u'great drink special', u'friend staff good food great beer select relax atmospher', u'beauti restor', u'home stage 62 theatr group', u'god send re gear head', u'great peopl great servic alway busi', u'sarah rock best waitress sure get compliment glass', u'unlead 342']


In [None]:
# Helper functions for vectorise normalised data

In [189]:
# Vectorise keywords from normalised text to vector including only nouns and adjectives
def review_vector(norm_doc_list):
    review_list = []

    # select all words categorised as nouns or adjectives
    # loop through each string i.e. review in the df column
    for doc in norm_doc_list:
        review_keyword_list = []
        doc = nltk.word_tokenize(doc)
        # create tuple for each word in list: (word, tag)
        token_category = nltk.pos_tag(doc)
        
        
        for word, tag in token_category:    
            
            # nouns
            if (tag == 'NN' or tag == 'NNS' or tag == 'NNP' or tag == 'NNPS'):
                review_keyword_list.append(word)
                
            # adjectives
            elif (tag == 'JJ' or tag == 'JJS' or tag == 'JJP' or tag == 'JJPS'):
                review_vector_list.append(word)
            else:
                pass     
        
        review_keywords = " ".join(review_keyword_list)
        review_list.append(review_keywords)
        
        # vectorise string
        WORD = re.compile(r'\w+')
        review_vector = [collections.Counter(WORD.findall(word)) for word in review_list]
    
    
    return review_vector
    
# test = norm_corpus(bus_df.text)
review_vector(test)

[Counter({u'time': 1}),
 Counter({u'depart': 1, u'gps': 1, u'place': 1, u'street': 1}),
 Counter(),
 Counter({u'beer': 1, u'food': 1, u'select': 1, u'staff': 1}),
 Counter({u'beauti': 1, u'restor': 1}),
 Counter({u'group': 1, u'home': 1, u'stage': 1, u'theatr': 1}),
 Counter({u'head': 1, u're': 1, u'send': 1}),
 Counter({u'alway': 1, u'busi': 1, u'peopl': 1}),
 Counter({u'glass': 1, u'rock': 1, u'sarah': 1, u'waitress': 1}),
 Counter()]

In [190]:
# Vectorise keywords from normalised text to vector including only nouns and adjectives
norm_doc_list = norm_corpus(bus_df.text)
review_list = []

    # select all words categorised as nouns or adjectives
    # loop through each string i.e. review in the df column
for doc in norm_doc_list:
    review_keyword_list = []
    doc = nltk.word_tokenize(doc)
        # create tuple for each word in list: (word, tag)
    token_category = nltk.pos_tag(doc)
    
    

    print token_category
        
#         review_keywords = " ".join(review_keyword_list)
#         review_list.append(review_keywords)
        
#         # vectorise string
#     WORD = re.compile(r'\w+')
#     review_vector = [collections.Counter(WORD.findall(word)) for word in review_list]
    
    
#     return review_vector
    
# test = norm_corpus(bus_df.text)
# review_vector(test)

[(u"n't", 'RB'), (u'wast', 'JJ'), (u'time', 'NN')]
[(u'gps', 'NN'), (u'allow', 'VB'), (u'find', 'JJ'), (u'place', 'NN'), (u'put', 'VBD'), (u'rankin', 'JJ'), (u'polic', 'JJ'), (u'depart', 'NN'), (u'instead', 'RB'), (u'direct', 'JJ'), (u'across', 'IN'), (u'street', 'NN')]
[(u'great', 'JJ'), (u'drink', 'VBP'), (u'special', 'JJ')]
[(u'friend', 'JJ'), (u'staff', 'NN'), (u'good', 'JJ'), (u'food', 'NN'), (u'great', 'JJ'), (u'beer', 'NN'), (u'select', 'NN'), (u'relax', 'VBZ'), (u'atmospher', 'RB')]
[(u'beauti', 'NN'), (u'restor', 'NN')]
[(u'home', 'NN'), (u'stage', 'NN'), (u'62', 'CD'), (u'theatr', 'NN'), (u'group', 'NN')]
[(u'god', 'JJ'), (u'send', 'NN'), (u're', 'NN'), (u'gear', 'VBP'), (u'head', 'NN')]
[(u'great', 'JJ'), (u'peopl', 'NN'), (u'great', 'JJ'), (u'servic', 'JJ'), (u'alway', 'NN'), (u'busi', 'NN')]
[(u'sarah', 'NN'), (u'rock', 'NN'), (u'best', 'JJS'), (u'waitress', 'NN'), (u'sure', 'JJ'), (u'get', 'VB'), (u'compliment', 'JJ'), (u'glass', 'NN')]
[(u'unlead', 'JJ'), (u'342', 'CD')]