In [54]:
import pandas as pd
import numpy as np
import json
import re
import nltk

from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Read data file into a python array
with open('../data/test_tip.json', 'rb') as f:
    bus_data = f.readlines()

# remove the trailing "\n" from each line
bus_data = map(lambda x: x.rstrip(), bus_data)
# put individual business JSON objects into list
data_json = "[" + ','.join(bus_data) + "]"

# Create pandas df
bus_df = pd.read_json(data_json)

In [68]:
bus_df.head()

Unnamed: 0,business_id,date,likes,text,type,user_id
0,cE27W9VPgO88Qxe4ol6y_g,2013-04-18,0,Don't waste your time.,tip,-6rEfobYjMxpUWLNxszaxQ
1,mVHrayjG3uZ_RLHkLj-AMg,2013-01-06,1,Your GPS will not allow you to find this place...,tip,EZ0r9dKKtEGVx2CdnowPCw
2,KayYbHCt-RkbGcPdGOThNg,2013-12-03,0,Great drink specials!,tip,xb6zEQCw9I-Gl0g06e1KsQ
3,KayYbHCt-RkbGcPdGOThNg,2015-07-08,0,"Friendly staff, good food, great beer selectio...",tip,QawZN4PSW7ng_9SP7pjsVQ
4,1_lU0-eSWJCRvNGk78Zh9Q,2015-10-25,0,Beautiful restoration.,tip,MLQre1nvUtW-RqMTc4iC9A


In [56]:
bus_df.text[1]

u'Your GPS will not allow you to find this place. Put Rankin police department in instead. They are directly across the street.'

In [67]:
# Helper functions for normalising text data

# Convert all words to lowercase, remove punctuation, tokenise and stem
# and remove stopwords, threshold = 10%
def norm_corpus(document):

    # lowercase and remove symbols
    tokenizer = RegexpTokenizer(r'\w+')
    doc_tokens = tokenizer.tokenize(document.lower())
        
    # remove stopwords
    doc_tokens = [word for word in doc_tokens if word not in stopwords.words('english')]
        
    # stem words
    stemmer = SnowballStemmer("english")
    doc_stem = [stemmer.stem(word) for word in doc_tokens]
        
    # make tokenised text one string
    norm_doc = " ".join(doc_stem)
    
    return norm_doc

0                                            wast time
1    gps allow find place put rankin polic depart i...
2                                  great drink special
3    friend staff good food great beer select relax...
4                                        beauti restor
5                           home stage 62 theatr group
6                                   god send gear head
7                  great peopl great servic alway busi
8    sarah rock best waitress sure get compliment g...
9                                          unlead 3 42
Name: text, dtype: object

In [41]:
# Helper functions for vectorise normalised data

In [70]:
# Vectorise keywords from normalised text to vector including only nouns and adjectives
def review_vector(norm_doc):
    review_list = []

    # select all words categorised as nouns or adjectives
    # loop through each string i.e. review in the df column
    for doc in norm_doc_list:
        review_keyword_list = []
        doc = nltk.word_tokenize(doc)
        # create tuple for each word in list: (word, tag)
        token_category = nltk.pos_tag(doc)
        
        
        for word, tag in token_category:    
            
            # nouns
            if (tag == 'NN' or tag == 'NNS' or tag == 'NNP' or tag == 'NNPS'):
                review_keyword_list.append(word)
                
            # adjectives
            elif (tag == 'JJ' or tag == 'JJS' or tag == 'JJP' or tag == 'JJPS'):
                review_keyword_list.append(word)
            else:
                pass     
        
        review_keywords = " ".join(review_keyword_list)
        review_list.append(review_keywords)
        
        # vectorise string
        WORD = re.compile(r'\w+')
        review_vector = [Counter(WORD.findall(word)) for word in review_list]
    
    
    return review_vector
    
output_df = bus_df.copy()
output_df.text = output_df.text.apply(lambda x: norm_corpus(x))
output_df.text = output_df.text.apply(lambda x: review_vector(x))
output_df.text[1]

[Counter({u'g': 1}),
 Counter({u'p': 1}),
 Counter({u's': 1}),
 Counter(),
 Counter(),
 Counter({u'l': 1}),
 Counter({u'l': 1}),
 Counter({u'o': 1}),
 Counter({u'w': 1}),
 Counter(),
 Counter({u'f': 1}),
 Counter({u'i': 1}),
 Counter({u'n': 1}),
 Counter({u'd': 1}),
 Counter(),
 Counter({u'p': 1}),
 Counter({u'l': 1}),
 Counter(),
 Counter({u'c': 1}),
 Counter({u'e': 1}),
 Counter(),
 Counter({u'p': 1}),
 Counter({u'u': 1}),
 Counter({u't': 1}),
 Counter(),
 Counter({u'r': 1}),
 Counter(),
 Counter({u'n': 1}),
 Counter({u'k': 1}),
 Counter({u'i': 1}),
 Counter({u'n': 1}),
 Counter(),
 Counter({u'p': 1}),
 Counter({u'o': 1}),
 Counter({u'l': 1}),
 Counter({u'i': 1}),
 Counter({u'c': 1}),
 Counter(),
 Counter({u'd': 1}),
 Counter({u'e': 1}),
 Counter({u'p': 1}),
 Counter(),
 Counter({u'r': 1}),
 Counter({u't': 1}),
 Counter(),
 Counter({u'i': 1}),
 Counter({u'n': 1}),
 Counter({u's': 1}),
 Counter({u't': 1}),
 Counter({u'e': 1}),
 Counter(),
 Counter({u'd': 1}),
 Counter(),
 Counter({u'd

In [217]:
output_df = bus_df[['business_id', 'user_id', 'date']]  
output_df['tip'] = norm_corpus(bus_df.text)
output_df.tip = review_vector(output_df.tip)
output_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,business_id,user_id,date,tip
0,cE27W9VPgO88Qxe4ol6y_g,-6rEfobYjMxpUWLNxszaxQ,2013-04-18,"{u'wast': 1, u'time': 1}"
1,mVHrayjG3uZ_RLHkLj-AMg,EZ0r9dKKtEGVx2CdnowPCw,2013-01-06,"{u'depart': 1, u'polic': 1, u'rankin': 1, u'di..."
2,KayYbHCt-RkbGcPdGOThNg,xb6zEQCw9I-Gl0g06e1KsQ,2013-12-03,"{u'great': 1, u'special': 1}"
3,KayYbHCt-RkbGcPdGOThNg,QawZN4PSW7ng_9SP7pjsVQ,2015-07-08,"{u'great': 1, u'good': 1, u'food': 1, u'beer':..."
4,1_lU0-eSWJCRvNGk78Zh9Q,MLQre1nvUtW-RqMTc4iC9A,2015-10-25,"{u'beauti': 1, u'restor': 1}"
5,1_lU0-eSWJCRvNGk78Zh9Q,bvu13GyOUwhEjPum2xjiqQ,2015-01-06,"{u'home': 1, u'theatr': 1, u'group': 1, u'stag..."
6,_qopVQ6_Mz6W7-Pmbi56GQ,bvu13GyOUwhEjPum2xjiqQ,2013-02-13,"{u'god': 1, u'head': 1, u'send': 1, u're': 1}"
7,_qopVQ6_Mz6W7-Pmbi56GQ,_QFom7aSHKNCDsNXKd-3xQ,2010-08-27,"{u'peopl': 1, u'great': 2, u'busi': 1, u'servi..."
8,wJr6kSA5dchdgOdwH6dZ2w,fvTivrsJoUMYXnOJw9wZfw,2013-07-22,"{u'sarah': 1, u'sure': 1, u'best': 1, u'glass'..."
9,Cdcus0NADzyY3XiJM2O5Sg,bvu13GyOUwhEjPum2xjiqQ,2011-10-12,{u'unlead': 1}


In [225]:
output_df = bus_df[['business_id', 'user_id', 'date', 'stars', 'votes']]
output_df['review'] = norm_corpus(bus_df.text)
print "review text normalised, next: vectorise"
output_df.review = review_vector(output_df.review)
output_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


review text normalised, next: vectorise


Unnamed: 0,business_id,user_id,date,stars,votes,review
0,5UmKMjUEUNdYWqANhGckJw,PUFPaY9KxDAcGqfsorJp3Q,2012-08-01,4,"{u'funny': 0, u'useful': 0, u'cool': 0}","{u'fashion': 1, u'old': 1, u'hoagi': 2, u'burg..."
1,5UmKMjUEUNdYWqANhGckJw,Iu6AxdBYGR4A0wspR9BYHA,2014-02-13,5,"{u'funny': 0, u'useful': 0, u'cool': 0}","{u'machin': 1, u'tradit': 1, u'use': 1, u'food..."
2,5UmKMjUEUNdYWqANhGckJw,auESFwWvW42h6alXgFxAXQ,2015-10-31,5,"{u'funny': 0, u'useful': 0, u'cool': 0}","{u'pennysav': 1, u'win': 1, u'realli': 1, u'se..."
3,UsFtqoBl7naz8AVUBZMjQQ,uK8tzraOp4M5u3uYrqIBXg,2013-11-08,5,"{u'funny': 0, u'useful': 0, u'cool': 0}","{u'dine': 1, u'simpli': 1, u'at': 1, u'famili'..."
4,UsFtqoBl7naz8AVUBZMjQQ,I_47G-R2_egp7ME5u_ltew,2014-03-29,3,"{u'funny': 0, u'useful': 0, u'cool': 0}","{u'anytim': 1, u'item': 1, u'crispi': 1, u'res..."
5,UsFtqoBl7naz8AVUBZMjQQ,PP_xoMSYlGr2pb67BbqBdA,2014-10-29,1,"{u'funny': 0, u'useful': 0, u'cool': 0}","{u'butter': 1, u'water': 1, u'good': 2, u'mayb..."
6,UsFtqoBl7naz8AVUBZMjQQ,JPPhyFE-UE453zA6K0TVgw,2014-11-28,4,"{u'funny': 0, u'useful': 0, u'cool': 0}","{u'fish': 2, u'cheap': 1, u'sandwich': 1, u'ho..."
7,3eu6MEFlq2Dg7bQh8QbdOg,2d5HeDvZTDUNVog_WuUpSg,2014-02-27,5,"{u'funny': 0, u'useful': 0, u'cool': 0}","{u'issu': 1, u'realli': 1, u'high': 1, u'mecha..."
8,3eu6MEFlq2Dg7bQh8QbdOg,BShxMIUwaJS378xcrz4Nmg,2015-06-16,5,"{u'funny': 0, u'useful': 0, u'cool': 0}","{u'shop': 2, u'neighborhood': 1, u'fair': 1, u..."
9,cE27W9VPgO88Qxe4ol6y_g,fhNxoMwwTipzjO8A9LFe8Q,2012-08-19,3,"{u'funny': 0, u'useful': 1, u'cool': 0}","{u'rang': 3, u'golf': 1, u'close': 1, u'open':..."


In [45]:
import string
document = "string. With. Punctuation?"
doc = document.translate(string.maketrans("",""), string.punctuation)

In [65]:
snowball_stemmer = SnowballStemmer('english')
test = tokenizer.tokenize('hello there hi')
snowball_stemmer.stem('annoying things')

u'annoying th'