In [3]:
import pandas as pd
import numpy as np
import feather
import feather
import json
import re
import nltk
import unicodedata

from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


# Read the filtered tip data (feather file) into a pandas dataframe.
# The second positional parameter of feather.read_dataframe is `columns`,
# not a file mode, so the stray 'rb' argument is dropped and every column
# is loaded.
read_df = feather.read_dataframe('../parsed_data/filtered_tip_data.feather')

In [5]:
# Work on the first 300 rows only while developing the pipeline.
test = read_df.iloc[:300]

In [16]:
# Helper functions for normalising text data

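# Convert all words to lowercase, remove punctuation by tokenising,
# remove stopwords, then stem the remaining tokens.
# NOTE(review): "threshold = 10%" was stated here, but no threshold is applied in norm_corpus below.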
def norm_corpus(document):
    """Normalise one raw text document into a space-joined string of stems.

    Steps, in order: decode UTF-8 bytes to text (if needed), lowercase,
    tokenise into runs of word characters (which drops punctuation),
    remove NLTK English stopwords, and Snowball-stem what is left.

    Parameters
    ----------
    document : bytes or text
        The raw document. Byte strings are decoded as UTF-8; text that is
        already decoded is used as-is.

    Returns
    -------
    text
        The stemmed, stopword-free tokens joined by single spaces. An input
        with no word characters yields an empty string.

    Raises
    ------
    UnicodeDecodeError
        If ``document`` is a byte string that is not valid UTF-8.
    """
    # Only byte strings need decoding. Calling .decode() on already-decoded
    # text triggers an implicit ASCII encode on Python 2 (failing on any
    # non-ASCII character) and does not exist at all on Python 3.
    if isinstance(document, bytes):
        document = document.decode('utf-8')

    # lowercase and remove symbols
    tokenizer = RegexpTokenizer(r'\w+')
    doc_tokens = tokenizer.tokenize(document.lower())

    # remove stopwords: build the lookup set once per document instead of
    # re-reading the stopword list for every single token
    english_stopwords = set(stopwords.words('english'))
    doc_tokens = [word for word in doc_tokens if word not in english_stopwords]

    # stem words
    stemmer = SnowballStemmer("english")
    doc_stem = [stemmer.stem(word) for word in doc_tokens]

    # make tokenised text one string
    return " ".join(doc_stem)


# Vectorise keywords from normalised text to vector including only nouns and adjectives
def review_vector(norm_doc):
    """Count the noun and adjective tokens of a normalised document.

    The text is tokenised with ``nltk.word_tokenize``, tagged with
    ``nltk.pos_tag``, and only tokens tagged as nouns or adjectives are kept.
    The kept tokens are then counted.

    Parameters
    ----------
    norm_doc : text
        A normalised document (e.g. the output of ``norm_corpus``).

    Returns
    -------
    dict
        Mapping of keyword -> number of occurrences. Empty when no token is
        tagged as a noun or adjective.
    """
    # Penn Treebank tags. The adjective tags are JJ / JJR / JJS; the
    # previously used 'JJP' and 'JJPS' do not exist in that tagset, which
    # meant comparative adjectives (JJR) were silently dropped.
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    adjective_tags = ('JJ', 'JJR', 'JJS')
    keyword_tags = frozenset(noun_tags + adjective_tags)

    # (word, tag) tuple for every token of the document
    token_category = nltk.pos_tag(nltk.word_tokenize(norm_doc))

    review_keywords = " ".join(
        word for word, tag in token_category if tag in keyword_tags
    )

    # vectorise string: re-split on word characters so that any non-word
    # characters inside a kept token are discarded before counting
    keyword_counts = Counter(re.findall(r'\w+', review_keywords))

    return dict(keyword_counts)

In [8]:
# Column names available in the sampled frame
list(test.columns)

['business_id',
 'date',
 'likes',
 'text',
 'type',
 'user_id',
 'latitude',
 'longitude',
 'name',
 'city',
 'stars',
 'review_count',
 'food_drink']

In [17]:
## Tip data
# Select the working columns with .loc (the .ix indexer is deprecated and
# removed in later pandas releases) and take an explicit copy so the column
# assignments below never write into a slice of `test`.
output_df = test.loc[:, ['business_id', 'date', 'likes', 'text', 'type',
                         'user_id', 'latitude', 'longitude', 'name', 'city',
                         'stars', 'review_count', 'food_drink']].copy()

# Step 1: normalise the raw tip text (lowercase, tokenise, drop stopwords, stem)
output_df['text'] = test['text'].apply(norm_corpus)
# Parenthesised form prints the same text on Python 2 and Python 3
print("tip text normalised, next: vectorise")
# Step 2: replace each normalised string with its keyword-count dict
output_df['text'] = output_df['text'].apply(review_vector)

## Review data (alternative input, currently disabled)
# bus_df = bus_df[bus_df.stars > 3]
# output_df = bus_df.loc[:, ['business_id', 'user_id', 'date', 'stars', 'votes', 'text']].copy()
# output_df['text'] = output_df['text'].apply(norm_corpus)
# print("tip text normalised, next: vectorise")
# output_df['text'] = output_df['text'].apply(review_vector)
# output_df.head()

tip text normalised, next: vectorise


In [18]:
# Preview the first five processed tips
output_df.head(5)

Unnamed: 0,business_id,date,likes,text,type,user_id,latitude,longitude,name,city,stars,review_count,food_drink
0,wqu7ILomIOPSduRwoWp4AQ,2011-12-17,0,{u'grit': 1},tip,bvu13GyOUwhEjPum2xjiqQ,40.39114,-80.073788,Denny's,Pittsburgh,4.0,9,1
1,wqu7ILomIOPSduRwoWp4AQ,2014-04-27,0,"{u'hair': 1, u'second': 1, u'time': 2, u'food'...",tip,UxfFAw2-cTpeWvRROF1HEw,40.39114,-80.073788,Denny's,Pittsburgh,4.0,9,1
2,wqu7ILomIOPSduRwoWp4AQ,2015-06-15,0,"{u'food': 1, u'servic': 1, u'good': 1}",tip,Dmvqb5TVcfHq8TMW20zJww,40.39114,-80.073788,Denny's,Pittsburgh,4.0,9,1
3,8Nm_jcCYtMXYW0ODSHDiXA,2015-05-13,0,"{u'food': 1, u'tri': 1, u'alll': 1, u'best': 1}",tip,geqTlvuRIXV3kUUVnx2zzA,40.440004,-80.090911,Sapporo Japanese Steakhouse,Pittsburgh,4.5,7,1
4,_jsJFrAmFVPRio0eEVExbA,2013-02-02,0,"{u'good': 1, u'mcd': 1, u'drive': 1, u'coffe':...",tip,_BV9_YrP3sQlNVzaJo2z_w,40.492067,-80.06235,McDonald's,Pittsburgh,2.0,6,1


In [6]:
## Group tips by calendar month (year, month) and sum the numeric columns
## i.e. -> per-month totals of the numeric tip columns

# Make the tip date the index. No explicit `format` is passed: the dates in
# the recorded output are ISO formatted (YYYY-MM-DD), which the previous
# '%m/%d/%Y' format string does not match; pandas parses ISO dates natively.
output_df.index = pd.to_datetime(output_df.date)

# Sum per (year, month). Only numeric columns are aggregated: the id/name
# string columns and the dict-valued `text` column cannot be summed
# meaningfully.
monthly = (output_df.select_dtypes(include=[np.number])
                    .groupby([output_df.index.year, output_df.index.month])
                    .sum())

In [33]:
# Write the frame `new_df` to the parsed-tip feather file.
# NOTE(review): `new_df` is first assigned in a later cell (execution count 37,
# this cell is 33), so on a fresh top-to-bottom run this line raises NameError.
# Presumably the processed tip frame was meant to be written here — confirm
# which frame should be persisted before relying on this output file.
feather.write_dataframe(new_df, '../parsed_data/parsed_tip_data.feather')

In [37]:
# Start from an empty frame (no rows, no columns)
new_df = pd.DataFrame(data=None)

In [43]:
# Scratch experiment: assign a dict of (empty) dicts to a new column.
# NOTE(review): the recorded output below shows the column holding the key
# strings rather than the dict values, so this does not store dicts per row;
# the exact result of assigning a dict to a column may depend on the pandas
# version — verify before building on it.
new_df['test'] = {'test': dict(), 'also': dict()}

In [44]:
# Display the scratch frame to inspect the result of the dict assignment
new_df

Unnamed: 0,test
also,test
test,also
