In [1]:
import json
import pandas as pd
from pprint import pprint
from collections import Counter, defaultdict
from itertools import chain
import re

import arrow 

import googlemaps

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go

import cufflinks as cf

import spacy
import scattertext as st
from scattertext import word_similarity_explorer

from gender import GenderDetector

In [2]:
nlp = spacy.load('en')

In [3]:
init_notebook_mode(connected=True)

In [4]:
class T:
    
    def __init__(self):

        self.r = json.load(open('data/reviews_brisbane.json'))
        self.u = json.load(open('data/users_brisbane.json'))
        self.a = json.load(open('data/attractions_brisbane.json'))
        
        # some stats
        review_ids = set()
        attr_ids = set()
        user_names = set()
        dates_exp = set()
        
        for r in self.r:
            
            review_ids.add(r['id'])
            attr_ids.add(r['attr_id'])
            user_names.add(r['by_user'])
            
            if r['date_of_experience']:
                dates_exp.add(arrow.get(r['date_of_experience'], 'MM/YYYY'))
        
        self.user_stats = defaultdict(list)
        
        for u in self.u:
            for attr in 'tags age gender name'.split():   
                if u[attr]:
                    self.user_stats[attr].append(u[attr])
                
        print('data: {:,} reviews for {:,} attractions by {:,} users\nreviews written between {} and {}' \
                  .format(len(review_ids), len(attr_ids), len(user_names), 
                          min(dates_exp).format("MMM, YYYY"), max(dates_exp).format("MMM, YYYY")))
        
        for attr in 'tags age gender'.split():
            print('users with {}: {:,} ({:.1f})%'.format(attr, len(self.user_stats[attr]), 
                                                           100*len(self.user_stats[attr])/len(self.user_stats["name"])))

        # convert everything to pandas
        
        self.u_df = pd.DataFrame(self.u).dropna(subset=['name'])
        self.r_df = pd.DataFrame(self.r).dropna(subset=['attr_id'])
              
        self.u_all_tags = [tg for tg in chain.from_iterable(self.u_df['tags'])]
        self.tag_cols = list(set(self.u_all_tags))
              
        self.gmaps = googlemaps.Client(key=open('creds/geocoding_api.key').readline().strip())
        
        self.gd = GenderDetector()
        
    def _tags_to_cols(self, tag_list):
        
        if not tag_list:
            return [None]*len(self.tag_cols)
        
        return ['yes' if tag in tag_list else 'no' for tag in self.tag_cols]
    
    def tags_to_cols(self):
        
        self.u_df = pd.concat([self.u_df, 
                            pd.DataFrame(self.u_df['tags'].apply(self._tags_to_cols).to_list(), 
                                         columns=self.tag_cols)], axis=1).drop('tags', axis=1)
        
        return self

              
    def _fix_location(self, s):
              
        """
        using Google Geocoding API to clarify users location
        """
        
        loc = dict()
        
        if not (isinstance(s, str) and s.strip()):
            return loc
        
        geocode_result = self.gmaps.geocode(s)
        
        # take only the top result
        if geocode_result:
            res = geocode_result[0]
        else:
            print(f'geocoding api can\'t find this location: {s}!')
            return loc

        try:
            loc = {'location': res['formatted_address'],
                   'coordinates': res['geometry']['location'],
                   'locality': ''.join([_['long_name'] for _ in res['address_components'] if 'locality' in _['types']]),
                   'country': ''.join([_['long_name'] for _ in res['address_components'] if 'country' in _['types']])}
        except:
            print('locationd fields couldn\'t be retrieved from geocoding result!')
              
        return loc
    
    def impute_location(self):
        
        localities = []
        countries = []
        
        for row in self.u_df.iterrows():
            
            r = self._fix_location(row[1].location) 
            
            if 'country' in r:
                countries.append(r['country'])
            elif 'australia' in set(row[1].location.lower().split()):
                countries.append('australia')
            elif 'england' in set(row[1].location.lower().split()):
                countries.append('united kingdom')
            else:
                countries.append(None)
            if 'locality' in r:
                localities.append(r['locality'])
            else:
                localities.append(None)
        
        self.u_df['locality'] = localities
        self.u_df['country'] = countries
        
        return self
    
    def _impute_gender(self, s):
        
        # separate nicknames like TrevorJ into trevor J; or Mike23 into Mike 23
        s = re.sub(r'([a-z]{1})([A-Z0-9]+)', r'\1 \2', s)
        
        return self.gd.gender(s)
    
    def impute_gender(self):
        
        self.u_df['gender'] = self.u_df['gender'] \
                                .apply(lambda s: s if str(s) in 'm f'.split() else self._impute_gender(str(s)))
        return self
    
    def merge_data(self):
        
        self.data = self.r_df.join(self.u_df.set_index('name'), on='by_user', how='inner')
        self.data.to_csv('data/brisb.csv', index=False)
        
        return self
    
    def selector(self, dk):

        if not (set(dk) <= set(t.data.columns)):
            print('wrong segments!')
            raise Exception()
        
        out = t.data
    
        for k in dk:
            out = out[out[k] == dk[k]]
        
        if not out.empty:
            return out
        else:
            print('empty result!')
            raise Exception()

In [5]:
if __name__ == '__main__':
    
    t = T().tags_to_cols().merge_data()

data: 21,564 reviews for 6 attractions by 15,166 users
reviews written between Apr, 2010 and Feb, 2019
users with tags: 3,064 (20.2)%
users with age: 5,240 (34.6)%
users with gender: 5,097 (33.6)%


In [6]:
docs = [nlp(review) for review in t.data.text.tolist()]

In [7]:
ents = [d.ents for d in docs]

In [9]:
ents[:12]

[(Southbank, Koala, 20),
 (dusk,),
 (Koala, Koalas, hundred),
 (Australian,),
 (Koala,),
 (),
 (Brisbane,),
 (22 years ago, one, one),
 (),
 (CBD, Australian, Kangaroo, the afternoon, The Sheep Dog Show),
 (the Brisbane River,),
 (about 7 miles,
  Brisbane,
  first,
  1927,
  130,
  100,
  Australian,
  about $20,
  US,
  early in the morning,
  up to 30 minutes,
  Tasmanian Devil,
  early in the morning,
  daily 9am to 5pm)]

In [18]:
for d in docs:
    print(d.vocab)
    print(dir(d))
    print(d.text)
    print(d.ents)
    print([e.label_ for e in d.ents])
    break

<spacy.vocab.Vocab object at 0x11b9c69c8>
['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '_bulk_merge', '_py_tokens', '_realloc', '_vector', '_vector_norm', 'cats', 'char_span', 'count_by', 'doc', 'ents', 'extend_tensor', 'from_array', 'from_bytes', 'from_disk', 'get_extension', 'get_lca_matrix', 'has_extension', 'has_vector', 'is_parsed', 'is_sentenced', 'is_tagged', 'mem', 'merge', 'noun_chunks', 'noun_chunks_iterator', 'print_tree', 'remove_extension', 'retokenize', 'sentiment', 'sents', 'set_extension', 'similarity', 'tensor', 'text', 'text_with_ws', 'to_array', 'to_bytes', 'to_disk', 'user_data', 'user_hooks', 'user_span_hook

In [None]:
# NER separately
from spacy.pipeline import EntityRecognizer

In [6]:
t.data.head()

Unnamed: 0,attr_id,by_user,date_of_experience,date_of_writing,id,rating,text,title,age,gender,...,60+ traveller,luxury traveller,art and architecture lover,thrifty traveller,family holiday maker,like a local,history buff,beach goer,urban explorer,vegetarian
0,d256511,Gilliebean63,07/2018,13/02/2019,651830536,5.0,"Easy access by bus, or a boat which leaves fro...",Koala's and kangaroos,,,...,,,,,,,,,,
12486,d1206454,Gilliebean63,11/2018,13/02/2019,651832420,5.0,Great place to take visitors to our fair city....,Brisbane ity views,,,...,,,,,,,,,,
1,d256511,Rob I,02/2019,13/02/2019,651820141,4.0,This is a nice little place to visit and the s...,Nice place but poor Koalas,50-64,m,...,no,yes,no,no,no,no,yes,no,no,no
2,d256511,Julio974T,01/2019,13/02/2019,651817806,5.0,Great park-zoo with all the Australian animals...,Australian fauna close the city,,,...,,,,,,,,,,
3,d256511,PooleCollectors,09/2018,13/02/2019,651758533,5.0,You can even have your photo taken with a Koal...,Wonderful caring place,,,...,,,,,,,,,,


In [127]:
t.data.to_csv('brisb.csv', index=False)

In [44]:
Counter(t.u_all_ages)

Counter({'50-64': 2429,
         '35-49': 1688,
         '25-34': 910,
         '18-24': 202,
         '13-17': 11})

In [59]:
seg1 = t.selector({'gender': 'm', 'age': '35-49'})[['title', 'text']]
seg2 = t.selector({'gender': 'f', 'age': '50-64'})[['title', 'text']]

In [60]:
seg1['segment'] = '1'
seg2['segment'] = '2'

In [61]:
df = pd.concat([seg1, seg2])

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3046 entries, 16 to 21469
Data columns (total 3 columns):
title      3046 non-null object
text       3046 non-null object
segment    3046 non-null object
dtypes: object(3)
memory usage: 95.2+ KB


In [63]:
corpus = st.CorpusFromPandas(df, category_col='segment', text_col='text', nlp=nlp).build()

In [64]:
# term frequency data frame; note that terms become index!
term_freq_df = corpus.get_term_freq_df()

In [69]:
print(term_freq_df.head())

         1 freq  2 freq
term                   
we          487    1348
visited      50      81
lone        110     149
pine        109     148
koala       265     367


In [73]:
# get scaled f-scores for each term and place in new column
term_freq_df['seg1_score'] = corpus.get_scaled_f_scores('1')
term_freq_df['seg2_score'] = corpus.get_scaled_f_scores('2')

In [74]:
term_freq_df.head()

AttributeError: 'LazyConfigValue' object has no attribute 'lower'

         1 freq  2 freq  seg1_score  seg2_score
term                                           
we          487    1348    0.058948    0.941052
visited      50      81    0.884160    0.115840
lone        110     149    0.938651    0.061349
pine        109     148    0.938323    0.061677
koala       265     367    0.935692    0.064308

In [48]:
term_freq_df['vegetarians'] = corpus.get_scaled_f_scores('yes')
term_freq_df['non-vegetarians'] = corpus.get_scaled_f_scores('no')

In [83]:
# top 10 
print(term_freq_df.sort_values(by='seg1_score', ascending=False).iloc[:10])

            1 freq  2 freq  seg1_score  seg2_score
term                                              
this place      60      40    1.000000    0.000000
zoo             62      47    0.991748    0.008252
kids           122     108    0.989334    0.010666
a nice          74      67    0.983243    0.016757
most            83      77    0.983050    0.016950
the place       48      30    0.978767    0.021233
koalas         201     212    0.970127    0.029873
cbd             67      68    0.967087    0.032913
kangaroo        87      95    0.965207    0.034793
lot             52      45    0.962620    0.037380


In [84]:
print(term_freq_df.sort_values(by='seg2_score', ascending=False).iloc[:10])

           1 freq  2 freq  seg1_score  seg2_score
term                                             
lovely         45     381    0.000000    1.000000
a lovely       22     155    0.007498    0.992502
on and         28     139    0.024315    0.975685
visitors       53     232    0.026315    0.973685
hop            34     153    0.026879    0.973121
southbank      34     148    0.029375    0.970625
city cat       52     209    0.031320    0.968680
and off        14     102    0.031347    0.968653
off           100     390    0.033129    0.966871
travel         40     155    0.035362    0.964638


In [88]:
html = st.produce_scattertext_explorer(corpus, category='1', 
                                       category_name='Segment 1',  # for presentation only
                                       not_category_name='2',
                                       width_in_pixels=1000, 
                                       metadata=df['segment'])


read_table is deprecated, use read_csv instead, passing sep='\t'.



In [89]:
open("segs.html", 'wb').write(html.encode('utf-8'))

2027066

In [90]:
feat_builder = st.FeatsFromOnlyEmpath()

In [91]:
empath_corpus = st.CorpusFromParsedDocuments(df, 
                                             category_col='segment', 
                                             feats_from_spacy_doc=feat_builder, 
                                             parsed_col='text').build()

In [93]:
html = st.produce_scattertext_explorer(empath_corpus, 
                                       category='1', 
                                       category_name='Segment 1',
                                       not_category_name='Segment 2',
                                       width_in_pixels=1000,
                                       metadata=df['segment'],
                                       use_non_text_features=True,
                                       use_full_doc=True,
                                       topic_model_term_lists=feat_builder.get_top_model_term_lists())

In [94]:
open("segs_empath.html", 'wb').write(html.encode('utf-8'))

2409083

In [95]:
html = word_similarity_explorer(corpus,
category='1',
category_name='Segment 1',
not_category_name='Segment 2',
target_term='koala',
minimum_term_frequency=5,
pmi_threshold_coefficient=4,
width_in_pixels=1000,
metadata=df['segment'],
alpha=0.01,
max_p_val=0.05,
save_svg_button=True)


read_table is deprecated, use read_csv instead, passing sep='\t'.



In [96]:
open("similarity_koala.html", 'wb').write(html.encode('utf-8'))

2155644