In [155]:
import spacy
import scattertext as st
import spacy
import json
import pandas as pd
from pprint import pprint
from collections import Counter
from itertools import chain

from google.oauth2 import service_account
from apiclient.discovery import build

import googlemaps

In [159]:
class T:
    """
    Container for the Perth review data set plus a Google Geocoding client.

    On construction it loads the reviews, users and attractions JSON dumps,
    prints a short summary of how complete the user profiles are, and
    creates the geocoding client used by `_fix_location`.

    Attributes:
        r: parsed content of data/perth/reviews_perth.json
        u: parsed content of data/perth/users_perth.json
        a: parsed content of data/perth/attractions_perth.json
        gmaps: googlemaps.Client built from the key in creds/geocoding_api.key
    """

    def __init__(self):
        """
        Load the data files, print profile statistics and set up the geocoder.

        Raises:
            OSError: if a data file or the API key file cannot be opened.
        """

        self.r = self._load_json('data/perth/reviews_perth.json')
        self.u = self._load_json('data/perth/users_perth.json')
        self.a = self._load_json('data/perth/attractions_perth.json')

        # calculate some stats

        u_names = {_['name'] for _ in self.u}
        u_ages = [_['age'] for _ in self.u if _['age']]
        u_gnds = [_['gender'] for _ in self.u if _['gender']]
        u_tags = [_['tags'] for _ in self.u if _['tags']]
        u_locs = [_['location'] for _ in self.u if _['location']]

        # Percentages are relative to the number of *distinct* user names,
        # while the counts above are taken over every user record.
        total = len(u_names)

        def pct(count):
            # guard against an empty user list instead of dividing by zero
            return 100 * count / total if total else 0.0

        print(f'total users: {total:,}\n{"-" * 19}')
        print(f'age: {len(u_ages)} ({pct(len(u_ages)):.1f}%)')
        print(f'gender: {len(u_gnds)} ({pct(len(u_gnds)):.1f}%)')
        print(f'tags: {len(u_tags)} ({pct(len(u_tags)):.1f}%)')
        print(f'locations: {len(u_locs)} ({pct(len(u_locs)):.1f}%)')

        # each user's 'tags' entry is itself a collection; flatten them all
        u_all_tags = list(chain.from_iterable(u_tags))

        print('\nmost popular traveller tags:\n')
        for tag, count in Counter(u_all_tags).most_common():
            print(f'{tag}: {round(pct(count), 1)}%')

        # the API key is the first line of the key file
        with open('creds/geocoding_api.key') as key_file:
            api_key = key_file.readline().strip()

        self.gmaps = googlemaps.Client(key=api_key)

    @staticmethod
    def _load_json(path):
        """
        Read and parse one JSON file, closing the file handle afterwards.
        """
        with open(path, encoding='utf-8') as fh:
            return json.load(fh)

    def _fix_location(self, s):

        """
        using Google Geocoding API to clarify users location

        Args:
            s: free-text location string as entered by the user.

        Returns:
            dict with keys 'location' (formatted address), 'coordinates',
            'locality' and 'country' taken from the top geocoding result,
            or None when the API returns no result for `s`.
        """

        # use the client created in __init__, not a notebook-level global
        geocode_result = self.gmaps.geocode(s)

        if not geocode_result:
            print(f'geocoding api can\'t find this location: {s}!')
            return None

        # take only the top result
        res = geocode_result[0]
        components = res['address_components']

        def long_names(kind):
            # concatenate the long names of all components tagged with `kind`
            return ''.join(c['long_name'] for c in components if kind in c['types'])

        return {'location': res['formatted_address'],
                'coordinates': res['geometry']['location'],
                'locality': long_names('locality'),
                'country': long_names('country')}

In [160]:
# Instantiate T: its constructor loads the Perth JSON files, prints the
# user-profile statistics shown below and creates the geocoding client.
if __name__ == '__main__':
    
    t = T()

total users: 11,773
-------------------
age: 4337 (36.8%)
gender: 4246 (36.1%)
tags: 2461 (20.9%)
locations: 10517 (89.3%)

most popular traveller tags:

foodie: 10.7%
like a local: 8.4%
nature lover: 5.9%
urban explorer: 5.9%
history buff: 4.7%
60+ traveller: 4.6%
art and architecture lover: 4.5%
peace and quiet seeker: 4.2%
family holiday maker: 2.8%
shopping fanatic: 2.6%
luxury traveller: 2.6%
thrill seeker: 2.5%
beach goer: 2.2%
thrifty traveller: 2.1%
eco-tourist: 1.0%
vegetarian: 0.9%
nightlife seeker: 0.8%
backpacker: 0.5%
trendsetter: 0.3%


In [162]:
# Try the geocoding helper on a free-text location; per the recorded output
# the API resolves this input to Hornsby, NSW.
t._fix_location('harnsby australia')

{'location': 'Hornsby NSW 2077, Australia',
 'coordinates': {'lat': -33.70489999999999, 'lng': 151.09901},
 'locality': 'Hornsby',
 'country': 'Australia'}

In [94]:
# Collect (name, gender) for every user record that has both fields set;
# names are stripped of surrounding whitespace.
gendered_users = []

for rec in t.u:
    if not (rec['name'] and rec['gender']):
        continue
    gendered_users.append((rec['name'].strip(), rec['gender']))

name = [user_name for user_name, user_gender in gendered_users]
gender = [user_gender for user_name, user_gender in gendered_users]
age = []  # declared but never filled in this cell

user_df = pd.DataFrame({'name': name,
                        'gender': gender})

In [95]:
# Number of non-missing values in each column of user_df.
user_df.count()

name      4246
gender    4246
dtype: int64

In [96]:
# Keep only reviews where author, text and rating are all truthy, then
# turn the three parallel columns into a DataFrame.
complete_reviews = []

for rec in t.r:
    if not (rec['by_user'] and rec['text'] and rec['rating']):
        continue
    complete_reviews.append((rec['by_user'], rec['text'], rec['rating']))

by_user = [author for author, body, stars in complete_reviews]
text = [body for author, body, stars in complete_reviews]
rating = [stars for author, body, stars in complete_reviews]

review_df = pd.DataFrame({'by_user': by_user,
                          'text': text,
                          'rating': rating})

In [100]:
# Set the 'parent_appname' entry of the kernel's IPKernelApp config to "".
# If the config has no 'IPKernelApp' section, .get() returns a throwaway dict
# and this line has no effect.
# NOTE(review): presumably a workaround for a library that inspects this
# setting to detect the notebook front end — TODO confirm which one needs it.
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [114]:
# Look up each review author's gender by name, keep only reviews whose
# author is marked "m" or "f", and retain just the gender and text columns.
gender_by_name = user_df.set_index('name')
reviews_with_gender = review_df.join(gender_by_name, on='by_user')
df = reviews_with_gender.query('gender in ("m", "f")')[['gender', 'text']]

In [115]:
# Preview the first rows of the gender/text frame.
df.head()

Unnamed: 0,gender,text
0,m,This is large open park with many well kept wa...
3,m,Kings Park is the must see attraction in Perth...
7,m,When we visited the park we didn’t have time t...
9,f,My husband and I biked along the water and too...
11,m,We first went through the Park on the hop on -...


In [116]:
# Load the English spaCy pipeline used to parse the review texts.
# The bare 'en' name is a shortcut link that only exists on spaCy 2.x
# installs; spaCy 3 dropped shortcut links and raises OSError for it, so
# fall back to the full package name.
# NOTE(review): assumes the shortcut pointed at the small English model — confirm.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [117]:
# Build a scattertext corpus from the review frame: 'gender' is the category
# column, 'text' the document column, parsed with the spaCy pipeline.
corpus_factory = st.CorpusFromPandas(
    df,
    category_col='gender',
    text_col='text',
    nlp=nlp,
)
corpus = corpus_factory.build()

In [118]:
# Term frequency table, extended with one scaled F-score column per gender.
term_freq_df = corpus.get_term_freq_df()
for score_col, category in (('females', 'f'), ('males', 'm')):
    term_freq_df[score_col] = corpus.get_scaled_f_scores(category)

In [119]:
# Ten terms with the highest 'females' score.
top_female_terms = term_freq_df.sort_values(by='females', ascending=False).head(10)
pprint(top_female_terms.index.tolist())

['lovely',
 'gorgeous',
 'a lovely',
 'children',
 'love',
 'play',
 'enjoyed',
 'loved',
 'a picnic',
 'picnic']


In [120]:
# Ten terms with the highest 'males' score.
top_male_terms = term_freq_df.sort_values(by='males', ascending=False).head(10)
pprint(top_male_terms.index.tolist())

['cbd',
 'world',
 'the world',
 'australia',
 'great views',
 'coins',
 'centre',
 'a good',
 'down',
 'excellent']


In [122]:
# Render the interactive scattertext explorer as an HTML string, with the
# 'f' category labelled "female" and everything else labelled "male".
explorer_options = dict(
    category='f',
    category_name='female',
    not_category_name='male',
    width_in_pixels=1000,
    metadata=df['gender'],
)
html = st.produce_scattertext_explorer(corpus, **explorer_options)

  names=['word', 'background'])


In [124]:
# Save the explorer page as UTF-8. The context manager closes (and flushes)
# the file deterministically instead of leaving the handle to the garbage
# collector; the byte count is still shown as the cell's output.
with open("kings_park_viz.html", 'wb') as viz_file:
    bytes_written = viz_file.write(html.encode('utf-8'))

bytes_written

3011462