In [26]:
# import spacy
import scattertext as st
import json
import pandas as pd
from pprint import pprint
from collections import Counter, defaultdict
from itertools import chain
import re

import arrow 

import googlemaps

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go

import cufflinks as cf

from gender import GenderDetector

In [27]:
init_notebook_mode(connected=True)

In [31]:
class T:
    
    def __init__(self):

        self.r = json.load(open('data/sydney/reviews_sydney.json'))
        self.u = json.load(open('data/sydney/users_sydney.json'))
        self.a = json.load(open('data/sydney/attractions_sydney.json'))
        
        # convert everything to pandas
        
        self.u_df = pd.DataFrame(self.u).dropna(subset=['name']).drop('attr_ids', axis=1)
        self.r_df = pd.DataFrame(self.r).dropna(subset=['attr_id'])
              
        self.u_all_tags = [tg for tg in chain.from_iterable(self.u_df['tags'])]
        self.tag_cols = list(set(self.u_all_tags))
              
        self.gmaps = googlemaps.Client(key=open('creds/geocoding_api.key').readline().strip())
        
        self.gd = GenderDetector()
        
    def _tags_to_cols(self, tag_list):
        
        if not tag_list:
            return [None]*len(self.tag_cols)
        
        return [1 if tag in tag_list else 0 for tag in self.tag_cols]
    
    def tags_to_cols(self):
        
        self.u_df = pd.concat([self.u_df, 
                            pd.DataFrame(self.u_df['tags'].apply(self._tags_to_cols).to_list(), 
                                         columns=self.tag_cols)], axis=1).drop('tags', axis=1)
        
        return self

              
    def _fix_location(self, s):
              
        """
        using Google Geocoding API to clarify users location
        """
        
        loc = dict()
        
        if not (isinstance(s, str) and s.strip()):
            return loc
        
        geocode_result = self.gmaps.geocode(s)
        
        # take only the top result
        if geocode_result:
            res = geocode_result[0]
        else:
            print(f'geocoding api can\'t find this location: {s}!')
            return loc

        try:
            loc = {'location': res['formatted_address'],
                   'coordinates': res['geometry']['location'],
                   'locality': ''.join([_['long_name'] for _ in res['address_components'] if 'locality' in _['types']]),
                   'country': ''.join([_['long_name'] for _ in res['address_components'] if 'country' in _['types']])}
        except:
            print('locationd fields couldn\'t be retrieved from geocoding result!')
              
        return loc
    
    def impute_location(self):
        
        localities = []
        countries = []
        
        for row in self.u_df.iterrows():
            
            r = self._fix_location(row[1].location) 
            
            if 'country' in r:
                countries.append(r['country'])
            elif 'australia' in set(row[1].location.lower().split()):
                countries.append('australia')
            elif 'england' in set(row[1].location.lower().split()):
                countries.append('united kingdom')
            else:
                countries.append(None)
            if 'locality' in r:
                localities.append(r['locality'])
            else:
                localities.append(None)
        
        self.u_df['locality'] = localities
        self.u_df['country'] = countries
        
        return self
    
    def _impute_gender(self, s):
        
        # separate nicknames like TrevorJ into trevor J; or Mike23 into Mike 23
        s = re.sub(r'([a-z]{1})([A-Z0-9]+)', r'\1 \2', s)
        
        return self.gd.gender(s)
    
    def impute_gender(self):
        
        self.u_df['gender'] = self.u_df['gender'] \
                                .apply(lambda s: s if str(s) in 'm f'.split() else self._impute_gender(str(s)))
        return self
    
    def merge_data(self):
        
        self.data = self.r_df.join(self.u_df.set_index('name'), on='by_user', how='inner')
        
        return self

In [32]:
if __name__ == '__main__':
    
    t = T().tags_to_cols().merge_data()

In [33]:
t.data.head()

Unnamed: 0,attr_id,by_user,date_of_experience,date_of_writing,id,rating,text,title,age,gender,...,nature lover,like a local,luxury traveller,art and architecture lover,eco-tourist,vegetarian,thrifty traveller,shopping fanatic,urban explorer,thrill seeker
0,d257278,b2buck,02/2019,12/02/2019,651598753,5.0,A UNESCO heritage site for its stand out shape...,Who doesn’t know what this is?,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,d257278,gtreadgold,02/2019,12/02/2019,651583323,5.0,I was lucky enough to get a solo ticket for th...,Worth the trip,,,...,,,,,,,,,,
2,d257278,Fel1011,02/2019,12/02/2019,651572865,5.0,Sat on a ferry from Birkenhead Point to Circul...,Beautiful,18-24,f,...,,,,,,,,,,
3,d257278,chrisdtraveller,02/2019,12/02/2019,651557612,5.0,Its nice to walk at the opera during sunset wi...,The Opera House,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,d257278,cynthiasT218QV,02/2019,12/02/2019,651529223,5.0,I heard this was a good tour and everyone who ...,A Great Tour,,,...,,,,,,,,,,


In [11]:
Counter(t.u_all_tags).most_common()

[('foodie', 1151),
 ('like a local', 858),
 ('urban explorer', 727),
 ('art and architecture lover', 595),
 ('nature lover', 561),
 ('history buff', 519),
 ('thrill seeker', 349),
 ('60+ traveller', 341),
 ('luxury traveller', 334),
 ('peace and quiet seeker', 317),
 ('family holiday maker', 276),
 ('beach goer', 261),
 ('shopping fanatic', 257),
 ('thrifty traveller', 165),
 ('vegetarian', 142),
 ('eco-tourist', 96),
 ('nightlife seeker', 95),
 ('backpacker', 63),
 ('trendsetter', 31)]

In [5]:
t.data.head(20)

Unnamed: 0,attr_id,by_user,date_of_experience,date_of_writing,id,rating,text,title,age,attr_ids,gender,location,tags
0,d257278,b2buck,02/2019,12/02/2019,651598753,5.0,A UNESCO heritage site for its stand out shape...,Who doesn’t know what this is?,,[d257278],,"Wichita, KS","[foodie, urban explorer, shopping fanatic, bea..."
1,d257278,gtreadgold,02/2019,12/02/2019,651583323,5.0,I was lucky enough to get a solo ticket for th...,Worth the trip,,[d257278],,"Houston, Texas",[]
2,d257278,Fel1011,02/2019,12/02/2019,651572865,5.0,Sat on a ferry from Birkenhead Point to Circul...,Beautiful,18-24,[d257278],f,"Singapore, Singapore",[]
3,d257278,chrisdtraveller,02/2019,12/02/2019,651557612,5.0,Its nice to walk at the opera during sunset wi...,The Opera House,,[d257278],,"Sydney, Australia","[foodie, urban explorer, like a local, thrill ..."
4,d257278,cynthiasT218QV,02/2019,12/02/2019,651529223,5.0,I heard this was a good tour and everyone who ...,A Great Tour,,[d257278],,"Frederick, Maryland",[]
5,d257278,ScarlettOtto,02/2019,11/02/2019,651449584,5.0,What to say? it's one of the most famous build...,Iconic,,[d257278],,,[]
6,d257278,wombatdavid,02/2019,11/02/2019,651410428,5.0,"For the lunar New Year, there was a giant pig ...",Loved the year of the pig sculpture,35-49,[d257278],m,Tasmania,"[urban explorer, like a local, history buff, p..."
7,d257278,Helen S,02/2019,11/02/2019,651400456,5.0,What an unbelievable building. Superb in every...,Iconic,,[d257278],,"Concord, Massachusetts, United States","[foodie, 60+ traveller, luxury traveller]"
5326,d257278,Helen S,06/2016,14/02/2017,459580753,5.0,Attending a performance or just strolling arou...,A wonderful spot to visit ascacresident of Syd...,,[d257278],,"Concord, Massachusetts, United States","[foodie, 60+ traveller, luxury traveller]"
8,d257278,Steven B,12/2018,11/02/2019,651360473,4.0,A beautiful place to view the architecture and...,Beautiful,35-49,[d257278],m,"San Francisco, CA",[]


In [102]:
geocode_result = t.gmaps.geocode('Yorkshire')

In [None]:
geocode_result

In [53]:
def selector(dk):

    """
    dk is {'column': value}
    """
    
    if not (set(dk) <= set(t.data.columns)):
        print('wrong segments!')
        raise Exception()
        
    out = t.data
    
    for k in dk:
        out = out[out[k] == dk[k]]
        
    if not out.empty:
        return out
    else:
        print('empty result!')
        raise Exception()

In [58]:
female_lux_df = selector({'gender': 'f', 'luxury traveller': 1})
male_foodie_df = selector({'gender': 'm', 'foodie': 1})

In [59]:
d1 = female_lux_df[['date_of_experience', 'id']].groupby(['date_of_experience']).count().reset_index()
d1['date_of_experience'] = d1['date_of_experience'].apply(lambda x: arrow.get(x, 'MM/YYYY'))
d2 = male_foodie_df[['date_of_experience', 'id']].groupby(['date_of_experience']).count().reset_index()
d2['date_of_experience'] = d2['date_of_experience'].apply(lambda x: arrow.get(x, 'MM/YYYY'))

In [76]:
d1_scatter = go.Scatter(x=d1.date_of_experience, 
                            y=d1.id, mode='markers', 
                               marker=dict(size=12, line=dict(width=0),color="orange"),
                                name='Luxury Females',)

d2_scatter = go.Scatter(x=d2.date_of_experience, 
                            y=d2.id, mode='markers', 
                               marker=dict(size=12, line=dict(width=0),color="navy"),
                                name='Foodie Males',)

fig_data = [d1_scatter, d2_scatter]

layout_both = go.Layout(
                title='Number of Reviews for Sydney Opera House',
                hovermode='closest',
                    xaxis=dict(title='date', ticklen=5, zeroline=False, gridwidth=2),
                    yaxis=dict(title='# reviews', ticklen=5, gridwidth=2),
                    )

fig = go.Figure(data=fig_data, layout=layout_both)

In [77]:
iplot(fig)

In [117]:
corpus = st.CorpusFromPandas(df, category_col='gender', text_col='text', nlp=nlp).build()

In [118]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['females'] = corpus.get_scaled_f_scores('f')
term_freq_df['males'] = corpus.get_scaled_f_scores('m')

In [119]:
pprint(list(term_freq_df.sort_values(by='females', ascending=False).index[:10]))

['lovely',
 'gorgeous',
 'a lovely',
 'children',
 'love',
 'play',
 'enjoyed',
 'loved',
 'a picnic',
 'picnic']


In [120]:
pprint(list(term_freq_df.sort_values(by='males', ascending=False).index[:10]))

['cbd',
 'world',
 'the world',
 'australia',
 'great views',
 'coins',
 'centre',
 'a good',
 'down',
 'excellent']


In [122]:
html = st.produce_scattertext_explorer(corpus, category='f',category_name='female', 
                                       not_category_name='male', width_in_pixels=1000, metadata=df['gender'])

  names=['word', 'background'])


In [124]:
open("kings_park_viz.html", 'wb').write(html.encode('utf-8'))

3011462

In [78]:

iplot(cf.datagen.lines().iplot(asFigure=True,
                               kind='scatter',xTitle='Dates',yTitle='Returns',title='Returns'))

ValueError: 
    Invalid value of type 'builtins.str' received for the 'bgcolor' property of layout.legend
        Received value: 'pearl02'

    The 'bgcolor' property is a color and may be specified as:
      - A hex string (e.g. '#ff0000')
      - An rgb/rgba string (e.g. 'rgb(255,0,0)')
      - An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
      - An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
      - A named CSS color:
            aliceblue, antiquewhite, aqua, aquamarine, azure,
            beige, bisque, black, blanchedalmond, blue,
            blueviolet, brown, burlywood, cadetblue,
            chartreuse, chocolate, coral, cornflowerblue,
            cornsilk, crimson, cyan, darkblue, darkcyan,
            darkgoldenrod, darkgray, darkgrey, darkgreen,
            darkkhaki, darkmagenta, darkolivegreen, darkorange,
            darkorchid, darkred, darksalmon, darkseagreen,
            darkslateblue, darkslategray, darkslategrey,
            darkturquoise, darkviolet, deeppink, deepskyblue,
            dimgray, dimgrey, dodgerblue, firebrick,
            floralwhite, forestgreen, fuchsia, gainsboro,
            ghostwhite, gold, goldenrod, gray, grey, green,
            greenyellow, honeydew, hotpink, indianred, indigo,
            ivory, khaki, lavender, lavenderblush, lawngreen,
            lemonchiffon, lightblue, lightcoral, lightcyan,
            lightgoldenrodyellow, lightgray, lightgrey,
            lightgreen, lightpink, lightsalmon, lightseagreen,
            lightskyblue, lightslategray, lightslategrey,
            lightsteelblue, lightyellow, lime, limegreen,
            linen, magenta, maroon, mediumaquamarine,
            mediumblue, mediumorchid, mediumpurple,
            mediumseagreen, mediumslateblue, mediumspringgreen,
            mediumturquoise, mediumvioletred, midnightblue,
            mintcream, mistyrose, moccasin, navajowhite, navy,
            oldlace, olive, olivedrab, orange, orangered,
            orchid, palegoldenrod, palegreen, paleturquoise,
            palevioletred, papayawhip, peachpuff, peru, pink,
            plum, powderblue, purple, red, rosybrown,
            royalblue, saddlebrown, salmon, sandybrown,
            seagreen, seashell, sienna, silver, skyblue,
            slateblue, slategray, slategrey, snow, springgreen,
            steelblue, tan, teal, thistle, tomato, turquoise,
            violet, wheat, white, whitesmoke, yellow,
            yellowgreen