In [14]:
# import spacy
import scattertext as st
import json
import pandas as pd
from pprint import pprint
from collections import Counter
from itertools import chain

import arrow 

import googlemaps

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go

import cufflinks as cf

In [15]:
# Initialise plotly's offline notebook mode; connected=True is passed so that
# iplot() figures can render inline in later cells.
init_notebook_mode(connected=True)

In [52]:
class T:
    """
    Loads the Perth review / user / attraction JSON dumps, converts reviews and
    users to pandas and provides a Google Geocoding helper for user locations.

    Attributes set by ``__init__``:
        r, u, a     -- raw objects parsed from the three JSON files
        u_df        -- users DataFrame, rows without a 'name' dropped
        r_df        -- reviews DataFrame, rows without an 'attr_id' dropped
        u_all_tags  -- flat list of every tag found in the users' 'tags' lists
        rs          -- selected review columns inner-joined with the user
                       columns (review 'by_user' matched to user 'name')
        gmaps       -- googlemaps.Client built from the key file
    """

    @staticmethod
    def _load_json(path):
        """Parse the JSON file at `path` and close the file afterwards."""
        with open(path) as fh:
            return json.load(fh)

    def __init__(self):
        """Read the data files, build the DataFrames and the geocoding client."""

        self.r = self._load_json('data/perth/reviews_perth.json')
        self.u = self._load_json('data/perth/users_perth.json')
        self.a = self._load_json('data/perth/attractions_perth.json')

        # convert everything to pandas
        self.u_df = pd.DataFrame(self.u).dropna(subset=['name'])
        self.r_df = pd.DataFrame(self.r).dropna(subset=['attr_id'])

        # how complete is the data? (share of non-null values per column, in %)
        print('\nuser data availability:\n')
        print(self.u_df.count()/len(self.u_df['name'])*100)

        self.u_all_tags = list(chain.from_iterable(self.u_df['tags']))

        # attach the reviewer's profile columns to every review; the inner join
        # keeps only reviews whose 'by_user' matches a known user 'name'
        self.rs = self.r_df[['date_of_experience', 'id', 'rating', 'by_user']].join(
            self.u_df.set_index('name'), on='by_user', how='inner')

        # the API key is the first line of the credentials file
        with open('creds/geocoding_api.key') as key_file:
            api_key = key_file.readline().strip()
        self.gmaps = googlemaps.Client(key=api_key)

    def _fix_location(self, s):

        """
        using Google Geocoding API to clarify users location

        Parameters:
            s -- free-text location; anything that is not a non-blank string
                 is rejected without calling the API

        Returns:
            dict with keys 'location', 'coordinates', 'locality' and 'country'
            taken from the top geocoding result, or an empty dict when the
            input is unusable, nothing was found, or the result lacks the
            expected fields.
        """

        loc = dict()

        if not (isinstance(s, str) and s.strip()):
            return loc

        geocode_result = self.gmaps.geocode(s)

        # nothing found: report it and stop here, there is no result to unpack
        if not geocode_result:
            print(f'geocoding api can\'t find this location: {s}!')
            return loc

        try:
            # take only the top result
            res = geocode_result[0]
            components = res['address_components']
            loc = {'location': res['formatted_address'],
                   'coordinates': res['geometry']['location'],
                   'locality': ''.join([c['long_name'] for c in components if 'locality' in c['types']]),
                   'country': ''.join([c['long_name'] for c in components if 'country' in c['types']])}
        except (KeyError, IndexError, TypeError, AttributeError):
            # best effort: a malformed result yields an empty dict, not a crash
            print('locationd fields couldn\'t be retrieved from geocoding result!')

        return loc

In [53]:
# Build the analysis object; constructing T reads the JSON data files, prints
# the user-data availability table and creates the geocoding client.
if __name__ == '__main__':
    
    t = T()


user data availability:

age          36.841658
attr_ids    100.000000
gender       36.068637
location     89.339110
name        100.000000
tags        100.000000
dtype: float64


In [60]:
def _last_location_part(raw_location):
    """Return the text after the last comma of the stringified location, trimmed."""
    # NOTE(review): str() turns a missing location into a literal placeholder
    # (the recorded counts show a 'None' bucket) — confirm that is intended.
    pieces = str(raw_location).split(',')
    return pieces[-1].strip()


t.rs['country'] = t.rs['location'].apply(_last_location_part)

In [61]:
# Preview the first rows of the joined review/user table, including the
# newly derived 'country' column.
t.rs.head()

Unnamed: 0,date_of_experience,id,rating,by_user,age,attr_ids,gender,location,tags,country
0,05/2018,650627906,5.0,TrevorMP,50-64,[d256589],m,"Sydney, Australia",[],Australia
1,01/2019,650572483,5.0,PeterXaghraGozoMalta,,[d256589],,"Island of Gozo, Malta",[],Malta
2,02/2019,650526409,4.0,aroundtheworld247365,,[d256589],,"Cape Town, South Africa",[],South Africa
10312,02/2019,650526724,4.0,aroundtheworld247365,,[d256589],,"Cape Town, South Africa",[],South Africa
3,01/2019,650475561,5.0,Pete M,50-64,[d256589],m,"Malvern, United Kingdom",[urban explorer],United Kingdom


In [62]:
# Tally reviews per derived country value, most frequent first.
country_tally = Counter(t.rs['country'])
country_tally.most_common()

[('Australia', 7599),
 ('United Kingdom', 1626),
 ('None', 1625),
 ('Singapore', 487),
 ('Malaysia', 366),
 ('Perth', 279),
 ('New Zealand', 262),
 ('Canada', 158),
 ('Indonesia', 110),
 ('Western Australia', 91),
 ('Ireland', 88),
 ('India', 82),
 ('California', 73),
 ('South Africa', 72),
 ('China', 60),
 ('Sydney', 60),
 ('Germany', 49),
 ('perth', 47),
 ('Texas', 46),
 ('Melbourne', 45),
 ('Brisbane', 43),
 ('England', 40),
 ('Florida', 38),
 ('The Netherlands', 34),
 ('Thailand', 33),
 ('Philippines', 31),
 ('UK', 31),
 ('Denmark', 28),
 ('Switzerland', 27),
 ('Sweden', 27),
 ('United Arab Emirates', 27),
 ('France', 27),
 ('New York', 27),
 ('Kuala Lumpur', 24),
 ('Italy', 24),
 ('Vietnam', 22),
 ('WA', 21),
 ('Washington', 21),
 ('Perth Australia', 21),
 ('australia', 21),
 ('Perth WA', 21),
 ('Spain', 19),
 ('Perth Western Australia', 17),
 ('singapore', 16),
 ('Norway', 16),
 ('Ohio', 15),
 ('United States', 15),
 ('Finland', 15),
 ('Belgium', 15),
 ('North Carolina', 15),
 ('

In [33]:
# Count reviews per month of experience ('date_of_experience' holds 'MM/YYYY' strings).
d = t.r_df[['date_of_experience', 'attr_id']].groupby(['date_of_experience']).count().reset_index()
d = d.dropna(subset=['date_of_experience'])

# groupby sorts the 'MM/YYYY' labels as text (01/2018, 01/2019, 02/2018, ...),
# which would put the bars out of chronological order. Reorder the rows by the
# parsed month while keeping the original string labels; labels that fail to
# parse become NaT and are placed last.
d = (d.assign(_month=pd.to_datetime(d['date_of_experience'], format='%m/%Y', errors='coerce'))
      .sort_values('_month', kind='mergesort')
      .drop(columns='_month'))

data = [go.Bar(x=d.date_of_experience, y=d.attr_id)]

In [34]:
# Render the reviews-per-month bar trace inline.
iplot(data)

In [117]:
# Build a scattertext corpus from `df`, using 'gender' as the category column
# and 'text' as the text column.
# NOTE(review): neither `df` nor `nlp` is defined in the visible cells (the
# spacy import at the top is commented out) — confirm where they come from,
# otherwise this cell fails on a fresh kernel.
corpus = st.CorpusFromPandas(df, category_col='gender', text_col='text', nlp=nlp).build()

In [118]:
# Term-frequency table of the corpus, extended with one scaled F-score column
# per gender category ('f' -> 'females', 'm' -> 'males').
term_freq_df = corpus.get_term_freq_df()
for score_column, gender_code in (('females', 'f'), ('males', 'm')):
    term_freq_df[score_column] = corpus.get_scaled_f_scores(gender_code)

In [119]:
# Ten terms with the highest 'females' score.
female_ranked = term_freq_df.sort_values(by='females', ascending=False)
pprint(female_ranked.head(10).index.tolist())

['lovely',
 'gorgeous',
 'a lovely',
 'children',
 'love',
 'play',
 'enjoyed',
 'loved',
 'a picnic',
 'picnic']


In [120]:
# Ten terms with the highest 'males' score.
male_ranked = term_freq_df.sort_values(by='males', ascending=False)
pprint(male_ranked.head(10).index.tolist())

['cbd',
 'world',
 'the world',
 'australia',
 'great views',
 'coins',
 'centre',
 'a good',
 'down',
 'excellent']


In [122]:
# Produce the scattertext explorer HTML for the corpus, with 'f' as the
# category (labelled 'female') against the rest (labelled 'male'); the gender
# column of `df` is passed as per-document metadata.
# NOTE(review): `df` is not defined in the visible cells — confirm its origin.
html = st.produce_scattertext_explorer(corpus, category='f',category_name='female', 
                                       not_category_name='male', width_in_pixels=1000, metadata=df['gender'])

  names=['word', 'background'])


In [124]:
# Save the explorer page as UTF-8 bytes; the context manager guarantees the
# file is flushed and closed, which the bare open(...).write(...) did not.
with open("kings_park_viz.html", 'wb') as viz_file:
    bytes_written = viz_file.write(html.encode('utf-8'))

# keep the byte count as the cell's displayed result
bytes_written

3011462

In [78]:

# Plot the frame returned by cf.datagen.lines() as a scatter figure: the
# DataFrame's cufflinks .iplot(asFigure=True) returns the figure and the
# plotly iplot() renders it.
# NOTE(review): the recorded output of this cell is a ValueError rejecting
# 'pearl02' as layout.legend.bgcolor — presumably a cufflinks/plotly version
# mismatch; confirm the installed versions before relying on this cell.
iplot(cf.datagen.lines().iplot(asFigure=True,
                               kind='scatter',xTitle='Dates',yTitle='Returns',title='Returns'))

ValueError: 
    Invalid value of type 'builtins.str' received for the 'bgcolor' property of layout.legend
        Received value: 'pearl02'

    The 'bgcolor' property is a color and may be specified as:
      - A hex string (e.g. '#ff0000')
      - An rgb/rgba string (e.g. 'rgb(255,0,0)')
      - An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
      - An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
      - A named CSS color:
            aliceblue, antiquewhite, aqua, aquamarine, azure,
            beige, bisque, black, blanchedalmond, blue,
            blueviolet, brown, burlywood, cadetblue,
            chartreuse, chocolate, coral, cornflowerblue,
            cornsilk, crimson, cyan, darkblue, darkcyan,
            darkgoldenrod, darkgray, darkgrey, darkgreen,
            darkkhaki, darkmagenta, darkolivegreen, darkorange,
            darkorchid, darkred, darksalmon, darkseagreen,
            darkslateblue, darkslategray, darkslategrey,
            darkturquoise, darkviolet, deeppink, deepskyblue,
            dimgray, dimgrey, dodgerblue, firebrick,
            floralwhite, forestgreen, fuchsia, gainsboro,
            ghostwhite, gold, goldenrod, gray, grey, green,
            greenyellow, honeydew, hotpink, indianred, indigo,
            ivory, khaki, lavender, lavenderblush, lawngreen,
            lemonchiffon, lightblue, lightcoral, lightcyan,
            lightgoldenrodyellow, lightgray, lightgrey,
            lightgreen, lightpink, lightsalmon, lightseagreen,
            lightskyblue, lightslategray, lightslategrey,
            lightsteelblue, lightyellow, lime, limegreen,
            linen, magenta, maroon, mediumaquamarine,
            mediumblue, mediumorchid, mediumpurple,
            mediumseagreen, mediumslateblue, mediumspringgreen,
            mediumturquoise, mediumvioletred, midnightblue,
            mintcream, mistyrose, moccasin, navajowhite, navy,
            oldlace, olive, olivedrab, orange, orangered,
            orchid, palegoldenrod, palegreen, paleturquoise,
            palevioletred, papayawhip, peachpuff, peru, pink,
            plum, powderblue, purple, red, rosybrown,
            royalblue, saddlebrown, salmon, sandybrown,
            seagreen, seashell, sienna, silver, skyblue,
            slateblue, slategray, slategrey, snow, springgreen,
            steelblue, tan, teal, thistle, tomato, turquoise,
            violet, wheat, white, whitesmoke, yellow,
            yellowgreen