In [10]:
# import spacy
import scattertext as st
import json
import pandas as pd
from pprint import pprint
from collections import Counter, defaultdict
from itertools import chain
import re

import arrow 

import googlemaps

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go

import cufflinks as cf

from gender import GenderDetector

In [11]:
init_notebook_mode(connected=True)

In [12]:
class T:
    
    def __init__(self):

        self.r = json.load(open('data/reviews_brisbane.json'))
        self.u = json.load(open('data/users_brisbane.json'))
        self.a = json.load(open('data/attractions_brisbane.json'))
        
        # convert everything to pandas
        
        self.u_df = pd.DataFrame(self.u).dropna(subset=['name'])
        self.r_df = pd.DataFrame(self.r).dropna(subset=['attr_id'])
              
        self.u_all_tags = [tg for tg in chain.from_iterable(self.u_df['tags'])]
        self.tag_cols = list(set(self.u_all_tags))
              
        self.gmaps = googlemaps.Client(key=open('creds/geocoding_api.key').readline().strip())
        
        self.gd = GenderDetector()
        
    def _tags_to_cols(self, tag_list):
        
        if not tag_list:
            return [None]*len(self.tag_cols)
        
        return ['yes' if tag in tag_list else 'no' for tag in self.tag_cols]
    
    def tags_to_cols(self):
        
        self.u_df = pd.concat([self.u_df, 
                            pd.DataFrame(self.u_df['tags'].apply(self._tags_to_cols).to_list(), 
                                         columns=self.tag_cols)], axis=1).drop('tags', axis=1)
        
        return self

              
    def _fix_location(self, s):
              
        """
        using Google Geocoding API to clarify users location
        """
        
        loc = dict()
        
        if not (isinstance(s, str) and s.strip()):
            return loc
        
        geocode_result = self.gmaps.geocode(s)
        
        # take only the top result
        if geocode_result:
            res = geocode_result[0]
        else:
            print(f'geocoding api can\'t find this location: {s}!')
            return loc

        try:
            loc = {'location': res['formatted_address'],
                   'coordinates': res['geometry']['location'],
                   'locality': ''.join([_['long_name'] for _ in res['address_components'] if 'locality' in _['types']]),
                   'country': ''.join([_['long_name'] for _ in res['address_components'] if 'country' in _['types']])}
        except:
            print('locationd fields couldn\'t be retrieved from geocoding result!')
              
        return loc
    
    def impute_location(self):
        
        localities = []
        countries = []
        
        for row in self.u_df.iterrows():
            
            r = self._fix_location(row[1].location) 
            
            if 'country' in r:
                countries.append(r['country'])
            elif 'australia' in set(row[1].location.lower().split()):
                countries.append('australia')
            elif 'england' in set(row[1].location.lower().split()):
                countries.append('united kingdom')
            else:
                countries.append(None)
            if 'locality' in r:
                localities.append(r['locality'])
            else:
                localities.append(None)
        
        self.u_df['locality'] = localities
        self.u_df['country'] = countries
        
        return self
    
    def _impute_gender(self, s):
        
        # separate nicknames like TrevorJ into trevor J; or Mike23 into Mike 23
        s = re.sub(r'([a-z]{1})([A-Z0-9]+)', r'\1 \2', s)
        
        return self.gd.gender(s)
    
    def impute_gender(self):
        
        self.u_df['gender'] = self.u_df['gender'] \
                                .apply(lambda s: s if str(s) in 'm f'.split() else self._impute_gender(str(s)))
        return self
    
    def merge_data(self):
        
        self.data = self.r_df.join(self.u_df.set_index('name'), on='by_user', how='inner')
        self.data.to_csv('data/brisb.csv', index=False)
        
        return self

In [13]:
if __name__ == '__main__':
    
    t = T().tags_to_cols().merge_data()

In [7]:
t.data.head()

Unnamed: 0,attr_id,by_user,date_of_experience,date_of_writing,id,rating,text,title,age,gender,...,vegetarian,urban explorer,nightlife seeker,eco-tourist,shopping fanatic,thrifty traveller,backpacker,peace and quiet seeker,foodie,trendsetter
0,d256511,Gilliebean63,07/2018,13/02/2019,651830536,5.0,"Easy access by bus, or a boat which leaves fro...",Koala's and kangaroos,,,...,,,,,,,,,,
12486,d1206454,Gilliebean63,11/2018,13/02/2019,651832420,5.0,Great place to take visitors to our fair city....,Brisbane ity views,,,...,,,,,,,,,,
1,d256511,Rob I,02/2019,13/02/2019,651820141,4.0,This is a nice little place to visit and the s...,Nice place but poor Koalas,50-64,m,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,d256511,Julio974T,01/2019,13/02/2019,651817806,5.0,Great park-zoo with all the Australian animals...,Australian fauna close the city,,,...,,,,,,,,,,
3,d256511,PooleCollectors,09/2018,13/02/2019,651758533,5.0,You can even have your photo taken with a Koal...,Wonderful caring place,,,...,,,,,,,,,,


In [14]:
Counter(t.u_all_tags).most_common()

[('foodie', 1543),
 ('like a local', 1322),
 ('urban explorer', 956),
 ('nature lover', 822),
 ('art and architecture lover', 718),
 ('history buff', 665),
 ('60+ traveller', 528),
 ('peace and quiet seeker', 503),
 ('family holiday maker', 403),
 ('thrill seeker', 401),
 ('beach goer', 377),
 ('luxury traveller', 374),
 ('thrifty traveller', 342),
 ('shopping fanatic', 333),
 ('eco-tourist', 153),
 ('vegetarian', 148),
 ('nightlife seeker', 113),
 ('backpacker', 76),
 ('trendsetter', 43)]

In [102]:
geocode_result = t.gmaps.geocode('Yorkshire')

In [15]:
def selector(dk):

    """
    dk is {'column': value}
    """
    
    if not (set(dk) <= set(t.data.columns)):
        print('wrong segments!')
        raise Exception()
        
    out = t.data
    
    for k in dk:
        out = out[out[k] == dk[k]]
        
    if not out.empty:
        return out
    else:
        print('empty result!')
        raise Exception()

In [58]:
female_lux_df = selector({'gender': 'f', 'luxury traveller': 1})
male_foodie_df = selector({'gender': 'm', 'foodie': 1})

In [59]:
d1 = female_lux_df[['date_of_experience', 'id']].groupby(['date_of_experience']).count().reset_index()
d1['date_of_experience'] = d1['date_of_experience'].apply(lambda x: arrow.get(x, 'MM/YYYY'))
d2 = male_foodie_df[['date_of_experience', 'id']].groupby(['date_of_experience']).count().reset_index()
d2['date_of_experience'] = d2['date_of_experience'].apply(lambda x: arrow.get(x, 'MM/YYYY'))

In [90]:
d1_scatter = go.Scatter(x=d1.date_of_experience, 
                            y=d1.id, mode='markers', 
                               marker=dict(size=12, line=dict(width=0),color="orange"),
                                name='Luxury Females', text='pidgeons diamonds ticket cruise nice',)

d2_scatter = go.Scatter(x=d2.date_of_experience, 
                            y=d2.id, mode='markers', 
                               marker=dict(size=12, line=dict(width=0),color="#2C72EC"),
                                name='Foodie Males', text='pizza expensive rip-off sucks chips',)

fig_data = [d1_scatter, d2_scatter]

layout_both = go.Layout(
                title='Reviews for Sydney Opera House',
                hovermode='closest',
                    xaxis=dict(title='date', ticklen=5, zeroline=False, gridwidth=2),
                    yaxis=dict(title='# reviews', ticklen=5, gridwidth=2),
                    )

fig = go.Figure(data=fig_data, layout=layout_both)

In [91]:
iplot(fig)

In [None]:
import spacy
nlp = spacy.load('en')

In [43]:
d = t.data[t.data['vegetarian'].notnull()]
d['vegetarian'] = d['vegetarian'].apply(lambda x: 'yes' if int(x) == 1 else 'no' if int(x) == 0 else None)

In [45]:
corpus = st.CorpusFromPandas(d, 
                             category_col='vegetarian', 
                             text_col='text', nlp=nlp).build()

In [46]:
list(corpus.get_scaled_f_scores_vs_background().index[:10])


read_table is deprecated, use read_csv instead, passing sep='\t'.



['koalas',
 'citycat',
 'koala',
 'kangaroos',
 'southbank',
 'coot',
 'brisbane',
 'parklands',
 'emus',
 'cootha']

In [47]:
term_freq_df = corpus.get_term_freq_df()

In [48]:
term_freq_df['vegetarians'] = corpus.get_scaled_f_scores('yes')
term_freq_df['non-vegetarians'] = corpus.get_scaled_f_scores('no')

In [49]:
pprint(list(term_freq_df.sort_values(by='vegetarians', ascending=False).index[:10]))

['anything',
 'hold the',
 'would be',
 'i do',
 'children',
 'every',
 'train',
 'a few',
 'town',
 'have to']


In [50]:
pprint(list(term_freq_df.sort_values(by='non-vegetarians', ascending=False).index[:10]))

['little',
 'lots',
 'lots of',
 'much',
 'all the',
 'when',
 'small',
 'what',
 'southbank',
 'restaurant']


In [53]:
html = st.produce_scattertext_explorer(corpus, category='vegetarian',category_name='yes', width_in_pixels=1000)

AssertionError: 

In [124]:
open("veg.html", 'wb').write(html.encode('utf-8'))

3011462

In [78]:

iplot(cf.datagen.lines().iplot(asFigure=True,
                               kind='scatter',xTitle='Dates',yTitle='Returns',title='Returns'))

ValueError: 
    Invalid value of type 'builtins.str' received for the 'bgcolor' property of layout.legend
        Received value: 'pearl02'

    The 'bgcolor' property is a color and may be specified as:
      - A hex string (e.g. '#ff0000')
      - An rgb/rgba string (e.g. 'rgb(255,0,0)')
      - An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
      - An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
      - A named CSS color:
            aliceblue, antiquewhite, aqua, aquamarine, azure,
            beige, bisque, black, blanchedalmond, blue,
            blueviolet, brown, burlywood, cadetblue,
            chartreuse, chocolate, coral, cornflowerblue,
            cornsilk, crimson, cyan, darkblue, darkcyan,
            darkgoldenrod, darkgray, darkgrey, darkgreen,
            darkkhaki, darkmagenta, darkolivegreen, darkorange,
            darkorchid, darkred, darksalmon, darkseagreen,
            darkslateblue, darkslategray, darkslategrey,
            darkturquoise, darkviolet, deeppink, deepskyblue,
            dimgray, dimgrey, dodgerblue, firebrick,
            floralwhite, forestgreen, fuchsia, gainsboro,
            ghostwhite, gold, goldenrod, gray, grey, green,
            greenyellow, honeydew, hotpink, indianred, indigo,
            ivory, khaki, lavender, lavenderblush, lawngreen,
            lemonchiffon, lightblue, lightcoral, lightcyan,
            lightgoldenrodyellow, lightgray, lightgrey,
            lightgreen, lightpink, lightsalmon, lightseagreen,
            lightskyblue, lightslategray, lightslategrey,
            lightsteelblue, lightyellow, lime, limegreen,
            linen, magenta, maroon, mediumaquamarine,
            mediumblue, mediumorchid, mediumpurple,
            mediumseagreen, mediumslateblue, mediumspringgreen,
            mediumturquoise, mediumvioletred, midnightblue,
            mintcream, mistyrose, moccasin, navajowhite, navy,
            oldlace, olive, olivedrab, orange, orangered,
            orchid, palegoldenrod, palegreen, paleturquoise,
            palevioletred, papayawhip, peachpuff, peru, pink,
            plum, powderblue, purple, red, rosybrown,
            royalblue, saddlebrown, salmon, sandybrown,
            seagreen, seashell, sienna, silver, skyblue,
            slateblue, slategray, slategrey, snow, springgreen,
            steelblue, tan, teal, thistle, tomato, turquoise,
            violet, wheat, white, whitesmoke, yellow,
            yellowgreen