In [74]:
import json

import pandas as pd
import spacy

import time

import numpy as np

from collections import Counter, defaultdict
from itertools import chain
import re

import arrow 

import googlemaps

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go

import cufflinks as cf

import scattertext as st
from scattertext import word_similarity_explorer

from gender import GenderDetector

In [39]:
nlp = spacy.load('en')

nlp.Defaults.stop_words |= {'probably'}
STOPWORDS = nlp.Defaults.stop_words

In [40]:
init_notebook_mode(connected=True)

In [135]:
class T:
    
    def __init__(self, review_file, users_file, attract_file):
        
        """
        collected TripAdvisor data comes as JSONs; this class does some data processing including imputation
        """

        self.r = json.load(open(review_file))
        self.u = json.load(open(users_file))
        self.a = json.load(open(attract_file))  

        # convert everything to pandas
        self.u_df = pd.DataFrame(self.u).dropna(subset=['name'])
        self.r_df = pd.DataFrame(self.r).dropna(subset=['attr_id']).dropna(subset=['text'])
        
        self.attribute_encodings = json.load(open('data/attribute_encodings.json'))
        
        self.attribute_encodings_rev = defaultdict(lambda: defaultdict(str))
        
        for attr in self.attribute_encodings:
            self.attribute_encodings_rev[attr] = {s: i for i, s in self.attribute_encodings[attr].items()}
            
        self.countries = json.load(open('data/countries.json'))
        self.KEY_COUNTRIES = [line.lower().strip() for line in open('data/key_countries.txt').readlines() 
                                              if line.lower().strip()]
        
        self.gmaps = googlemaps.Client(key=open('creds/geocoding_api.key').readline().strip())
        
        self.gd = GenderDetector()
        
    
    def drop_unusable(self, subs=['text']):
        
        self.r_df = self.r_df.dropna(subset=subs)
        
        return self
    
    def review_attributes(self):
        
        
        self.genders = ['all'] + [_.lower() for _ in set(self.u_df['gender']) if _.lower() in 'm f'.split()]
        self.age_groups = ['all'] + sorted([ag for ag in set(self.u_df['age']) if '-' in str(ag)], 
                                               key=lambda x: int(str(x).split('-')[0]))
        
        self.tags = list(set([tg for tg in chain.from_iterable(self.u_df['tags'])]))
        
        self.tourist_types = ['all'] + sorted(self.tags)
        
        return self
    
    
    def stats(self):
        
        self.user_stats = defaultdict(list)
        
        self.review_ids = set()
        self.attr_ids = set()
        self.user_names = set()
        self.dates_exp = set()
        
        for r in self.r:
            
            self.review_ids.add(r['id'])
            self.attr_ids.add(r['attr_id'])
            self.user_names.add(r['by_user'])
            
            if r['date_of_experience']:
                self.dates_exp.add(arrow.get(r['date_of_experience'], 'MM/YYYY'))
        
        print(f'DATA\n{"".join(["-"]*4)}')
        print('{:,} reviews written between {} and {} for {:,} attractions by {:,} users' \
                  .format(len(self.review_ids), 
                          min(self.dates_exp).format("MM/YYYY"), 
                          max(self.dates_exp).format("MM/YYYY"), 
                          len(self.attr_ids), 
                          len(self.user_names)))
        
        for u in self.u:
            for attr in 'tags age gender name'.split():   
                if u[attr]:
                    self.user_stats[attr].append(u[attr])
        
        print('user attribute availability:')
        print(' ~ '.join(['{}: {:,} ({:.1f})%'.format(attr, 
                                                    len(self.user_stats[attr]), 
                                                    100*len(self.user_stats[attr])/len(self.user_stats['name'])) 
                                                       for attr in 'tags age gender'.split()]))
              
        return self
        
    def _tags_to_cols(self, tag_list):
        
        if not tag_list:
            return [None]*len(self.tag_cols)
        
        return ['yes' if tag in tag_list else 'no' for tag in self.tag_cols]
    
    def tags_to_cols(self):
        
        self.u_df = pd.concat([self.u_df, 
                            pd.DataFrame(self.u_df['tags'].apply(self._tags_to_cols).to_list(), 
                                         columns=self.tag_cols)], axis=1).drop('tags', axis=1)
        
        return self

              
    def _fix_location(self, s):
              
        """
        using Google Geocoding API to clarify users location
        """
        
        loc = dict()
        
        if not (isinstance(s, str) and s.strip()):
            print('geocoding API needs a string argument!')
            return loc
        
        geocode_result = self.gmaps.geocode(s)
        
        # take only the top result
        if geocode_result:
            res = geocode_result[0]
        else:
            print(f'geocoding api can\'t find this location: {s}!')
            return loc
        
        if 'address_components' in res:
            for _ in res['address_components']:
                if 'country' in _['types']:
                    loc.update({'country': _['long_name']})
                if 'locality' in _['types']:
                    loc.update({'locality': _['long_name']})
        if 'formatted_address' in res:
            loc.update({'location': res['formatted_address']})
        
        try:
            loc.update({'coordinates': res['geometry']['location']})
        except:
            pass
        
        if not loc:
            print('locationd fields couldn\'t be retrieved from geocoding result!')
                               
        return loc

    
    def impute_location(self):
        
        print('imputing country...', end=' ')
              
        t0 = time.time()
              
        localities = []
        countries = []
    
        c_geo = 0
              
        in_str = lambda s1, s2: ' ' + s1 + ' ' in ' ' + s2 + ' '
        
        for i, row in enumerate(self.u_df.iterrows(), 1):
                               
            users_country = None
              
            if isinstance(row[1].location, str):
              
                loc_str = ' '.join(re.sub(r'[\-\_]', ' ', row[1].location).split()).lower()

                _found_countries = set()

                for country in self.countries:
              
                    if in_str(country['name'].lower(), loc_str):
                        _found_countries.add(country['name'].lower())
              
                    alt_names = country.get('other_names', None)
              
                    if alt_names:
                          for alt_name in alt_names:
                              if in_str(alt_name, loc_str):
                                  _found_countries.add(alt_name.lower())

                if len(_found_countries) == 1:
                    users_country = _found_countries.pop()
                else:
#                   # run geolocation
#                   r = self._fix_location(loc_str)
#                   c_geo += 1
              
#                   if 'country' in r:
#                      users_country = r['country'].lower()
              
                  users_country = None
            
#             print(f'#{i}: location: {row[1].location} -> country: {users_country}')
            
            countries.append(users_country)
                               
        self.u_df['country'] = [c if (not c) or (c in self.KEY_COUNTRIES) else 'other' for c in countries]
        
        m, s = divmod(time.time() - t0, 60)
              
        print(f'done. elapsed time: {m:.0f}:{s:.0f}')
              
#         print(f'ran geolocation {c_geo} times ({100*c_geo/len(self.u_df):.1f}%)')
        
        return self
    
    def _impute_gender(self, s):
        
        if not isinstance(s, str):
              return None
              
        # separate nicknames like TrevorJ into trevor J; or Mike23 into Mike 23
        s = re.sub(r'([a-z]{1})([A-Z0-9]+)', r'\1 \2', s)
        
        return self.gd.gender(s)
    
    def impute_gender(self):
        
        print('imputing gender...', end=' ')
              
        t0 = time.time()
        
        avail_msk = self.u_df['gender'].str.lower().isin(self.genders)
        
        tot_users = len(set(self.u_df['name']))
              
        g_avail_bf = len(set(self.u_df[avail_msk]['name']))
        
        av = self.u_df[avail_msk]
        nav = self.u_df[~avail_msk]
        
        nav['gender'] = nav['name'].apply(self._impute_gender)
              
        self.u_df = pd.concat([av, nav])
              
        m, s = divmod(time.time() - t0, 60)
              
        print(f'done. elapsed time: {m:.0f}:{s:.0f}')
              
        g_avail_af = len(set(self.u_df[self.u_df['gender'].str.lower().isin(self.genders)]['name']))
              
        print(f'availability +{100*g_avail_af/g_avail_bf - 100:.1f}%: now {g_avail_af:,} users ({100*g_avail_af/tot_users:.1f}%) was {g_avail_bf:,} ({100*g_avail_bf/tot_users:.1f}%)')
              
        return self
    
    def merge_data(self):
        
        self.data = self.r_df.join(self.u_df.set_index('name'), on='by_user', how='inner')
        
        return self
    
    def selector(self, req_dict):

        """
        return a data frame obtained from the original one (df) by filtering out all rows that don't match
        the required values provided in the dictionary req_dict which looks like, for example, 
        {'age': '13-17', 'gender': 'f',...}

        what if after all the filtering all that's left is an empty data frame? then just return that empty data frame
        """

        if self.data.empty:
            print('dataframe you\'re trying to select from is empty!')
            return self

        actual_cols = set(self.data.columns) | {'tourist_type'}
        required_cols = set(req_dict)

        if not (required_cols <= actual_cols):
            cols_na = ', '.join(required_cols - actual_cols)
            raise Exception(f'column(s) {cols_na} you\'re asking for are not available!')

        out = self.data

        for col in required_cols:

            if req_dict[col] != 'all':

                if col != 'tourist_type':
                    out = out[out[col].astype(str) == req_dict[col]]
                else:
                    out = out[out[req_dict[col]] == 'yes']
                if out.empty:
                    print('dataframe you\'re trying to select from became empty!')
                    break

        return out
              
    def prepr_(self, review_text):
              
        if not review_text.strip():
              return None
              
        review_ = defaultdict(list)
              
        review_['original'] = review_text
        
        doc = nlp(review_text)
              
        review_['ents'] = [e.text for e in doc.ents]
        review_['labels'] = [e.label_ for e in doc.ents]
              
        doc = nlp(review_text.lower())
        
        review_['lemmatised'] = [v for v in ['$' if w.is_currency else w.lemma_ for w in doc] if v.isalpha() and (len(v) > 1)]

        review_['nouns'] = [w.lemma_ for w in doc if w.pos_ == 'NOUN']
        review_['verbs'] = [w.lemma_ for w in doc if w.pos_ == 'VERB']
        
        return review_
              
    def get_textdata(self, seg1_dict, seg2_dict, min_frq=5):
        
        t0 = time.time()
              
        print('s1=', seg1_dict)
        print('s2=', seg2_dict)
        
        eligible = []
        seg_dfs = []
              
        for i, s in enumerate([seg1_dict, seg2_dict], 1):
              
              s_df = self.selector(s)
              
              if s_df.empty or (len(set(s_df['by_user'])) < 50):
                  print(f'not enough data for segment {i}')
                  eligible.append(0)
              else:
                  seg_dfs.append(s_df)
                  eligible.append(1)
        
        print(eligible)
              
        if not all(eligible):
              return self
        else:
              print('both good')
        
        t = []
              
        for i, df in enumerate(seg_dfs, 1):
              
            t_ = df['text'].apply(lambda x: ' '.join(self.prepr_(x)['lemmatised']))

            t.append(
                pd.concat(
                    [
                        pd.Series(['seg' + str(i)]*len(df)),
                        df['text'].apply(lambda x: ' '.join(self.prepr_(x)['lemmatised']))
                    ], axis=1, ignore_index=True
                    )
                  ).rename(columns={0: 'segment', 1: 'text'})
              
        print(t[0].head(3))

            
              
              
#         self.text_data = pd.concat([seg1_df[['lemmatised', 'segment']], 
#                                     seg2_df[['lemmatised', 'segment']]])
        
#         corpus = st.CorpusFromPandas(self.text_data, 
#                                      category_col='segment', 
#                                      text_col='lemmatised', 
#                                      nlp=nlp).build()
              
#         self.freq_data = corpus.get_term_freq_df().rename(columns={'seg1 freq': 'seg1_frq', 'seg2 freq': 'seg2_frq'})
#         self.freq_data['s1_score'] = corpus.get_scaled_f_scores('seg1')
#         self.freq_data['s2_score'] = corpus.get_scaled_f_scores('seg2')
              
#         # impose min frequency
#         self.freq_data = self.freq_data[(self.freq_data['seg1_frq'] >= min_frq) & (self.freq_data['seg2_frq'] >= min_frq)]
#         print(f'{len(self.freq_data):,} words occur at least {min_frq} times')
              
#         sc = np.vectorize(lambda s1, s2: 2*(-0.5+(s1 if s1>s2 else 1-s2 if s2>s1 else 0)))
              
#         self.freq_data['nfsc'] = sc(self.freq_data['s1_score'], self.freq_data['s2_score'])
              
#         # scale frequencies
#         seg1_frq_min, seg1_frq_max = self.freq_data['seg1_frq'].min(), self.freq_data['seg1_frq'].max()
#         seg2_frq_min, seg2_frq_max = self.freq_data['seg2_frq'].min(), self.freq_data['seg2_frq'].max()
              
#         self.freq_data['seg1_frq_sc'] = (self.freq_data['seg1_frq'] - seg1_frq_min)/(seg1_frq_max - seg1_frq_min)
#         self.freq_data['seg2_frq_sc'] = (self.freq_data['seg2_frq'] - seg2_frq_min)/(seg2_frq_max - seg2_frq_min)
              
#         # filename to save as .csv
#         fn_ = ['tdf']
        
#         for i, sdik in enumerate([seg1_dict, seg2_dict], 1):
              
#               fn_.append('-seg' + str(i) + '-')  # so now ['textdf', '-seg1-']
            
#               for attr in 'age gender tourist_type country'.split():
                    
#                    fn_.append(attr[0]) # so now ['textdf', 'seg1', 'a']
#                    # self.attribute_encodings_rev is like 'age': {'13-17': '1'},..
#                    fn_.append(self.attribute_encodings_rev[attr].get(sdik.get(attr, 'all'), 'all'))  # ['textdf', '-seg1-', 'a', '2']
        
#         fn = 'data/' + ''.join(fn_) + '.csv'

#         self.freq_data.to_csv(fn)
              
#         m, s = divmod(time.time() - t0, 60)
              
#         print(f'done. savet to {fn}. elapsed time: {m:.0f}:{s:.0f}')
        
        return self
    
    def generate_text_data(self):
        
        s1s = [{'age': ag, 'gender': g} for g in self.genders for ag in self.age_groups]
        s2s = [{'age': ag, 'gender': g} for g in self.genders for ag in self.age_groups]
        
        for s1 in s1s:
              for s2 in s2s:
                  self.get_textdata(s1, s2)
              
        return self
        

In [None]:
if __name__ == '__main__':
    
    t = T(review_file='mlb/revs.json',
         users_file='mlb/usrs.json',
         attract_file='mlb/attractions_melbourne.json').drop_unusable().review_attributes() \
                .stats().tags_to_cols().merge_data().generate_text_data()
#                 .impute_location() \
#                 .impute_gender() \
#                 .merge_data() \
                

DATA
----
22,264 reviews written between 01/2013 and 03/2019 for 67 attractions by 13,813 users
user attribute availability:
tags: 3,148 (22.8)% ~ age: 4,851 (35.1)% ~ gender: 6,261 (45.3)%
s1= {'age': 'all', 'gender': 'all'}
s2= {'age': 'all', 'gender': 'all'}
[1, 1]
both good


In [85]:
t.tourist_types

['all',
 '60+ traveller',
 'art and architecture lover',
 'backpacker',
 'beach goer',
 'eco-tourist',
 'family holiday maker',
 'foodie',
 'history buff',
 'like a local',
 'luxury traveller',
 'nature lover',
 'nightlife seeker',
 'peace and quiet seeker',
 'shopping fanatic',
 'thrifty traveller',
 'thrill seeker',
 'trendsetter',
 'urban explorer',
 'vegetarian']

In [44]:
TAKE = 300

df = t.freq_data.sort_values('seg1_frq', ascending=False)

s1_tb, s1_tu = df['seg1_frq_sc'].quantile(q=[0.20, 0.95])
s2_tb, s2_tu = df['seg2_frq_sc'].quantile(q=[0.20, 0.95])

df = df[((df['seg1_frq_sc'] < s1_tu) & (df['seg1_frq_sc'] > s1_tb)) & 
        ((df['seg2_frq_sc'] < s2_tu) & (df['seg2_frq_sc'] > s2_tb))].iloc[:TAKE]

colorscale=[[0, 'orange'], [1, '#2C72EC']]

layout= go.Layout(
#     title= 'Characteristic Words',
    hovermode= 'closest',
    xaxis= dict(
        title='Frequency in Reviews by Seg 1',
        ticklen= 5,
        tickmode='array',
        tickvals=np.linspace(df['seg1_frq_sc'].min(), df['seg1_frq_sc'].max(), num=5),
        ticktext=['low', '', 'medium', '', 'high'],
        zeroline= False,
        gridwidth= 2,
        showticklabels=True,
        showgrid=True,
    ),
    yaxis=dict(
        ticklen= 5,
        tickmode='array',
        tickvals=np.linspace(df['seg2_frq_sc'].min(), df['seg2_frq_sc'].max(), num=5),
        ticktext=['low', '', 'medium', '', 'high'],
        gridwidth= 2,
        zeroline=False,
        showticklabels=True,
        showgrid=True,
        tickangle=-90,
        title='Frequency in Reviews by Seg 2',
    ),
    legend=dict(orientation="h", x=0.5, y=1.1, yanchor="top"),
    annotations=[dict(text='Stronger Association with ', x=0.41, y=1.08, 
            showarrow=False, 
            xref="paper",
            yref="paper",
            yanchor="top"
                     )],
    showlegend= True
)

df_neg = df[df['nfsc']<0]
df_pos = df[df['nfsc']>0]

trace0 = go.Scatter(
    name='Seg1',
    x = df_neg['seg1_frq_sc'],
    y = df_neg['seg2_frq_sc'],
    mode = 'markers',
    hoverinfo='text', 
    marker=dict(
                color='orange', 
                size=10,
                opacity=0.85,
               ),
    text= df_neg.index)

trace1 = go.Scatter(
    name='Seg2',
    x = df_pos['seg1_frq_sc'],
    y = df_pos['seg2_frq_sc'],
    mode = 'markers',
    hoverinfo='text', 
    marker=dict(
                color='#2C72EC', 
                size=10, 
                opacity=0.85,
               ),
    text= df_pos.index)


fig= go.Figure(data=[trace0, trace1], layout=layout)

AttributeError: 'T' object has no attribute 'freq_data'

In [None]:
df.head(3)

In [None]:
iplot(fig)

In [86]:
t.data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22264 entries, 0 to 22262
Data columns (total 32 columns):
attr_id                       22264 non-null object
by_user                       22261 non-null object
date_of_experience            22160 non-null object
date_of_writing               22264 non-null object
id                            22264 non-null object
rating                        22264 non-null float64
text                          22263 non-null object
title                         22264 non-null object
age                           8442 non-null object
gender                        16569 non-null object
location                      19374 non-null object
real_name                     21565 non-null object
like a local                  6560 non-null object
family holiday maker          6560 non-null object
thrill seeker                 6560 non-null object
shopping fanatic              6560 non-null object
nature lover                  6560 non-null object
art and arch

In [87]:
corpus = st.CorpusFromPandas(t.data, 
...                              category_col='gender', 
...                              text_col='text',
...                              nlp=nlp).build()

In [91]:
term_freq_df = corpus.get_term_freq_df()

html = st.produce_scattertext_explorer(corpus, category='m', category_name='m', not_category_name='f', width_in_pixels=1000)


read_table is deprecated, use read_csv instead, passing sep='\t'.



In [92]:
open("melb.html", 'wb').write(html.encode('utf-8'))

7184955