In [1]:
import json

import pandas as pd
import spacy

from collections import Counter, defaultdict
from itertools import chain
import re

import arrow 

import googlemaps

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go

import cufflinks as cf

import scattertext as st
from scattertext import word_similarity_explorer

from gender import GenderDetector

In [2]:
nlp = spacy.load('en')

nlp.Defaults.stop_words |= {'probably'}
STOPWORDS = nlp.Defaults.stop_words

In [3]:
init_notebook_mode(connected=True)

In [10]:
class T:
    
    def __init__(self, review_file, users_file, attract_file, impute_country=False, impute_gender=False):
        
        """
        collected TripAdvisor data comes as JSONs; this class does some data processing including imputation
        """

        self.r = json.load(open(review_file))
        self.u = json.load(open(users_file))
        self.a = json.load(open(attract_file))  

        # convert everything to pandas
        self.u_df = pd.DataFrame(self.u).dropna(subset=['name'])
        
        self.r_df = pd.DataFrame(self.r).dropna(subset=['attr_id'])

        self.tag_cols = list(set([tg for tg in chain.from_iterable(self.u_df['tags'])]))
        
        if impute_country:
            self.gmaps = googlemaps.Client(key=open('creds/geocoding_api.key').readline().strip())
        if impute_gender:
            self.gd = GenderDetector()
    
    def stats(self):
        
        self.user_stats = defaultdict(list)
        
        self.review_ids = set()
        self.attr_ids = set()
        self.user_names = set()
        self.dates_exp = set()
        
        for r in self.r:
            
            self.review_ids.add(r['id'])
            self.attr_ids.add(r['attr_id'])
            self.user_names.add(r['by_user'])
            
            if r['date_of_experience']:
                self.dates_exp.add(arrow.get(r['date_of_experience'], 'MM/YYYY'))
        
        print(f'DATA\n{"".join(["-"]*4)}')
        print('{:,} reviews written between {} and {} for {:,} attractions by {:,} users' \
                  .format(len(self.review_ids), 
                          min(self.dates_exp).format("MM/YYYY"), 
                          max(self.dates_exp).format("MM/YYYY"), 
                          len(self.attr_ids), 
                          len(self.user_names)))
        
        for u in self.u:
            for attr in 'tags age gender name'.split():   
                if u[attr]:
                    self.user_stats[attr].append(u[attr])
        
        print('user attribute availability:')
        print(' ~ '.join(['{}: {:,} ({:.1f})%'.format(attr, 
                                                    len(self.user_stats[attr]), 
                                                    100*len(self.user_stats[attr])/len(self.user_stats['name'])) 
                                                       for attr in 'tags age gender'.split()]))
              
        return self
        
    def _tags_to_cols(self, tag_list):
        
        if not tag_list:
            return [None]*len(self.tag_cols)
        
        return ['yes' if tag in tag_list else 'no' for tag in self.tag_cols]
    
    def tags_to_cols(self):
        
        self.u_df = pd.concat([self.u_df, 
                            pd.DataFrame(self.u_df['tags'].apply(self._tags_to_cols).to_list(), 
                                         columns=self.tag_cols)], axis=1).drop('tags', axis=1)
        
        return self

              
    def _fix_location(self, s):
              
        """
        using Google Geocoding API to clarify users location
        """
        
        loc = dict()
        
        if not (isinstance(s, str) and s.strip()):
            print('geocoding API needs a string argument!')
            return loc
        
        geocode_result = self.gmaps.geocode(s)
        
        # take only the top result
        if geocode_result:
            res = geocode_result[0]
        else:
            print(f'geocoding api can\'t find this location: {s}!')
            return loc
        
        if 'address_components' in res:
            for _ in res['address_components']:
                if 'country' in _['types']:
                    loc.update({'country': _['long_name']})
                if 'locality' in _['types']:
                    loc.update({'locality': _['long_name']})
        if 'formatted_address' in res:
            loc.update({'location': res['formatted_address']})
        
        try:
            loc.update({'coordinates': res['geometry']['location']})
        except:
            pass
        
        if not loc:
            print('locationd fields couldn\'t be retrieved from geocoding result!')
                               
        return loc

    
    def impute_location(self):
        
        self.countries = {_['name'].lower() for _ in json.load(open('data/countries.json'))}
              
        localities = []
        countries = []
    
        c_geo = 0
              
        in_str = lambda s1, s2: ' ' + s1 + ' ' in ' ' + s2 + ' '
        
        for i, row in enumerate(self.u_df.iterrows(), 1):
                               
            users_country = None
              
            if isinstance(row[1].location, str):
              
                loc_str = ' '.join(re.sub(r'[\-\_]', ' ', row[1].location).split()).lower()

                _found_countries = set()

                for country in self.countries:
                    if in_str(country, loc_str):
                        _found_countries.add(country.title())

                if len(_found_countries) == 1:
                    users_country = _found_countries.pop()
                else:
                  # run geolocation
                  r = self._fix_location(loc_str)
                  c_geo += 1
              
                  if 'country' in r:
                     users_country = r['country']
            
            print(f'#{i}: location: {row[1].location} -> country: {users_country}')
            
            countries.append(users_country)
                               
        self.u_df['country'] = countries
              
        print(f'ran geolocation {c_geo} times ({100*c_geo/len(self.u_df):.1f}%)')
        
        return self
    
    def _impute_gender(self, s):
        
        # separate nicknames like TrevorJ into trevor J; or Mike23 into Mike 23
        s = re.sub(r'([a-z]{1})([A-Z0-9]+)', r'\1 \2', s)
        
        return self.gd.gender(s)
    
    def impute_gender(self):
        
        self.u_df['gender'] = self.u_df['gender'] \
                                .apply(lambda s: s if str(s) in 'm f'.split() else self._impute_gender(str(s)))
        return self
    
    def merge_data(self):
        
        self.data = self.r_df.join(self.u_df.set_index('name'), on='by_user', how='inner')
        
        return self
    
    def selector(self, dk):

        if not (set(dk) <= set(t.data.columns)):
            print('wrong segments!')
            raise Exception()
        
        out = t.data
    
        for k in dk:
            out = out[out[k] == dk[k]]
        
        if not out.empty:
            return out
        else:
            print('empty result!')
            raise Exception()
              
    def prepr_(self, review_text):
    
        review_ = defaultdict(list)
        review_['original'] = review_text
        
        doc = nlp(review_text)
              
        review_['ents'] = [e.text for e in doc.ents]
        review_['labels'] = [e.label_ for e in doc.ents]
              
        review_text = review_text.lower()
        doc = nlp(review_text)
              
        review_['lemmatised'] = ['$' if w.is_currency else w.lemma_ for w in doc if not any([w.is_stop, w.is_punct, len(w.lemma_) < 1])]
        review_['nouns'] = [w.lemma_ for w in doc if w.pos_ == 'NOUN']
        review_['verbs'] = [w.lemma_ for w in doc if w.pos_ == 'VERB']
        
        
        return review_

In [11]:
if __name__ == '__main__':
    
    t = T(review_file='data/reviews_melb.json',
         users_file='data/reviewers_melb.json',
         attract_file='data/attractions_melbourne.json').stats().tags_to_cols().merge_data()

DATA
----
9,409 reviews written between 12/2010 and 03/2019 for 35 attractions by 7,213 users
user attribute availability:
tags: 1,762 (24.4)% ~ age: 2,539 (35.2)% ~ gender: 3,231 (44.8)%


In [12]:
t.data.columns

Index(['attr_id', 'by_user', 'date_of_experience', 'date_of_writing', 'id',
       'rating', 'text', 'title', 'age', 'gender', 'location', 'real_name',
       'vegetarian', 'beach goer', 'backpacker', 'foodie', 'trendsetter',
       'art and architecture lover', 'eco-tourist', 'nature lover',
       'like a local', 'thrill seeker', 'luxury traveller',
       'family holiday maker', 'peace and quiet seeker', 'thrifty traveller',
       'history buff', 'shopping fanatic', 'urban explorer',
       'nightlife seeker', '60+ traveller'],
      dtype='object')

In [13]:
print(json.dumps(t.prepr_(t.data.iloc[3].text), indent=4))

{
    "original": "The immigration museum shows very well. It really illustrates the variety of peoples who have come to Australia and their trials and stories. It also brings home the fact that even though different, we all value the same items (family, food, friends, and culture). The museum does a very good job of describing both the high and low points of Australia's treatment of both indigenous peoples and immigrants.",
    "ents": [
        "Australia",
        "Australia"
    ],
    "labels": [
        "GPE",
        "GPE"
    ],
    "lemmatised": [
        "immigration",
        "museum",
        "show",
        "illustrate",
        "variety",
        "people",
        "come",
        "australia",
        "trial",
        "story",
        "bring",
        "home",
        "fact",
        "different",
        "value",
        "item",
        "family",
        "food",
        "friend",
        "culture",
        "museum",
        "good",
        "job",
        "describe",
       

In [8]:
do =  nlp('suthbank koala w apple Mongolia')

In [9]:
[w.label_ for w in do.ents]

['GPE']

In [None]:
ents[:12]

In [None]:
# NER separately
from spacy.pipeline import EntityRecognizer

In [None]:
df1 = df[['text', 'gender']].fillna('unknown')

In [None]:
df1.head()

In [None]:
corpus = st.CorpusFromPandas(df1, category_col='gender', text_col='text', nlp=nlp).build()

In [None]:
# term frequency data frame; note that terms become index!
term_freq_df = corpus.get_term_freq_df()

In [None]:
term_freq_df.head()

In [None]:
term_freq_df.to_csv('word_freq.csv')

In [None]:
# get scaled f-scores for each term and place in new column
term_freq_df['male_score'] = corpus.get_scaled_f_scores('m')
term_freq_df['female_score'] = corpus.get_scaled_f_scores('f')
term_freq_df['unknown_score'] = corpus.get_scaled_f_scores('unknown')

In [None]:
html = st.produce_scattertext_explorer(corpus,
category='foodie',
category_name='Foodies',
not_category_name='Hist Buffs',
width_in_pixels=1000,
metadata=df_test['type'])

In [None]:
open("foodies-historybuffs.html", 'wb').write(html.encode('utf-8'))

In [None]:
html = st.produce_scattertext_explorer(corpus, category='1', 
                                       category_name='Segment 1',  # for presentation only
                                       not_category_name='2',
                                       width_in_pixels=1000, 
                                       metadata=df['segment'])

In [None]:
open("segs.html", 'wb').write(html.encode('utf-8'))

In [None]:
feat_builder = st.FeatsFromOnlyEmpath()

In [None]:
empath_corpus = st.CorpusFromParsedDocuments(df, 
                                             category_col='segment', 
                                             feats_from_spacy_doc=feat_builder, 
                                             parsed_col='text').build()

In [None]:
html = st.produce_scattertext_explorer(empath_corpus, 
                                       category='1', 
                                       category_name='Segment 1',
                                       not_category_name='Segment 2',
                                       width_in_pixels=1000,
                                       metadata=df['segment'],
                                       use_non_text_features=True,
                                       use_full_doc=True,
                                       topic_model_term_lists=feat_builder.get_top_model_term_lists())

In [None]:
open("segs_empath.html", 'wb').write(html.encode('utf-8'))

In [None]:
x_min = term_freq_df.male_score.min()
x_max = term_freq_df.male_score.max()
x_mean = term_freq_df.male_score.mean()

In [None]:
y_min = term_freq_df.female_score.min()
y_max = term_freq_df.female_score.max()
y_mean = term_freq_df.female_score.mean()

In [None]:
trace0 = go.Scatter(
    x = term_freq_df.iloc[:1000].male_score,
    y = term_freq_df.iloc[:1000].female_score,
    mode = 'markers',
    name = 'markers',
    text= term_freq_df.iloc[:1000].index
)

In [None]:
data = [trace0]
iplot(data)

In [None]:
d = pd.read_csv('word_freq.csv')

In [None]:
d.head(3)