In [1]:
import json

import pandas as pd
import spacy

import time

import numpy as np

from collections import Counter, defaultdict
from itertools import chain
import re

import arrow 

import googlemaps

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go

import cufflinks as cf

import scattertext as st
from scattertext import word_similarity_explorer

from gender import GenderDetector

In [2]:
# Shared spaCy pipeline: used by T.prepr_ for parsing reviews and handed to scattertext
# when corpora are built further down.
nlp = spacy.load('en')

# Extend the language defaults' stop-word set with a few filler adverbs.
# NOTE(review): nothing visible in this notebook reads the stop-word set directly - confirm
# that a downstream consumer actually picks these additions up.
nlp.Defaults.stop_words |= {'probably', 'perhaps', 'really'}
# STOPWORDS = nlp.Defaults.stop_words

In [3]:
# Initialise plotly's offline notebook mode; presumably required before the `iplot(fig)` call
# further down can render inline - confirm against the installed plotly version.
init_notebook_mode(connected=True)

In [13]:
class T:
    
    def __init__(self, review_file, users_file, attract_file):
        
        """
        collected TripAdvisor data comes as JSONs; this class does some data processing including imputation
        """

        self.r = json.load(open(review_file))
        self.u = json.load(open(users_file))
        self.a = json.load(open(attract_file))  
        
        self.attribute_encodings = json.load(open('data/attribute_encodings.json')) 
        self.attribute_encodings_rev = defaultdict(lambda: defaultdict(str))
        
        self.countries = json.load(open('data/countries.json'))
        self.KEY_COUNTRIES = [line.lower().strip() for line in open('data/key_countries.txt').readlines() 
                                              if line.lower().strip()]
        
        for attr in self.attribute_encodings:
            self.attribute_encodings_rev[attr] = {s: i for i, s in self.attribute_encodings[attr].items()}
        
#         self.gmaps = googlemaps.Client(key=open('creds/geocoding_api.key').readline().strip())
#         self.gd = GenderDetector()
        
    
    def drop_unusable_reviews(self, subs=['text', 'attr_id']):
        
        self.r_df = self.r_df.dropna(subset=subs, how='any')
        
        return self
    
    def drop_unusable_users(self, subs=['name'], how='any'):
        
        self.u_df = self.u_df.dropna(subset=subs)
        
        return self
    
    def to_pandas(self):
        
        self.u_df = pd.DataFrame(self.u)
        self.r_df = pd.DataFrame(self.r)
        self.a_df = pd.DataFrame(self.a)
        
        return self
    
    def review_available_attributes(self):     
        
        self.genders = [_.lower() for _ in set(self.u_df['gender']) if _ and (_.lower() in 'm f'.split())]
        self.age_groups = sorted([ag for ag in set(self.u_df['age']) if '-' in str(ag)], 
                                               key=lambda x: int(str(x).split('-')[0]))
        
        self.tags = sorted(set([tg for tg in chain.from_iterable(self.u_df['tags'])]))
        
        print(f'available genders: {len(self.genders)}, age groups: {len(self.age_groups)}, tourist types: {len(self.tags)}')
        
        return self
    
    
    def show_stats(self):
        
        self.stats = defaultdict()
        
        self.stats['users'] = len(set(self.u_df['name']))
        self.stats['reviews'] = len(set(self.r_df['id']))
        self.stats['attractions'] = len(set(self.a_df['id']))
        
        self.stats['users_with_tags'] = len(set(self.u_df[self.u_df['tags'].apply(lambda _: isinstance(_, list) and len(_) > 0)]['name']))
        self.stats['users_with_gender'] = len(set(self.u_df[self.u_df['gender'].apply(lambda _: str(_) in 'm f'.split())]['name']))
        self.stats['users_with_age'] = len(set(self.u_df[self.u_df['age'].apply(lambda _: isinstance(_, str) and bool(re.search(r'\d+\-\d+', _)))]['name']))
        
        print(self.stats)
              
        return self
    
    def prepr_(self, review_text):
              
        """
        process a review text review_text provided as a string
        """
              
        if not review_text.strip():
              raise Exception('review with no text!')
              
        review_ = defaultdict(list)

        # create a doc from the original; this is needed to make sure the entities/labels are captured
        # as these are sensitive to lower/upper case
        doc = nlp(review_text)
        # store entities and labels     
        review_['ents'] = ' '.join([e.text for e in doc.ents]).strip()
        review_['labels'] = ' '.join([e.label_ for e in doc.ents]).strip()
         
        # now create a doc from the lowercased reviews
        doc = nlp(review_text.lower())
        
        review_['lemmatised'] = ' '.join([v for v in ['$' if w.is_currency else w.lemma_ for w in doc] if v.isalpha() and (len(v) > 1)]).strip()

        review_['nouns'] = ' '.join([w.lemma_ for w in doc if w.pos_ == 'NOUN']).strip()
        review_['verbs'] = ' '.join([w.lemma_ for w in doc if w.pos_ == 'VERB']).strip()
        
        return review_
    
    def add_processed_reviews(self):
        
        print(f'processing {len(self.r_df)} reviews..')
            
        dd = defaultdict()
        
        e_ = len(self.r_df)//10 
              
        for i, row in enumerate(self.r_df.iterrows(), 1):
              
            dd[row[1]['id']] = self.prepr_(row[1]['text'])
              
            if i%e_ == 0:
                print(f'#{i}/{len(self.r_df)} done..')
              
        self.r_df = self.r_df.join(pd.DataFrame.from_dict(dd, orient='index'), on='id', how='inner')
              
        return self
    
    def save_to_csv(self, what_lst):
        
        print('what_lst=', what_lst)
        
        if 'reviews' in what_lst:   
            self.r_df.to_csv('data/reviews.csv', index=False)
        if 'users' in what_lst:   
            self.u_df.to_csv('data/users.csv', index=False)
        if 'attractions' in what_lst:   
            self.a_df.to_csv('data/attractions.csv', index=False)
                
        return self
             
    def _tags_to_cols(self, tag_list):
        
        if not tag_list:
            return [None]*len(self.tags)
        
        return ['yes' if tag in tag_list else 'no' for tag in self.tags]
    
    def tags_to_cols(self):
        
        self.u_df = pd.concat([self.u_df, 
                            pd.DataFrame(self.u_df['tags'].apply(self._tags_to_cols).to_list(), 
                                         columns=self.tags)], axis=1).drop('tags', axis=1)
        
        return self

              
    def _fix_location(self, s):
              
        """
        using Google Geocoding API to clarify users location
        """
        
        loc = dict()
        
        if not (isinstance(s, str) and s.strip()):
            print('geocoding API needs a string argument!')
            return loc
        
        geocode_result = self.gmaps.geocode(s)
        
        # take only the top result
        if geocode_result:
            res = geocode_result[0]
        else:
            print(f'geocoding api can\'t find this location: {s}!')
            return loc
        
        if 'address_components' in res:
            for _ in res['address_components']:
                if 'country' in _['types']:
                    loc.update({'country': _['long_name']})
                if 'locality' in _['types']:
                    loc.update({'locality': _['long_name']})
        if 'formatted_address' in res:
            loc.update({'location': res['formatted_address']})
        
        try:
            loc.update({'coordinates': res['geometry']['location']})
        except:
            pass
        
        if not loc:
            print('locationd fields couldn\'t be retrieved from geocoding result!')
                               
        return loc

    
    def impute_location(self):
        
        print('imputing country...', end=' ')
              
        t0 = time.time()
              
        localities = []
        countries = []
    
        c_geo = 0
              
        in_str = lambda s1, s2: ' ' + s1 + ' ' in ' ' + s2 + ' '
        
        for i, row in enumerate(self.u_df.iterrows(), 1):
                               
            users_country = None
              
            if isinstance(row[1].location, str):
              
                loc_str = ' '.join(re.sub(r'[\-\_]', ' ', row[1].location).split()).lower()

                _found_countries = set()

                for country in self.countries:
              
                    if in_str(country['name'].lower(), loc_str):
                        _found_countries.add(country['name'].lower())
              
                    alt_names = country.get('other_names', None)
              
                    if alt_names:
                          for alt_name in alt_names:
                              if in_str(alt_name, loc_str):
                                  _found_countries.add(alt_name.lower())

                if len(_found_countries) == 1:
                    users_country = _found_countries.pop()
                else:
#                   # run geolocation
#                   r = self._fix_location(loc_str)
#                   c_geo += 1
              
#                   if 'country' in r:
#                      users_country = r['country'].lower()
              
                  users_country = None
            
#             print(f'#{i}: location: {row[1].location} -> country: {users_country}')
            
            countries.append(users_country)
                               
        self.u_df['country'] = [c if (not c) or (c in self.KEY_COUNTRIES) else 'other' for c in countries]
        
        m, s = divmod(time.time() - t0, 60)
              
        print(f'done. elapsed time: {m:.0f}:{s:.0f}')
              
#         print(f'ran geolocation {c_geo} times ({100*c_geo/len(self.u_df):.1f}%)')
        
        return self
    
    def _impute_gender(self, s):
        
        if not isinstance(s, str):
              return None
              
        # separate nicknames like TrevorJ into trevor J; or Mike23 into Mike 23
        s = re.sub(r'([a-z]{1})([A-Z0-9]+)', r'\1 \2', s)
        
        return self.gd.gender(s)
    
    def impute_gender(self):
        
        print('imputing gender...', end=' ')
              
        t0 = time.time()
        
        avail_msk = self.u_df['gender'].str.lower().isin(self.genders)
        
        tot_users = len(set(self.u_df['name']))
              
        g_avail_bf = len(set(self.u_df[avail_msk]['name']))
        
        av = self.u_df[avail_msk]
        nav = self.u_df[~avail_msk]
        
        nav['gender'] = nav['name'].apply(self._impute_gender)
              
        self.u_df = pd.concat([av, nav])
              
        m, s = divmod(time.time() - t0, 60)
              
        print(f'done. elapsed time: {m:.0f}:{s:.0f}')
              
        g_avail_af = len(set(self.u_df[self.u_df['gender'].str.lower().isin(self.genders)]['name']))
              
        print(f'availability +{100*g_avail_af/g_avail_bf - 100:.1f}%: now {g_avail_af:,} users ({100*g_avail_af/tot_users:.1f}%) was {g_avail_bf:,} ({100*g_avail_bf/tot_users:.1f}%)')
              
        return self
    
    def merge_data(self):
        
        self.data = self.r_df.join(self.u_df.set_index('name'), on='by_user', how='inner')
        
        self.data.to_csv('data/data_.csv')
        
        return self
    
    def selector(self, req_dict):

        """
        return a data frame obtained from the original one (df) by filtering out all rows that don't match
        the required values provided in the dictionary req_dict which looks like, for example, 
        {'age': '13-17', 'gender': 'f',...}

        what if after all the filtering all that's left is an empty data frame? then just return None
        """
        
        out = self.data
        
        if self.data.empty:
            print('dataframe you\'re trying to select from is empty!')
            return None

        actual_cols = set(self.data.columns) | {'tourist_type'}
        required_cols = set(req_dict)

        if not (required_cols <= actual_cols):
            cols_na = ', '.join(required_cols - actual_cols)
            print(f'column(s) {cols_na} you\'re asking for are not available!')
            return None

        for col in required_cols:

            if req_dict[col] != 'all':

                if col != 'tag':
                    out = out[out[col].astype(str) == req_dict[col]]
                else:
                    out = out[out[req_dict[col]] == 'yes']
                if out.empty:
                    print('dataframe you\'re trying to select from became empty!')
                    break
        if out.empty:
            return None
        else:
            return out
              
              
    def select_reviews_for_segments(self, seg1_dict, seg2_dict, min_frq=5):
        
        t0 = time.time()
        
        eligible = []
        seg_dfs = []
              
        for i, s in enumerate([seg1_dict, seg2_dict], 1):
            
            s_df = self.selector(s)
            
            if not isinstance(s_df, pd.DataFrame):
                eligible.append(False)
                break
            
            # what if selected not None but a data frame but there's not enough reviews?
              
            if len(set(s_df['by_user'])) < 50:
                print(f'not enough data for segment {i}')
                eligible.append(False)   
            else:
                eligible.append(True)
                seg_dfs.append(s_df)
        
        
        if not all(eligible):
            print('not enough data to compare segments!')
            return None
        else:
            
            k1 = seg_dfs[0]
            k1['segment'] = 'seg1'
            k2 = seg_dfs[1]
            k2['segment'] = 'seg2'
            
            kk = pd.concat([k1, k2])
            
                
        return kk
    
    def analyse_reviews_for_segments(self, df, min_frq=5):
        
        print('building corpus for segments ', set(df['segment']))
        
        corpus = st.CorpusFromPandas(df, 
                                     category_col='segment', 
                                     text_col='lemmatised', 
                                     nlp=nlp).build()
        
        print('done. now frequencies..')
              
        self.freq_data = corpus.get_term_freq_df().rename(columns={'seg1 freq': 'seg1_frq', 'seg2 freq': 'seg2_frq'})
        
        print('scaled f-scores..')
        
        self.freq_data['s1_score'] = corpus.get_scaled_f_scores('seg1')
        self.freq_data['s2_score'] = corpus.get_scaled_f_scores('seg2')
              
        # impose min frequency
        print('imposing min frequency ', min_frq)
        self.freq_data = self.freq_data[(self.freq_data['seg1_frq'] >= min_frq) & (self.freq_data['seg2_frq'] >= min_frq)]
        print(f'{len(self.freq_data):,} words occur at least {min_frq} times')
        
        print('scaling f-scores..')
        sc = np.vectorize(lambda s1, s2: 2*(-0.5+(s1 if s1>s2 else 1-s2 if s2>s1 else 0)))
              
        self.freq_data['nfsc'] = sc(self.freq_data['s1_score'], self.freq_data['s2_score'])
              
        # scale frequencies
        print('another scaling..')
        seg1_frq_min, seg1_frq_max = self.freq_data['seg1_frq'].min(), self.freq_data['seg1_frq'].max()
        seg2_frq_min, seg2_frq_max = self.freq_data['seg2_frq'].min(), self.freq_data['seg2_frq'].max()
              
        self.freq_data['seg1_frq_sc'] = (self.freq_data['seg1_frq'] - seg1_frq_min)/(seg1_frq_max - seg1_frq_min)
        self.freq_data['seg2_frq_sc'] = (self.freq_data['seg2_frq'] - seg2_frq_min)/(seg2_frq_max - seg2_frq_min)
        
        print(self.freq_data.head())
              
#         # filename to save as .csv
#         fn_ = ['tdf']
        
#         for i, sdik in enumerate([seg1_dict, seg2_dict], 1):
              
#               fn_.append('-seg' + str(i) + '-')  # so now ['textdf', '-seg1-']
            
#               for attr in 'age gender tourist_type country'.split():
                    
#                    fn_.append(attr[0]) # so now ['textdf', 'seg1', 'a']
#                    # self.attribute_encodings_rev is like 'age': {'13-17': '1'},..
#                    fn_.append(self.attribute_encodings_rev[attr].get(sdik.get(attr, 'all'), 'all'))  # ['textdf', '-seg1-', 'a', '2']
        
#         fn = 'data/' + ''.join(fn_) + '.csv'

#         self.freq_data.to_csv(fn)
              
#         m, s = divmod(time.time() - t0, 60)
              
#         print(f'done. savet to {fn}. elapsed time: {m:.0f}:{s:.0f}')
        
        return self
    
#     def generate_text_data(self):
        
#         s1s = [{'age': ag, 'gender': g} for g in self.genders for ag in self.age_groups]
#         s2s = [{'age': ag, 'gender': g} for g in self.genders for ag in self.age_groups]
        
#         for s1 in s1s:
#               for s2 in s2s:
#                   self.get_textdata(s1, s2)
              
#         return self
        

In [14]:
if __name__ == '__main__':

    # Full preparation pipeline for the Melbourne data; every step returns the T instance,
    # so the steps are simply chained in the order they have to run.
    t = (
        T(review_file='mlb/revs.json',
          users_file='mlb/usrs.json',
          attract_file='mlb/attractions_melbourne.json')
        .to_pandas()
        .drop_unusable_reviews()
        .drop_unusable_users()
        .review_available_attributes()
        .show_stats()
        .add_processed_reviews()
        .save_to_csv(['reviews', 'attractions', 'users'])
        .tags_to_cols()
        .merge_data()
    )
    
#                 .stats().add_processed_reviews()
#     tags_to_cols().merge_data().generate_text_data()
#                 .impute_location() \
#                 .impute_gender() \
#                 .merge_data() \
                

available genders: 2, age groups: 5, tourist types: 19
defaultdict(None, {'users': 15618, 'reviews': 25131, 'attractions': 477, 'users_with_tags': 3424, 'users_with_gender': 6974, 'users_with_age': 5444})
processing 25131 reviews..
#2513/25131 done..
#5026/25131 done..
#7539/25131 done..
#10052/25131 done..
#12565/25131 done..
#15078/25131 done..
#17591/25131 done..
#20104/25131 done..
#22617/25131 done..
#25130/25131 done..
what_lst= ['reviews', 'attractions', 'users']


In [15]:
# Reviews of the two segments to compare: seg1 = users aged 50-64, seg2 = female users.
# The result is None when either segment can't be selected or has too few reviewers.
p = t.select_reviews_for_segments({'age': '50-64'}, {'gender': 'f'})

In [19]:
# Compute the per-term frequencies and scores for the two segments; the result is stored in t.freq_data.
t.analyse_reviews_for_segments(p)

building corpus for segments  {'seg1', 'seg2'}
done. now frequencies..
scaled f-scores..
imposing min frequency  5
7,703 words occur at least 5 times
scaling f-scores..
another scaling..
           seg1_frq  seg2_frq  s1_score  s2_score      nfsc  seg1_frq_sc  \
term                                                                       
another         109       118  0.160207  0.839793 -0.679586     0.006645   
beautiful       571       931  0.039668  0.960332 -0.920664     0.036162   
art             405       454  0.095306  0.904694 -0.809388     0.025556   
gallery          97        82  0.883550  0.116450  0.767101     0.005878   
in             3783      4088  0.101740  0.898260 -0.796521     0.241375   

           seg2_frq_sc  
term                    
another       0.006658  
beautiful     0.054557  
art           0.026454  
gallery       0.004537  
in            0.240559  


<__main__.T at 0x129e1e828>

In [18]:
# Inspect which columns the selected segment reviews carry.
p.columns

Index(['attr_id', 'by_user', 'date_of_experience', 'date_of_writing', 'id',
       'rating', 'text', 'title', 'ents', 'labels', 'lemmatised', 'nouns',
       'verbs', 'age', 'gender', 'location', 'real_name', '60+ traveller',
       'art and architecture lover', 'backpacker', 'beach goer', 'eco-tourist',
       'family holiday maker', 'foodie', 'history buff', 'like a local',
       'luxury traveller', 'nature lover', 'nightlife seeker',
       'peace and quiet seeker', 'shopping fanatic', 'thrifty traveller',
       'thrill seeker', 'trendsetter', 'urban explorer', 'vegetarian',
       'segment'],
      dtype='object')

In [44]:
# Scatter plot of characteristic words: x/y are the scaled term frequencies in the two
# segments, the colour tells which segment a term is more strongly associated with.
# Requires t.freq_data, i.e. t.analyse_reviews_for_segments(...) must have been run first.

TAKE = 300  # plot at most this many terms (the most frequent ones in seg1)

df = t.freq_data.sort_values('seg1_frq', ascending=False)

# keep mid-range terms only: strictly between the 20% and 95% frequency quantiles of both segments
s1_tb, s1_tu = df['seg1_frq_sc'].quantile(q=[0.20, 0.95])
s2_tb, s2_tu = df['seg2_frq_sc'].quantile(q=[0.20, 0.95])

df = df[((df['seg1_frq_sc'] < s1_tu) & (df['seg1_frq_sc'] > s1_tb)) &
        ((df['seg2_frq_sc'] < s2_tu) & (df['seg2_frq_sc'] > s2_tb))].iloc[:TAKE]


def _frequency_axis(title, scaled_frq, **extra):
    """Axis settings shared by both axes: five ticks from the column's min to max, labelled low..high."""
    return dict(
        title=title,
        ticklen=5,
        tickmode='array',
        tickvals=np.linspace(scaled_frq.min(), scaled_frq.max(), num=5),
        ticktext=['low', '', 'medium', '', 'high'],
        zeroline=False,
        gridwidth=2,
        showticklabels=True,
        showgrid=True,
        **extra
    )


def _segment_trace(name, terms, colour):
    """Marker trace for the given terms; hovering a marker shows the term itself."""
    return go.Scatter(
        name=name,
        x=terms['seg1_frq_sc'],
        y=terms['seg2_frq_sc'],
        mode='markers',
        hoverinfo='text',
        marker=dict(color=colour, size=10, opacity=0.85),
        text=terms.index)


layout = go.Layout(
#     title= 'Characteristic Words',
    hovermode='closest',
    xaxis=_frequency_axis('Frequency in Reviews by Seg 1', df['seg1_frq_sc']),
    yaxis=_frequency_axis('Frequency in Reviews by Seg 2', df['seg2_frq_sc'], tickangle=-90),
    legend=dict(orientation="h", x=0.5, y=1.1, yanchor="top"),
    annotations=[dict(text='Stronger Association with ', x=0.41, y=1.08,
                      showarrow=False,
                      xref="paper",
                      yref="paper",
                      yanchor="top")],
    showlegend=True
)

# nfsc is positive when s1_score > s2_score (see T.analyse_reviews_for_segments), so positive
# terms belong to Seg1 and negative ones to Seg2 - the two legend names used to be swapped
df_pos = df[df['nfsc'] > 0]
df_neg = df[df['nfsc'] < 0]

trace0 = _segment_trace('Seg1', df_pos, 'orange')
trace1 = _segment_trace('Seg2', df_neg, '#2C72EC')


fig = go.Figure(data=[trace0, trace1], layout=layout)

AttributeError: 'T' object has no attribute 'freq_data'

In [None]:
# Peek at the first terms that made it into the plot.
df.head(3)

In [None]:
# Render the characteristic-words figure inline.
iplot(fig)

In [86]:
# Column overview (non-null counts, dtypes) of the merged reviews + users frame.
t.data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22264 entries, 0 to 22262
Data columns (total 32 columns):
attr_id                       22264 non-null object
by_user                       22261 non-null object
date_of_experience            22160 non-null object
date_of_writing               22264 non-null object
id                            22264 non-null object
rating                        22264 non-null float64
text                          22263 non-null object
title                         22264 non-null object
age                           8442 non-null object
gender                        16569 non-null object
location                      19374 non-null object
real_name                     21565 non-null object
like a local                  6560 non-null object
family holiday maker          6560 non-null object
thrill seeker                 6560 non-null object
shopping fanatic              6560 non-null object
nature lover                  6560 non-null object
art and arch

In [87]:
# Build a scattertext corpus over the raw review texts with the reviewer's gender as category.
# (The pasted '...' continuation prompts were removed; they only ran thanks to IPython's
# prompt stripping and are a syntax error in plain Python.)
corpus = st.CorpusFromPandas(
    t.data,
    category_col='gender',
    text_col='text',
    nlp=nlp,
).build()

In [91]:
# Per-term frequency table of the gender corpus (not used further in the visible cells).
term_freq_df = corpus.get_term_freq_df()

# Generate the interactive scattertext explorer as an HTML string: category 'm' versus 'f'.
html = st.produce_scattertext_explorer(corpus, category='m', category_name='m', not_category_name='f', width_in_pixels=1000)


read_table is deprecated, use read_csv instead, passing sep='\t'.



In [92]:
# Save the explorer page; the context manager makes sure the file is flushed and closed.
with open("melb.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

7184955