In [2]:
import numpy as np
import pandas as pd
import scipy.stats as st

# Load data

<div class="alert alert-warning">
<b>Warning:</b> Update the data directory path before running.
</div>

In [3]:
# Root folder of the dataset; the CSV paths in the loading cells are built by
# appending a suffix to this string. The value ends with "/" and the suffixes
# start with "/", so the resulting paths contain a doubled separator.
data_directory = "../../../../../../data/web_routineness_release_clean/"

In [4]:
# Table of users retained for the analysis; later cells merge on its 'user' column.
selected_users = pd.read_csv(data_directory + '/pre_processed/selected_users.csv')

In [5]:
# Pre-processed browsing records; later cells read its 'panelist_id', 'domain',
# 'active_seconds' and 'category_names' columns.
processed = pd.read_csv(data_directory + "/pre_processed/browsing_with_gap.csv")

In [6]:
# Raw user table, with 'panelist_id' renamed to 'user' so it can be joined
# (inner merge on 'user') against the selected users.
USERS = pd.merge(
    selected_users,
    pd.read_csv(data_directory + '/raw/users.csv').rename(columns={'panelist_id': 'user'}),
    on='user',
)

We focus on stationary predictability:

In [7]:
# Predictability results from the stationary-trajectory computation; later cells
# use its 'user' and 'domain_pi_max' columns.
STAT_RES = pd.read_csv(data_directory + '/computation_output/stationary_trajectory/predictability.csv')

# Web browsing behavior

### User activity

In [8]:
# One row per panelist: summed active seconds and the count of non-null
# 'domain' entries (i.e. number of recorded visits).
user_activity = (
    processed
    .groupby('panelist_id', as_index=False)
    .agg({'active_seconds': 'sum', 'domain': 'count'})
)

In [9]:
# Attach each panelist's 'domain_pi_max' score, drop the now-redundant
# 'panelist_id' key, and inner-join onto the selected users.
user_activity = pd.merge(
    selected_users,
    user_activity
    .merge(STAT_RES[['user', 'domain_pi_max']], left_on='panelist_id', right_on='user')
    .drop(columns=['panelist_id']),
    on='user',
)

In [10]:
# Pearson correlation of 'domain_pi_max' with total active time and visit count.
# Only the three columns involved are selected before calling corr(): pairwise
# correlations do not depend on the other columns, and this keeps the cell
# working on pandas >= 2.0, where corr() no longer silently drops non-numeric
# columns.
(
    user_activity[["domain_pi_max", "active_seconds", "domain"]]
    .corr("pearson")
    .loc[["active_seconds", "domain"], "domain_pi_max"]
)

active_seconds    0.400080
domain            0.007387
Name: domain_pi_max, dtype: float64

### User interests

In [11]:
# Split the comma-separated 'category_names' string of every record into a
# list of category labels, stored in a new 'category' column.
processed['category'] = processed['category_names'].map(lambda names: names.split(","))

In [12]:
# Per-panelist breadth of interests. The assign() entries are evaluated in
# order, so later ones may read columns created by earlier ones:
#   category        - all per-record category lists concatenated into one array
#   domain_count    - number of collected domain entries
#   un_domain       - set of distinct domains
#   un_category     - set of distinct categories
#   un_*_len        - sizes of those sets
# The predictability score is then attached and the 'panelist_id' key dropped.
user_interest = (
    processed
    .groupby('panelist_id', as_index=False)
    .agg({'domain': list, 'category': list})
    .assign(
        category=lambda frame: frame['category'].apply(np.concatenate),
        domain_count=lambda frame: frame['domain'].apply(len),
        un_domain=lambda frame: frame['domain'].apply(set),
        un_category=lambda frame: frame['category'].apply(set),
        un_domain_len=lambda frame: frame['un_domain'].apply(len),
        un_category_len=lambda frame: frame['un_category'].apply(len),
    )
    .merge(STAT_RES[['user', 'domain_pi_max']], left_on='panelist_id', right_on='user')
    .drop(columns=['panelist_id'])
)
# Inner-join onto the selected users.
user_interest = pd.merge(selected_users, user_interest, on='user')

In [13]:
# Pearson correlation of 'domain_pi_max' with the number of distinct domains
# and distinct categories. user_interest also carries list/array/set columns
# ('domain', 'category', 'un_domain', 'un_category'); calling corr() on the
# whole frame raises on pandas >= 2.0, which no longer drops non-numeric
# columns by default, so only the numeric columns involved are selected first.
(
    user_interest[["domain_pi_max", "un_domain_len", "un_category_len"]]
    .corr("pearson")
    .loc[["un_domain_len", "un_category_len"], "domain_pi_max"]
)

un_domain_len     -0.228612
un_category_len   -0.161655
Name: domain_pi_max, dtype: float64

### User stationarity

In [14]:
# Per-panelist visit count plus mean and median active seconds per record.
user_stationarity = (
    processed
    .groupby('panelist_id', as_index=False)
    .agg({'domain': 'count', 'active_seconds': ['mean', 'median']})
)
# Flatten the two-level column labels: ('active_seconds', 'mean') becomes
# 'active_seconds_mean'; labels whose second level is empty (the group key)
# keep their first-level name.
user_stationarity.columns = [
    top if sub == '' else '_'.join((top, sub))
    for top, sub in user_stationarity.columns
]
# Attach each panelist's predictability score.
user_stationarity = user_stationarity.merge(
    STAT_RES[['user', 'domain_pi_max']],
    left_on='panelist_id',
    right_on='user',
)

In [15]:
# Drop the redundant 'panelist_id' key and inner-join onto the selected users.
user_stationarity = pd.merge(
    selected_users,
    user_stationarity.drop('panelist_id', axis=1),
    on='user',
)

In [16]:
# Pearson correlation of 'domain_pi_max' with the mean and median active
# seconds per record. Only the three columns involved are selected before
# calling corr(): the pairwise values are unchanged, and the cell no longer
# depends on every other column of the frame being numeric (required by
# pandas >= 2.0, where corr() does not drop non-numeric columns by default).
(
    user_stationarity[["domain_pi_max", "active_seconds_mean", "active_seconds_median"]]
    .corr("pearson")
    .loc[["active_seconds_mean", "active_seconds_median"], "domain_pi_max"]
)

active_seconds_mean      0.650178
active_seconds_median    0.332005
Name: domain_pi_max, dtype: float64

# Gender- and age-based differences in browsing behavior

In [17]:
def get_conf_interval(arr, percentage=.9):
    """Return a Student-t confidence interval for the mean of ``arr``.

    Parameters
    ----------
    arr : sized collection of numbers (list, array, Series)
        Sample whose mean is being estimated.
    percentage : float, default 0.9
        Confidence level passed to ``scipy.stats.t.interval``.

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds centred on the sample mean, using the
        standard error of the mean as scale and ``len(arr) - 1`` degrees of
        freedom.
    """
    center = np.mean(arr)
    std_err = st.sem(arr)
    # A sample with no spread has a standard error of exactly 0; the t
    # distribution rejects scale=0 and would yield (nan, nan). The interval
    # then collapses to the mean itself.
    if np.ndim(std_err) == 0 and std_err == 0:
        return (center, center)
    return st.t.interval(percentage, len(arr) - 1, loc=center, scale=std_err)


def get_mean_with_intervals(df, var, intervals):
    """Return the mean of ``df[var]`` followed by one confidence interval per level.

    Parameters
    ----------
    df : mapping-like / DataFrame
        Object indexed as ``df[var]`` to obtain the sample.
    var : key
        Column (key) to summarise.
    intervals : iterable of float
        Confidence levels, each forwarded to ``get_conf_interval``.

    Returns
    -------
    tuple
        ``(mean, ci_1, ci_2, ...)`` where each ``ci_i`` is whatever
        ``get_conf_interval`` returns for the corresponding level.
    """
    values = df[var]
    bounds = [get_conf_interval(values, level) for level in intervals]
    return (np.mean(values), *bounds)

### Gender

Number of distinct domains visited

In [18]:
# Mean number of distinct domains per gender with its confidence interval.
percentage = .95
for k, g in user_interest.merge(USERS, on='user').groupby('gender'):
    m, (lower, upper) = get_mean_with_intervals(g, 'un_domain_len', [percentage])
    print(k, "\t%1.2f, %d%% CI [%1.2f, %1.2f]" % (m, percentage*100, lower, upper))

female 	140.22, 95% CI [132.16, 148.29]
male 	161.47, 95% CI [151.81, 171.13]


Mean seconds spent per domain visit

In [19]:
# Mean of the per-user average active seconds, per gender, with its confidence interval.
percentage = .95
for k, g in user_stationarity.merge(USERS, on='user').groupby('gender'):
    m, (lower, upper) = get_mean_with_intervals(g, 'active_seconds_mean', [percentage])
    print(k, "\t%1.3f, %d%% CI [%1.2f, %1.2f]" % (m, percentage*100, lower, upper))

female 	37.339, 95% CI [35.54, 39.14]
male 	33.147, 95% CI [31.47, 34.83]


### Age

Number of distinct domains visited

In [20]:
# Mean number of distinct domains per age bracket with its confidence interval.
# Groups are visited in sorted label order; as the recorded output shows, the
# '[18,24]' bracket therefore comes after the '(..' brackets.
percentage = .95
for k, g in user_interest.merge(USERS, on='user').groupby('age_recode'):
    m, (lower, upper) = get_mean_with_intervals(g, 'un_domain_len', [percentage])
    print(k, "%1.2f, %d%% CI [%1.2f, %1.2f]" % (m, percentage*100, lower, upper))

(24,34] 143.15, 95% CI [129.55, 156.75]
(34,44] 155.58, 95% CI [138.69, 172.47]
(44,54] 157.41, 95% CI [144.95, 169.88]
(54,64] 164.48, 95% CI [149.50, 179.46]
(64,80] 142.94, 95% CI [113.99, 171.90]
[18,24] 123.77, 95% CI [109.97, 137.58]


Mean seconds spent per domain visit

In [21]:
# Mean of the per-user average active seconds, per age bracket, with its
# confidence interval. Groups are visited in sorted label order; as the
# recorded output shows, the '[18,24]' bracket comes after the '(..' brackets.
percentage = .95
for k, g in user_stationarity.merge(USERS, on='user').groupby('age_recode'):
    m, (lower, upper) = get_mean_with_intervals(g, 'active_seconds_mean', [percentage])
    print(k, "%1.3f, %d%% CI [%1.2f, %1.2f]" % (m, percentage*100, lower, upper))

(24,34] 29.357, 95% CI [27.74, 30.97]
(34,44] 29.316, 95% CI [27.21, 31.42]
(44,54] 36.594, 95% CI [34.13, 39.06]
(54,64] 41.608, 95% CI [38.09, 45.13]
(64,80] 40.167, 95% CI [35.05, 45.29]
[18,24] 34.441, 95% CI [31.17, 37.72]
