In [1]:
import pandas as pd
import multiprocessing as mp
import glob
from functools import partial, reduce
from functional import seq
import seaborn as sns

In [2]:
import numpy as np

# Load data


<div class="alert alert-warning">
<b>Warning:</b> Update the data directory path before running. 
</div>

We tested this code using the data available at https://zenodo.org/record/4383164.

In [3]:
data_directory = "../../../../../../data/web_routineness_release_clean/"

In [4]:
users = pd.read_csv(data_directory + '/raw/users.csv')

In [5]:
urls = pd.read_csv(
    data_directory + '/raw/browsing.csv', dtype={'id': int, 'web_visits_id': int}, parse_dates=['used_at'])

In [6]:
domain_categories = pd.read_csv(
    data_directory + '/raw/domain_categories.csv', converters={'category': lambda x: tuple(sorted(x.split(',')))})

# Defining the category of a website

Cleaning up the data:

In [7]:
words_dashed = ['black', 'content', 'illegal', 'information', 'job', 'media', 
                'message', 'filter', 'real', 'search', 'social', 'streaming', 'virtual']

def put_dash(words_dashed, text):
    temp = list(filter(lambda w: text.startswith(w), words_dashed))
    return text if len(temp) < 1 else text.replace(temp[0], f'{temp[0]}-')


words_replaced = [('food', 'food and recipes'), ('filteravoidance', 'proxy and filter-avoidance'), 
                  ('translation', 'translators'), ('sport', 'sports')]

def replace(words_replaced, text):
    temp = list(filter(lambda p: p[0] in text, words_replaced))
    return text if len(temp) < 1 else temp[0][1]
    

dash = partial(put_dash, words_dashed)
words = partial(replace, words_replaced)

categories = pd.DataFrame(list(seq(domain_categories.category).reduce(lambda acc, i: acc.union( set(i)), set())), columns=['raw']) \
                .sort_values('raw') \
                .assign(name=lambda df: df.raw.apply(lambda x: words(dash(x.replace('and', ' and '))))) \
                .assign(id=lambda df: (df.name != df.name.shift()).cumsum())

#### Add Category Manually

We will add two new categories: email and productivity.

In [8]:
categories = categories.append(
    pd.DataFrame([(None, 'email', 0), (None, 'productivity', 0)], 
                 columns=['raw', 'name', 'id'])).sort_values(['name', 'raw']).assign(
    id=lambda df: (df.name != df.name.shift()).cumsum())

Let's export this:

In [9]:
categories.to_csv(data_directory + '/pre_processed/categories.csv', index=False)

#### Rebuild Domain Categories

- There was domain-category_ids match, now it's domain-category_names

In [10]:
def find_category_names(categories, t):
    return tuple(map(lambda c: categories.query(f"raw == '{c}'").iloc[0, 1], t))

category = partial(find_category_names, categories)
domain_categories = domain_categories.assign(category_names=domain_categories.category.apply(category))

Exporting the data:

In [11]:
domain_categories_processed = domain_categories[['domain', 'category_names']].assign(
    category_names=lambda df: df.category_names.apply(lambda l: ','.join(l)))

domain_categories_processed.to_csv(data_directory + '/pre_processed/processed_domain_categories.csv', index=False)

# Adding category info related to the subdomains

In [12]:
subdomains = urls[~urls.subdomain.isna()][["id", "url", "domain", "subdomain"]].copy()
subdomains = subdomains.merge(domain_categories_processed, on='domain')

### Examining 'mail' subdomains

1. Large companies which provide email services such as Google, Yahoo, T-Online, Vodafoen, GMX, Web.de.
2. Universities in Germany provide email services.
3. Other subdomains are seen in websites for the business purposes, i.e. subscription, login to buy, etc.

In [13]:
def get_subdomains(df, main, sub): 
    return df[(df.domain == main) & (df.subdomain.str.contains(sub))]

Let's see the ones that contains `mail`:

In [14]:
mail_subdomains = subdomains[subdomains.subdomain.str.contains('mail')]
top_mail_subdomains = mail_subdomains.groupby('domain', as_index=False).agg({'url': 'count'}).nlargest(400, 'url')
top_mail_subdomains.to_csv(data_directory + '/pre_processed/domains_containing_mail_raw.csv', index=False)

Next, we manually curated this .csv in order to generate a new file named `domains_contain_mail.csv`.

The method: 
- Look at the top domains with subdomain that has _mail_ string.
- If sample urls belong the domain seem meaningful urls, then check domain as email.
- Totally, 400 domain were evaluated manually.

### Creating subdomains of different types

* Emails Domains

In [15]:
urls_email = pd.concat(
    [
        subdomains[(subdomains.subdomain.str.endswith('mail.'))]
            .merge(pd.read_csv(data_directory + '/pre_processed/domains_containing_mail.csv', 
                               usecols=['domain', 'is_email']).query('is_email == 1'), on='domain')
            .drop(columns=['is_email']),
        get_subdomains(subdomains, 'google.com', 'accounts.'),
        get_subdomains(subdomains, 'web.de', 'navigator.'),
        get_subdomains(subdomains, 'yahoo.com', 'login.'),
        get_subdomains(subdomains, 'live.com', 'outlook.'),
        get_subdomains(subdomains, 'live.com', 'login.'),
        get_subdomains(subdomains, 'live.com', 'account.'),
        get_subdomains(subdomains, 'live.com', 'signup.'),
        get_subdomains(subdomains, 'gmx.net', 'navigator.'),
        get_subdomains(subdomains, 'aol.com', 'login.'),
    ],
    sort=False
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['email']))
).drop_duplicates(['id'])

In [16]:
urls_email.shape

(970034, 6)

* Productivity domains

In [17]:
urls_productivity = pd.concat([
    get_subdomains(subdomains, 'google.com', 'docs.'),
    get_subdomains(subdomains, 'google.com', 'calendar.'),
    get_subdomains(subdomains, 'google.com', 'office.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['productivity']))
).drop_duplicates(['id'])

urls_productivity.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
248323,1018878760,112544,google.com,docs.,"(productivity,)",docs.google.com
486598,1072580243,1048409,google.com,docs.,"(productivity,)",docs.google.com
198228,1049128607,112619,google.com,docs.,"(productivity,)",docs.google.com
577621,1097814380,1692581,google.com,docs.,"(productivity,)",docs.google.com
487674,1072865283,1249093,google.com,docs.,"(productivity,)",docs.google.com


In [18]:
urls_productivity.shape

(25244, 6)

* Media sharing domains

In [19]:
urls_media_sharing = pd.concat([
    get_subdomains(subdomains, 'google.com', 'photos.'),
    get_subdomains(subdomains, 'google.com', 'drive.'),
    get_subdomains(subdomains, 'live.com', 'onedrive.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['media-sharing']))
).drop_duplicates(['id'])

urls_media_sharing.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
365015,1045037432,1621075,google.com,photos.,"(media-sharing,)",photos.google.com
365026,1045037400,1621082,google.com,photos.,"(media-sharing,)",photos.google.com
384013,1047637236,1718252,google.com,photos.,"(media-sharing,)",photos.google.com
400618,1052449104,622010,google.com,photos.,"(media-sharing,)",photos.google.com
139390,1034113684,17846,google.com,drive.,"(media-sharing,)",drive.google.com


In [20]:
urls_media_sharing.shape

(15375, 6)

* Social networking domains

In [21]:
urls_social_networking = pd.concat([
    get_subdomains(subdomains, 'google.com', 'plus.'),
    get_subdomains(subdomains, 'yahoo.com', 'drive.'),
    get_subdomains(subdomains, 'vodafone.de', 'groups.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['social-networking']))
).drop_duplicates(['id'])

urls_social_networking.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
525434,1084774904,47583,google.com,plus.,"(social-networking,)",plus.google.com
540533,1089643784,294048,google.com,plus.,"(social-networking,)",plus.google.com
263439,1060448799,294048,google.com,plus.,"(social-networking,)",plus.google.com
381973,1093216431,1702918,google.com,plus.,"(social-networking,)",plus.google.com
292667,1065272095,294048,google.com,plus.,"(social-networking,)",plus.google.com


In [22]:
urls_social_networking.shape

(2285, 6)

* News domains

In [23]:
urls_news = pd.concat([
    get_subdomains(subdomains, 'google.com', 'news.'),
    get_subdomains(subdomains, 'vodafone.de', 'magazin.'),
    get_subdomains(subdomains, 'vodafone.de', 'x.enews.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['news']))
).drop_duplicates(['id'])

urls_news.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
154775,1036665747,56258,google.com,news.,"(news,)",news.google.com
329441,1076231523,1383022,google.com,news.,"(news,)",news.google.com
513875,1081026131,145170,google.com,news.,"(news,)",news.google.com
544553,1090741324,2668880,google.com,news.,"(news,)",news.google.com
181963,1045689731,25409,google.com,news.,"(news,)",news.google.com


In [24]:
urls_news.shape

(6165, 6)

* Translators domains

In [25]:
urls_translators = pd.concat([
    get_subdomains(subdomains, 'google.com', 'translate.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['translators']))
).drop_duplicates(['id'])

urls_news.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
230553,1015601324,30580,google.com,news.,"(news,)",news.google.com
485233,1072065875,25409,google.com,news.,"(news,)",news.google.com
437479,1066833920,25409,google.com,news.,"(news,)",news.google.com
165971,1041049151,260270,google.com,news.,"(news,)",news.google.com
333040,1076754771,1407742,google.com,news.,"(news,)",news.google.com


In [26]:
urls_translators.shape

(28106, 6)

* Shopping domains

In [27]:
urls_shopping = pd.concat([
    get_subdomains(subdomains, 'google.com', 'play.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['shopping']))
).drop_duplicates(['id'])

urls_shopping.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
579177,1098292344,2907104,google.com,play.,"(shopping,)",play.google.com
146066,1037062600,6678,google.com,play.,"(shopping,)",play.google.com
350628,1079277615,1516059,google.com,play.,"(shopping,)",play.google.com
153685,1036534151,157189,google.com,play.,"(shopping,)",play.google.com
216625,1052205683,33444,google.com,play.,"(shopping,)",play.google.com


In [28]:
urls_shopping.shape

(3206, 6)

# Exporting data

Creating a new browsing history file:

In [29]:
urls_with_categories = urls.merge(domain_categories, on='domain').rename(columns={'domain': 'top_level_domain'})

In [30]:
urls_with_categories.head()

Unnamed: 0,web_visits_id,id,panelist_id,url,used_at,active_seconds,top_level_domain,subdomain,category,category_names
0,111761051,1033436752,1421,0,2018-10-05 22:02:38,4,ebesucher.de,,"(business, education)","(business, education)"
1,111761051,1033436580,1421,0,2018-10-05 21:41:22,10,ebesucher.de,,"(business, education)","(business, education)"
2,111761051,1033436588,1421,0,2018-10-05 21:45:36,5,ebesucher.de,,"(business, education)","(business, education)"
3,111761051,1033436584,1421,0,2018-10-05 21:42:22,181,ebesucher.de,,"(business, education)","(business, education)"
4,111761159,1033437168,1421,0,2018-10-05 23:48:14,6,ebesucher.de,,"(business, education)","(business, education)"


In [31]:
urls_sublevel = pd.concat([
    urls_email,
    urls_productivity,
    urls_media_sharing,
    urls_social_networking,
    urls_news,
    urls_translators,
    urls_shopping
], sort=False).drop_duplicates(['id'])

urls_sublevel.shape

(1050415, 6)

In [32]:
urls_sublevel.head()

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
0,1034500040,28061,live.com,bay174.mail.,"(email,)",bay174.mail.live.com
1,1037376788,97009,live.com,dub125.mail.,"(email,)",dub125.mail.live.com
2,1037377304,97009,live.com,dub125.mail.,"(email,)",dub125.mail.live.com
3,1035452643,134272,live.com,dub118.mail.,"(email,)",dub118.mail.live.com
4,1036819471,163690,live.com,dub129.mail.,"(email,)",dub129.mail.live.com


In [33]:
urls_with_subdomains = urls_with_categories.merge(
    urls_sublevel[['id', 'sub_level_domain', 'category_names']],
    on='id',
    how='left',
    suffixes=('_top', '_sub')
)

In [34]:
urls_with_subdomains = urls_with_subdomains.assign(
                domain = np.where(urls_with_subdomains.sub_level_domain.notnull(), 
                                  urls_with_subdomains.sub_level_domain, 
                                  urls_with_subdomains.top_level_domain),
                category_names = np.where(urls_with_subdomains.category_names_sub.notnull(), 
                                          urls_with_subdomains.category_names_sub, 
                                          urls_with_subdomains.category_names_top))

In [35]:
urls_with_subdomains.sample(5)

Unnamed: 0,web_visits_id,id,panelist_id,url,used_at,active_seconds,top_level_domain,subdomain,category,category_names_top,sub_level_domain,category_names_sub,domain,category_names
2060071,113816735,1048246831,2061,169,2018-10-09 21:44:39,2,google.com,mail.,"(searchenginesandportals,)","(search-engines and portals,)",mail.google.com,"(email,)",mail.google.com,"(email,)"
5918010,114665660,1016872964,1939,737530,2018-10-02 21:18:03,223,imdb.com,,"(entertainment,)","(entertainment,)",,,imdb.com,"(entertainment,)"
7537681,120086116,1074074912,1969,2153459,2018-10-20 13:29:14,4,flickr.com,,"(mediasharing,)","(media-sharing,)",,,flickr.com,"(media-sharing,)"
7190445,114128859,1051043651,1166,256245,2018-10-11 19:23:10,179,schoener-fernsehen.com,,"(entertainment,)","(entertainment,)",,,schoener-fernsehen.com,"(entertainment,)"
667973,115095699,1053965792,1812,18934,2018-10-14 12:37:59,2,live.com,outlook.,"(chatandmessaging,)","(chat and messaging,)",outlook.live.com,"(email,)",outlook.live.com,"(email,)"


##### Constructing new Time Difference DataFrame of Consecutive Records per User

In [36]:
def construct_consecutive_url_dataframe(df):
    df = df.sort_values(['panelist_id', 'used_at'])
    df = df.assign(
            prev_id = df.id.shift().apply(lambda x: 0 if np.isnan(x) else int(x)) * (df.panelist_id == df.panelist_id.shift()),
            left_at = df.used_at + df.active_seconds.apply(lambda x: pd.Timedelta(seconds=x)),
        )
    df = df.assign(
            gap_seconds = (df.used_at - df.left_at.shift()).dt.total_seconds().apply(lambda x: 0 if np.isnan(x) else int(x)) * (df.panelist_id == df.panelist_id.shift())
                )[['id', 'prev_id', 'panelist_id', 'used_at', 'left_at', 'active_seconds', 'gap_seconds', 'top_level_domain', 'category_names_top',
        'sub_level_domain', 'subdomain', 'category_names_sub', 'domain',
        'category_names']]
    return df

In [37]:
urls_with_subdomains_gap = construct_consecutive_url_dataframe(urls_with_subdomains)

In [38]:
len(urls_with_subdomains_gap)

9151243

In [39]:
urls_with_subdomains_gap.sample(10)

Unnamed: 0,id,prev_id,panelist_id,used_at,left_at,active_seconds,gap_seconds,top_level_domain,category_names_top,sub_level_domain,subdomain,category_names_sub,domain,category_names
6147854,1071547243,1071547239,209,2018-10-22 22:08:50,2018-10-22 22:09:02,12,0,jappy.com,"(business,)",,,,jappy.com,"(business,)"
105779,1053479460,1053479456,122,2018-10-14 13:51:15,2018-10-14 13:51:21,6,0,amazon.de,"(shopping,)",,smile.,,amazon.de,"(shopping,)"
8744028,1079779191,1079779187,573,2018-10-25 19:15:20,2018-10-25 19:15:28,8,0,trekkinn.com,"(shopping,)",,,,trekkinn.com,"(shopping,)"
1882029,1092399108,1092399104,391,2018-10-27 12:53:51,2018-10-27 12:58:51,300,0,facebook.com,"(social-networking,)",,apps.,,facebook.com,"(social-networking,)"
9040138,1082262528,1082262524,1261,2018-10-23 12:41:43,2018-10-23 12:41:47,4,0,konsolosluk.gov.tr,"(alcohol and tobacco, education, shopping)",,,,konsolosluk.gov.tr,"(alcohol and tobacco, education, shopping)"
8332239,1079563004,1079563000,512,2018-10-21 22:25:45,2018-10-21 22:25:59,14,0,ancestry.de,"(business, education)",,search.,,ancestry.de,"(business, education)"
4490825,1025450272,1025450268,245,2018-10-03 18:55:39,2018-10-03 18:55:42,3,0,netflix.com,"(entertainment, streaming-media)",,,,netflix.com,"(entertainment, streaming-media)"
3249734,1088727884,1088727880,1591,2018-10-26 17:25:01,2018-10-26 17:25:57,56,44,bs.to,"(streaming-media,)",,,,bs.to,"(streaming-media,)"
2215773,1028760220,1028760216,1458,2018-10-04 17:32:55,2018-10-04 17:32:59,4,0,google.com,"(search-engines and portals,)",,,,google.com,"(search-engines and portals,)"
2126825,1016684080,1016684076,1293,2018-10-02 18:37:45,2018-10-02 18:37:51,6,0,google.com,"(search-engines and portals,)",,,,google.com,"(search-engines and portals,)"


In [40]:
urls_with_subdomains_gap.assign(
    category_names_top = lambda df: df.category_names_top.apply(lambda t: None if pd.isnull(t) else ','.join(t)),
    category_names_sub = lambda df: df.category_names_sub.apply(lambda t: None if pd.isnull(t) else ','.join(t)),
    category_names     = lambda df: df.category_names.apply(lambda t: None if pd.isnull(t) else ','.join(t)),
    ) \
    .to_csv(data_directory + "/pre_processed/browsing_with_gap.csv", index=False)