In [1]:
import pandas as pd
import multiprocessing as mp
import glob
from functools import partial, reduce
from functional import seq
import seaborn as sns

In [2]:
import numpy as np

# Load data

In [3]:
data_directory = "/bigdata/web_tracking/data/release/"

In [4]:
users = pd.read_csv(data_directory + '/raw/users.csv')

In [5]:
urls = pd.read_csv(
    data_directory + '/raw/browsing.csv', dtype={'id': int, 'web_visits_id': int}, parse_dates=['used_at'])

In [6]:
domain_categories = pd.read_csv(
    data_directory + '/raw/domain_categories.csv', converters={'category': lambda x: tuple(sorted(x.split(',')))})

# Defining the category of a website

Cleaning up a bit the data:

In [7]:
words_dashed = ['black', 'content', 'illegal', 'information', 'job', 'media', 
                'message', 'filter', 'real', 'search', 'social', 'streaming', 'virtual']

def put_dash(words_dashed, text):
    temp = list(filter(lambda w: text.startswith(w), words_dashed))
    return text if len(temp) < 1 else text.replace(temp[0], f'{temp[0]}-')


words_replaced = [('food', 'food and recipes'), ('filteravoidance', 'proxy and filter-avoidance'), 
                  ('translation', 'translators'), ('sport', 'sports')]

def replace(words_replaced, text):
    temp = list(filter(lambda p: p[0] in text, words_replaced))
    return text if len(temp) < 1 else temp[0][1]
    

dash = partial(put_dash, words_dashed)
words = partial(replace, words_replaced)

categories = pd.DataFrame(seq(domain_categories.category).reduce(lambda acc, i: acc.union( set(i)), set()), columns=['raw']) \
                .sort_values('raw') \
                .assign(name=lambda df: df.raw.apply(lambda x: words(dash(x.replace('and', ' and '))))) \
                .assign(id=lambda df: (df.name != df.name.shift()).cumsum())

#### Add Category Manually

We will add two new categories: email and productivity.

In [8]:
categories = categories.append(
    pd.DataFrame([(None, 'email', 0), (None, 'productivity', 0)], 
                 columns=['raw', 'name', 'id'])).sort_values(['name', 'raw']).assign(
    id=lambda df: (df.name != df.name.shift()).cumsum())

Let's export this:

In [9]:
categories.to_csv(data_directory + '/pre_processed/categories.csv', index=False)

#### Rebuild Domain Categories

- There was domain-category_ids match, now it's domain-category_names

In [10]:
def find_category_names(categories, t):
    return tuple(map(lambda c: categories.query(f"raw == '{c}'").iloc[0, 1], t))

category = partial(find_category_names, categories)
domain_categories = domain_categories.assign(category_names=domain_categories.category.apply(category))

Exporting the data:

In [11]:
domain_categories_processed = domain_categories[['domain', 'category_names']].assign(
    category_names=lambda df: df.category_names.apply(lambda l: ','.join(l)))

domain_categories_processed.to_csv(data_directory + '/pre_processed/processed_domain_categories.csv', index=False)

# Adding category info related to the subdomains

In [12]:
subdomains = urls[~urls.subdomain.isna()][["id", "url", "domain", "subdomain"]].copy()
subdomains = subdomains.merge(domain_categories_processed, on='domain')

### Examining 'mail' subdomains

1. Large companies which provide email services such as Google, Yahoo, T-Online, Vodafoen, GMX, Web.de.
2. Universities in Germany provide email services.
3. Other subdomains are seen in websites for the business purposes, i.e. subscription, login to buy, etc.

In [13]:
def get_subdomains(df, main, sub): 
    return df[(df.domain == main) & (df.subdomain.str.contains(sub))]

Let's see the ones that contains `mail`:

In [14]:
mail_subdomains = subdomains[subdomains.subdomain.str.contains('mail')]
top_mail_subdomains = mail_subdomains.groupby('domain', as_index=False).agg({'url': 'count'}).nlargest(400, 'url')
top_mail_subdomains.to_csv(data_directory + '/pre_processed/domains_containing_mail_raw.csv', index=False)

Next, we manually curated this .csv in order to generate a new file named `domains_contain_mail.csv`.

The method: 
- Look at the top domains with subdomain that has _mail_ string.
- If sample urls belong the domain seem meaningful urls, then check domain as email.
- Totally, 400 domain were evaluated manually.

### Creating subdomains of different types

* Emails Domains

In [15]:
urls_email = pd.concat(
    [
        subdomains[(subdomains.subdomain.str.endswith('mail.'))]
            .merge(pd.read_csv(data_directory + '/pre_processed/domains_containing_mail.csv', 
                               usecols=['domain', 'is_email']).query('is_email == 1'), on='domain')
            .drop(columns=['is_email']),
        get_subdomains(subdomains, 'google.com', 'accounts.'),
        get_subdomains(subdomains, 'web.de', 'navigator.'),
        get_subdomains(subdomains, 'yahoo.com', 'login.'),
        get_subdomains(subdomains, 'live.com', 'outlook.'),
        get_subdomains(subdomains, 'live.com', 'login.'),
        get_subdomains(subdomains, 'live.com', 'account.'),
        get_subdomains(subdomains, 'live.com', 'signup.'),
        get_subdomains(subdomains, 'gmx.net', 'navigator.'),
        get_subdomains(subdomains, 'aol.com', 'login.'),
    ],
    sort=False
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['email']))
).drop_duplicates(['id'])

In [16]:
urls_email.shape

(970034, 6)

* Productivity domains

In [17]:
urls_productivity = pd.concat([
    get_subdomains(subdomains, 'google.com', 'docs.'),
    get_subdomains(subdomains, 'google.com', 'calendar.'),
    get_subdomains(subdomains, 'google.com', 'office.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['productivity']))
).drop_duplicates(['id'])

urls_productivity.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
308652,1067791239,1051031,google.com,calendar.,"(productivity,)",calendar.google.com
428968,1064471468,66150,google.com,docs.,"(productivity,)",docs.google.com
288104,1064202699,96918,google.com,docs.,"(productivity,)",docs.google.com
286383,1027171508,1089930,google.com,calendar.,"(productivity,)",calendar.google.com
544224,1090594048,1188474,google.com,docs.,"(productivity,)",docs.google.com


In [18]:
urls_productivity.shape

(25244, 6)

* Media sharing domains

In [19]:
urls_media_sharing = pd.concat([
    get_subdomains(subdomains, 'google.com', 'photos.'),
    get_subdomains(subdomains, 'google.com', 'drive.'),
    get_subdomains(subdomains, 'live.com', 'onedrive.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['media-sharing']))
).drop_duplicates(['id'])

urls_media_sharing.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
297006,1028840692,1153847,google.com,photos.,"(media-sharing,)",photos.google.com
436139,1066398496,2000716,google.com,photos.,"(media-sharing,)",photos.google.com
31205,1049295811,1439,live.com,onedrive.,"(media-sharing,)",onedrive.live.com
290405,1064738463,1121144,google.com,photos.,"(media-sharing,)",photos.google.com
511548,1080170627,2479906,google.com,photos.,"(media-sharing,)",photos.google.com


In [20]:
urls_media_sharing.shape

(15375, 6)

* Social networking domains

In [21]:
urls_social_networking = pd.concat([
    get_subdomains(subdomains, 'google.com', 'plus.'),
    get_subdomains(subdomains, 'yahoo.com', 'drive.'),
    get_subdomains(subdomains, 'vodafone.de', 'groups.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['social-networking']))
).drop_duplicates(['id'])

urls_social_networking.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
155910,1037177887,47583,google.com,plus.,"(social-networking,)",plus.google.com
320918,1074926391,294048,google.com,plus.,"(social-networking,)",plus.google.com
417045,1061936516,294048,google.com,plus.,"(social-networking,)",plus.google.com
357595,1043541864,15102,google.com,plus.,"(social-networking,)",plus.google.com
168243,1041417979,270266,google.com,plus.,"(social-networking,)",plus.google.com


In [22]:
urls_social_networking.shape

(2285, 6)

* News domains

In [23]:
urls_news = pd.concat([
    get_subdomains(subdomains, 'google.com', 'news.'),
    get_subdomains(subdomains, 'vodafone.de', 'magazin.'),
    get_subdomains(subdomains, 'vodafone.de', 'x.enews.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['news']))
).drop_duplicates(['id'])

urls_news.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
565595,1094163704,2828095,google.com,news.,"(news,)",news.google.com
483045,1071245947,25409,google.com,news.,"(news,)",news.google.com
205976,1050720735,144882,google.com,news.,"(news,)",news.google.com
361825,1044393412,25409,google.com,news.,"(news,)",news.google.com
360492,1090280939,144856,google.com,news.,"(news,)",news.google.com


In [24]:
urls_news.shape

(6165, 6)

* Translators domains

In [25]:
urls_translators = pd.concat([
    get_subdomains(subdomains, 'google.com', 'translate.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['translators']))
).drop_duplicates(['id'])

urls_news.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
498213,1081423856,25409,google.com,news.,"(news,)",news.google.com
478829,1070053087,145185,google.com,news.,"(news,)",news.google.com
389351,1049137140,1746908,google.com,news.,"(news,)",news.google.com
457873,1072738836,145185,google.com,news.,"(news,)",news.google.com
140745,1034888568,25409,google.com,news.,"(news,)",news.google.com


In [26]:
urls_translators.shape

(28106, 6)

* Shopping domains

In [27]:
urls_shopping = pd.concat([
    get_subdomains(subdomains, 'google.com', 'play.'),
],
    sort=False 
) \
    .assign(
        sub_level_domain=lambda df: df.subdomain + df.domain,
        category_names=lambda df: df.category_names.apply(
            lambda x: tuple(['shopping']))
).drop_duplicates(['id'])

urls_shopping.sample(5)

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
147075,1037406100,97484,google.com,play.,"(shopping,)",play.google.com
146057,1037062436,88344,google.com,play.,"(shopping,)",play.google.com
332989,1076723507,1406284,google.com,play.,"(shopping,)",play.google.com
389543,1049200112,1747514,google.com,play.,"(shopping,)",play.google.com
389650,1049198428,1747700,google.com,play.,"(shopping,)",play.google.com


In [28]:
urls_shopping.shape

(3206, 6)

# Exporting data

Creating a new browsing history file:

In [29]:
urls_with_categories = urls.merge(domain_categories, on='domain').rename(columns={'domain': 'top_level_domain'})

In [30]:
urls_with_categories.head()

Unnamed: 0,web_visits_id,id,panelist_id,url,used_at,active_seconds,top_level_domain,subdomain,category,category_names
0,111761051,1033436752,1421,0,2018-10-05 22:02:38,4,ebesucher.de,,"(business, education)","(business, education)"
1,111761051,1033436580,1421,0,2018-10-05 21:41:22,10,ebesucher.de,,"(business, education)","(business, education)"
2,111761051,1033436588,1421,0,2018-10-05 21:45:36,5,ebesucher.de,,"(business, education)","(business, education)"
3,111761051,1033436584,1421,0,2018-10-05 21:42:22,181,ebesucher.de,,"(business, education)","(business, education)"
4,111761159,1033437168,1421,0,2018-10-05 23:48:14,6,ebesucher.de,,"(business, education)","(business, education)"


In [31]:
urls_sublevel = pd.concat([
    urls_email,
    urls_productivity,
    urls_media_sharing,
    urls_social_networking,
    urls_news,
    urls_translators,
    urls_shopping
], sort=False).drop_duplicates(['id'])

urls_sublevel.shape

(1050415, 6)

In [32]:
urls_sublevel.head()

Unnamed: 0,id,url,domain,subdomain,category_names,sub_level_domain
0,1034500040,28061,live.com,bay174.mail.,"(email,)",bay174.mail.live.com
1,1037376788,97009,live.com,dub125.mail.,"(email,)",dub125.mail.live.com
2,1037377304,97009,live.com,dub125.mail.,"(email,)",dub125.mail.live.com
3,1035452643,134272,live.com,dub118.mail.,"(email,)",dub118.mail.live.com
4,1036819471,163690,live.com,dub129.mail.,"(email,)",dub129.mail.live.com


In [33]:
urls_with_subdomains = urls_with_categories.merge(
    urls_sublevel[['id', 'sub_level_domain', 'category_names']],
    on='id',
    how='left',
    suffixes=('_top', '_sub')
)

In [34]:
urls_with_subdomains = urls_with_subdomains.assign(
                domain = np.where(urls_with_subdomains.sub_level_domain.notnull(), 
                                  urls_with_subdomains.sub_level_domain, 
                                  urls_with_subdomains.top_level_domain),
                category_names = np.where(urls_with_subdomains.category_names_sub.notnull(), 
                                          urls_with_subdomains.category_names_sub, 
                                          urls_with_subdomains.category_names_top))

In [35]:
urls_with_subdomains.sample(5)

Unnamed: 0,web_visits_id,id,panelist_id,url,used_at,active_seconds,top_level_domain,subdomain,category,category_names_top,sub_level_domain,category_names_sub,domain,category_names
1135540,122531536,1086236912,1071,2585864,2018-10-26 09:19:54,4,google.de,,"(searchenginesandportals,)","(search-engines and portals,)",,,google.de,"(search-engines and portals,)"
7934863,114316088,1013780988,1410,449348,2018-10-01 17:28:01,4,tagoria.de,,"(games,)","(games,)",,,tagoria.de,"(games,)"
8019634,122874240,1089306732,2126,110265,2018-10-26 21:57:52,2,sonderspiele.net,,"(business, gambling, games)","(business, gambling, games)",,,sonderspiele.net,"(business, gambling, games)"
1150677,123254088,1092671256,1203,2706105,2018-10-27 10:53:06,2,google.de,,"(searchenginesandportals,)","(search-engines and portals,)",,,google.de,"(search-engines and portals,)"
3523777,117221548,1044548388,199,40137,2018-10-10 20:18:27,2,t-online.de,email.,"(searchenginesandportals,)","(search-engines and portals,)",email.t-online.de,"(email,)",email.t-online.de,"(email,)"


##### Constructing new Time Difference DataFrame of Consecutive Records per User

In [36]:
def construct_consecutive_url_dataframe(df):
    df = df.sort_values(['panelist_id', 'used_at'])
    df = df.assign(
            prev_id = df.id.shift().apply(lambda x: 0 if np.isnan(x) else int(x)) * (df.panelist_id == df.panelist_id.shift()),
            left_at = df.used_at + df.active_seconds.apply(lambda x: pd.Timedelta(seconds=x)),
        )
    df = df.assign(
            gap_seconds = (df.used_at - df.left_at.shift()).dt.total_seconds().apply(lambda x: 0 if np.isnan(x) else int(x)) * (df.panelist_id == df.panelist_id.shift())
                )[['id', 'prev_id', 'panelist_id', 'used_at', 'left_at', 'active_seconds', 'gap_seconds', 'top_level_domain', 'category_names_top',
        'sub_level_domain', 'subdomain', 'category_names_sub', 'domain',
        'category_names']]
    return df

In [37]:
urls_with_subdomains_gap = construct_consecutive_url_dataframe(urls_with_subdomains)

In [38]:
len(urls_with_subdomains_gap)

9151243

In [39]:
urls_with_subdomains_gap.sample(10)

Unnamed: 0,id,prev_id,panelist_id,used_at,left_at,active_seconds,gap_seconds,top_level_domain,category_names_top,sub_level_domain,subdomain,category_names_sub,domain,category_names
5032992,1097540212,1097540208,456,2018-10-29 22:55:31,2018-10-29 22:55:37,6,0,mydealz.de,"(shopping,)",,,,mydealz.de,"(shopping,)"
765325,1017545964,1017545960,399,2018-10-02 10:47:21,2018-10-02 10:48:48,87,0,paypal.com,"(business, economy and finance)",,,,paypal.com,"(business, economy and finance)"
7623645,1028141356,1028141352,533,2018-10-03 21:04:35,2018-10-03 21:04:51,16,0,vova.com,"(business,)",,,,vova.com,"(business,)"
9120573,1088864512,1088864508,1371,2018-10-26 12:08:54,2018-10-26 12:09:06,12,4,mailerlite.com,"(business,)",,app.,,mailerlite.com,"(business,)"
3382121,1053024320,1053024316,1641,2018-10-14 08:29:51,2018-10-14 08:30:03,12,0,ebay-kleinanzeigen.de,"(business, shopping)",,,,ebay-kleinanzeigen.de,"(business, shopping)"
1575051,1089622735,1089622731,948,2018-10-30 21:55:53,2018-10-30 21:55:59,6,0,facebook.com,"(social-networking,)",,,,facebook.com,"(social-networking,)"
5653824,1083206767,1083206763,1551,2018-10-25 14:42:18,2018-10-25 14:42:20,2,0,euros-4-mails.de,"(business,)",,,,euros-4-mails.de,"(business,)"
2202541,1027126004,1027126000,2030,2018-10-03 02:28:38,2018-10-03 02:28:40,2,0,google.com,"(search-engines and portals,)",,,,google.com,"(search-engines and portals,)"
8550365,1068361172,1068361168,1165,2018-10-17 20:45:48,2018-10-17 20:45:50,2,0,telefon-treff.de,"(business, education, information-tech)",,,,telefon-treff.de,"(business, education, information-tech)"
293185,1083928279,1083928275,1616,2018-10-25 13:29:17,2018-10-25 13:29:51,34,0,amazon.de,"(shopping,)",,smile.,,amazon.de,"(shopping,)"


In [40]:
urls_with_subdomains_gap.assign(
    category_names_top = lambda df: df.category_names_top.apply(lambda t: None if pd.isnull(t) else ','.join(t)),
    category_names_sub = lambda df: df.category_names_sub.apply(lambda t: None if pd.isnull(t) else ','.join(t)),
    category_names     = lambda df: df.category_names.apply(lambda t: None if pd.isnull(t) else ','.join(t)),
    ) \
    .to_csv(data_directory + "/pre_processed/browsing_with_gap.csv", index=False)