In [None]:
# Draw matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Ask the inline backend for 'retina' (high-DPI) figure output.
%config InlineBackend.figure_format = 'retina'

In [None]:
import pandas as pd

In [None]:
from web_tracking.pre_processing.raw_data_processor import RawDataProcessor

# Load data

In [None]:
import os

# Root folder of the data release. The original cluster location remains the
# default, so behavior is unchanged unless WEB_TRACKING_DATA_DIR is set; the
# override lets the notebook run against a copy of the data stored elsewhere.
# The cells below append sub-paths that already start with '/'.
data_directory = os.environ.get("WEB_TRACKING_DATA_DIR", "/bigdata/web_tracking/data/release/")

Users

In [None]:
# Panelist table. panelist_id is parsed as int; it is the key used further
# down to keep only the selected users.
users = pd.read_csv(
    data_directory + '/raw/users.csv',
    dtype={'panelist_id': int},
)

Web browsing behavior

In [None]:
# Pre-processed browsing log; the notebook counts its rows as website visits
# and reads the `top_level_domain` column from it.
processed = pd.read_csv(
    data_directory + "/pre_processed/browsing_with_gap.csv"
)

List of users having enough data

In [None]:
# Users judged to have enough data; the `user` column is matched against
# users.panelist_id in the "With filtering" section.
selected_users = pd.read_csv(
    data_directory + '/pre_processed/selected_users.csv'
)

# Basic statistics

Number of individuals

In [None]:
# Row count of the panelist table (one row per individual, per the heading above).
users.shape[0]

Number of total website visits

In [None]:
# Row count of the browsing table, i.e. the total number of recorded visits.
processed.shape[0]

Number of unique domains 

In [None]:
# Distinct values in top_level_domain. dropna=False keeps a missing value
# (if any) in the count, exactly as len(<series>.unique()) does.
processed['top_level_domain'].nunique(dropna=False)

# Comparing the top domains in our sample with Alexa

First, let's analyze our sample. How similar are the top domains in our sample to the top domains reported by Alexa?

- According to Alexa: "[If a] website has a Reach score of 5,000, this means that 5,000 people out of a sample of 1 million [visited it]"[¹](https://www.klipfolio.com/metrics/marketing/reach-per-million-users). Reach is calculated as `Count Unique(People who saw your content per Day per Million Users)`.
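- So, to compare our dataset with Alexa, we computed a per-domain popularity measure from our sample. Note that the cell below uses each domain's share of all visits (normalized visit counts) rather than a count of unique visitors.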
- Section 4 in the article.

In [None]:
# Per-domain share of all visits, most visited first, with a 1-based rank.
#
# The axis/column names are set explicitly instead of renaming whatever
# reset_index() happens to produce: pandas < 2.0 yields the columns
# 'index' / 'top_level_domain', while pandas >= 2.0 yields
# 'top_level_domain' / 'proportion'. Renaming by the old names silently
# leaves the frame without a 'domain' column on newer pandas.
top_domains = (
    processed['top_level_domain']
    .value_counts(normalize=True)       # sorted by descending share
    .rename_axis('domain')              # index (the domain names) -> 'domain'
    .reset_index(name='visits')         # values (the shares) -> 'visits'
    .assign(rank=lambda df: range(1, len(df) + 1))
)

In [None]:
# Alexa top-sites export (presumably the German top 5000, judging by the file
# name). The long 'X1...' column is dropped right after loading because
# nothing below reads it.
alexa_top_5K = (
    pd.read_csv('../../../alexa_GER_top5000.csv')
    .drop(columns=['X1.nrow.file.Results.Result.Alexa.TopSites.Country.Sites.Site.'])
)

Matching the domains 

In [None]:
# Inner join: keep only the domains present both in our sample ('domain')
# and in the Alexa list ('url').
top_domains_5K = pd.merge(
    top_domains,
    alexa_top_5K,
    how='inner',
    left_on='domain',
    right_on='url',
)

# Cell output: number of matched rows, shown against the nominal 5000 Alexa entries.
f'{len(top_domains_5K)} / 5000'

Pearson correlation:

In [None]:
# 2x2 Pearson correlation matrix between our visit share and Alexa's
# page views per million, computed over the matched domains only.
top_domains_5K.loc[:, ['visits', 'PageViews.PerMillion']].corr(method='pearson')

# Demographic statistics

### Without filtering

Gender distribution

In [None]:
# Percentage share of each gender among users with a recorded gender.
# Vectorised Series methods replace the element-wise lambdas and
# DataFrame.applymap, which is deprecated since pandas 2.1.
temp = (
    users['gender']
    .dropna()                        # ignore users without a gender entry
    .astype(str)
    .str.title()                     # normalise capitalisation of the labels
    .value_counts(normalize=True)
    .mul(100)                        # fraction -> percent
    .round(2)
    .to_frame()
)

ax = temp.plot.bar(legend=False, title='Gender Distribution', ylim=(0, 55), xlim=(0, 100))
ax.set_ylabel("%")
ax.tick_params(axis='x', labelrotation=0)
ax.grid(color='lightgrey', linestyle='--', linewidth=.75, alpha=.3)
ax.set_axisbelow(True)  # keep the grid behind the bars

# Print each bar's value just above it, a quarter bar-width in from its left edge.
for bar in ax.patches:
    ax.annotate(f'{bar.get_height():.2f}%', (bar.get_x() + bar.get_width() / 4., bar.get_height() + .5))

Age distribution

In [None]:
# Percentage share of each age group among users with a recorded age group.
# Series.mul replaces DataFrame.applymap (deprecated since pandas 2.1); the
# chain is split over several lines so each step can be read on its own.
temp = (
    users['age_recode']
    .dropna()                        # ignore users without an age group
    .astype(str)
    .value_counts(normalize=True)
    .mul(100)                        # fraction -> percent
    .to_frame()
    # Relabel the half-open interval strings as closed integer ranges.
    # Missing values were dropped above, so the 'nan' entry is only a safeguard.
    .rename(index={
        '(24,34]': '[25,34]',
        '(34,44]': '[35,44]',
        '(44,54]': '[45,54]',
        '(54,64]': '[55,64]',
        '(64,80]': '[65,80]',
        'nan': 'n/a',
    })
    .sort_index()                    # order the bars by age-group label
)
ax = temp.plot.bar(legend=False, title='Age  Distribution', color='darkturquoise', ylim=(0, 25))
ax.set_ylabel("%")
ax.set_xlabel("Age Groups")
ax.grid(color='lightgrey', linestyle='--', linewidth=.75, alpha=.3)
ax.set_axisbelow(True)  # keep the grid behind the bars
ax.tick_params(axis='x', labelrotation=0)

# Print each bar's value just above it, a quarter bar-width in from its left edge.
for bar in ax.patches:
    ax.annotate(f'{bar.get_height():.2f}%', (bar.get_x() + bar.get_width() / 4., bar.get_height() + .5))

### With filtering

In [None]:
# Keep only panelists whose id appears in selected_users['user'].
# .copy() gives an independent frame rather than a view-like slice of `users`.
users_filtered = users.loc[
    users['panelist_id'].isin(selected_users['user'].values)
].copy()

Number of users

In [None]:
# Row count of the filtered panelist table.
users_filtered.shape[0]

Gender distribution

In [None]:
# Percentage share of each gender among the selected users with a recorded
# gender. Vectorised Series methods replace the element-wise lambdas and
# DataFrame.applymap, which is deprecated since pandas 2.1.
temp = (
    users_filtered['gender']
    .dropna()                        # ignore users without a gender entry
    .astype(str)
    .str.title()                     # normalise capitalisation of the labels
    .value_counts(normalize=True)
    .mul(100)                        # fraction -> percent
    .round(2)
    .to_frame()
)
ax = temp.plot.bar(legend=False, title='Gender Distribution', ylim=(0, 55), xlim=(0, 100))
ax.set_ylabel("%")
ax.tick_params(axis='x', labelrotation=0)
ax.grid(color='lightgrey', linestyle='--', linewidth=.75, alpha=.3)
ax.set_axisbelow(True)  # keep the grid behind the bars

# Print each bar's value just above it, a quarter bar-width in from its left edge.
for bar in ax.patches:
    ax.annotate(f'{bar.get_height():.2f}%', (bar.get_x() + bar.get_width() / 4., bar.get_height() + .5))

Age distribution

In [None]:
# Percentage share of each age group among the selected users with a recorded
# age group. Series.mul replaces DataFrame.applymap (deprecated since pandas
# 2.1); the chain is split over several lines so each step can be read on its own.
temp = (
    users_filtered['age_recode']
    .dropna()                        # ignore users without an age group
    .astype(str)
    .value_counts(normalize=True)
    .mul(100)                        # fraction -> percent
    .to_frame()
    # Relabel the half-open interval strings as closed integer ranges.
    # Missing values were dropped above, so the 'nan' entry is only a safeguard.
    .rename(index={
        '(24,34]': '[25,34]',
        '(34,44]': '[35,44]',
        '(44,54]': '[45,54]',
        '(54,64]': '[55,64]',
        '(64,80]': '[65,80]',
        'nan': 'n/a',
    })
    .sort_index()                    # order the bars by age-group label
)
ax = temp.plot.bar(legend=False, title='Age  Distribution', color='darkturquoise', ylim=(0, 27))
ax.set_ylabel("%")
ax.set_xlabel("Age Groups")
ax.grid(color='lightgrey', linestyle='--', linewidth=.75, alpha=.3)
ax.set_axisbelow(True)  # keep the grid behind the bars
ax.tick_params(axis='x', labelrotation=0)

# Print each bar's value just above it, a quarter bar-width in from its left edge.
for bar in ax.patches:
    ax.annotate(f'{bar.get_height():.2f}%', (bar.get_x() + bar.get_width() / 4., bar.get_height() + .5))