In [1]:
import pandas as pd

# data ingestion

## Function definitions

In [28]:
# pd.read_excel: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html
# pd.DatetimeIndex: https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.html
# pd.DataFrame.asfreq: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.asfreq.html

# Name of the column that holds the site hostname; get_hostname_data filters on it.
HOSTNAME_FIELD = 'Hostname'

def pull_raw_data(file_location):
    """
    Read every worksheet of an Excel workbook.

    :param file_location: location of the workbook, passed straight to ``pd.read_excel``
    :return: whatever ``pd.read_excel`` returns for ``sheet_name=None``
        (per the pandas docs, a dict mapping sheet name to DataFrame)
    """
    # sheet_name=None asks pandas for all sheets rather than only the first one.
    return pd.read_excel(file_location, sheet_name=None)

def get_region_data(region, file_location):
    """
    Load one region's worksheet as a date-indexed frame sorted oldest-first.

    :param region: name of the worksheet to select. Sheet names listed per workbook:
        NB GA Data 1:
            NB | GA | US Data
            NB | GA | CA Data
            NB | GA | AU Data
            NB | GA | NZ Data
            NB | GA | JNBO Data
        NB GA Data 2:
            NB | GA | EU + UK
            NB | GA | TW
            NB | GA | HK
            NB | GA | MY
            NB | GA | SG
    :param file_location: file path of the workbook to read
    :return: the sheet with its 'Date' column parsed and moved into an unnamed
        DatetimeIndex, rows sorted by ascending date
    """
    workbook = pull_raw_data(file_location)
    # Shallow copy so the column assignment below acts on our own frame object.
    sheet = workbook[region].copy(deep=False)
    sheet['Date'] = pd.to_datetime(sheet['Date'])
    indexed = sheet.set_index('Date')
    dates = indexed.index
    # Rebuilding the index from its raw values drops the 'Date' index name and
    # attaches whatever frequency pandas infers from the (still unsorted) dates.
    indexed.index = pd.DatetimeIndex(dates.values, freq=dates.inferred_freq)
    return indexed.sort_index()

def describe_region_data(region_data):
    """
    Summarise a region frame: distinct devices, channels, hostnames and date span.

    :param region_data: frame with 'Device Category', 'Default Channel Grouping'
        and hostname (HOSTNAME_FIELD) columns; its index supplies the date span
    :return: pd.Series with keys 'devices', 'channels', 'hostnames' (comma-joined
        distinct values in order of first appearance), 'start_date' and
        'end_date' (index minimum and maximum)
    """
    def _join_unique(column):
        # Missing values are skipped and everything is cast to str, because
        # str.join raises TypeError on NaN/None or any other non-string item.
        distinct = region_data[column].dropna().unique()
        return ','.join(map(str, distinct))

    return pd.Series({
        'devices': _join_unique('Device Category'),
        'channels': _join_unique('Default Channel Grouping'),
        # Same constant get_hostname_data filters on, so the two stay in sync.
        'hostnames': _join_unique(HOSTNAME_FIELD),
        'start_date': region_data.index.min(),
        'end_date': region_data.index.max(),
    })

def get_hostname_data(hostname, region_data):
    """
    Keep only the rows of a region frame that belong to one hostname.

    :param hostname: value to match against the HOSTNAME_FIELD column
    :param region_data: frame containing a HOSTNAME_FIELD column
    :return: the matching rows, in their original order
    """
    is_requested_host = region_data[HOSTNAME_FIELD].eq(hostname)
    return region_data.loc[is_requested_host]

def apply_index_freq(data, freq):
    """
    Conform a time-indexed frame to a fixed frequency.

    :param data: frame (or series) to pass through ``asfreq``
    :param freq: frequency passed to ``asfreq``, e.g. 'MS' for month start
    :return: result of ``data.asfreq(freq)``
    """
    return data.asfreq(freq=freq)

def aggregate_daily_data(data):
    """
    Sum Sessions and Pageviews per calendar month.

    Each row is assigned to the first day of the month of its index timestamp
    and the two metrics are summed within each month.

    :param data: frame with a datetime-like index (must expose ``.year`` and
        ``.month``) and 'Sessions' and 'Pageviews' columns; it is not modified
    :return: frame indexed by month-start timestamps (index named 'Date', sorted
        ascending) with summed 'Sessions' and 'Pageviews' columns
    """
    stamps = data.index
    # Build the month-start key outside the frame. Adding helper columns to a
    # shallow copy could write through to the caller's data, and a 'Date' column
    # next to an index already named 'Date' makes groupby('Date') ambiguous.
    parts = pd.DataFrame({'year': stamps.year, 'month': stamps.month, 'day': 1})
    month_start = pd.DatetimeIndex(pd.to_datetime(parts), name='Date')
    return data.groupby(month_start).agg({'Sessions': 'sum', 'Pageviews': 'sum'})

## test

In [29]:
# Load the US sheet from workbook 1 and summarise its devices, channels,
# hostnames and date span.
# NOTE(review): absolute path; consider a configurable data directory.
ts_file = '/opt/notebooks/datasets/NB_GA_Data_1.xlsx'
region_data = get_region_data('NB | GA | US Data', ts_file)
describe_region_data(region_data)

devices                                   desktop,mobile,tablet
channels      (Other),Direct,Display,Organic Search,Paid Sea...
hostnames     www.newbalance.com,www.nbls.cc,www.newbalance....
start_date                                  2016-11-30 00:00:00
end_date                                    2022-12-31 00:00:00
dtype: object

In [30]:
# Keep only www.newbalance.com rows, sum Sessions/Pageviews per month, then
# conform the result to month-start ('MS') frequency.
newbalancecom = get_hostname_data('www.newbalance.com', region_data)
newbalancecom_month = aggregate_daily_data(newbalancecom)
newbalancecom_month = apply_index_freq(newbalancecom_month, 'MS')
newbalancecom_month

Unnamed: 0_level_0,Sessions,Pageviews
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-11-01,141300,855724
2016-12-01,3378004,23999244
2017-01-01,3360180,21827590
2017-02-01,3269727,22854333
2017-03-01,3927370,26801491
...,...,...
2022-08-01,10958689,37023290
2022-09-01,9481649,31645979
2022-10-01,12059381,38057960
2022-11-01,13982669,46661192


In [31]:
# Load the CA sheet from workbook 1 and summarise it.
# NOTE(review): this rebinds `region_data`, so cells that used the previous
# region's frame must be re-run in order to reproduce their outputs.
ts_file = '/opt/notebooks/datasets/NB_GA_Data_1.xlsx'
region_data = get_region_data('NB | GA | CA Data', ts_file)
describe_region_data(region_data)

devices                                   desktop,mobile,tablet
channels      Direct,Organic Search,Paid Search,Referral,Soc...
hostnames     www.newbalance.ca,(not set),www.bigdata-domain...
start_date                                  2016-11-30 00:00:00
end_date                                    2022-12-31 00:00:00
dtype: object

In [32]:
# Keep only www.newbalance.ca rows, sum Sessions/Pageviews per month, then
# conform the result to month-start ('MS') frequency.
newbalanceca = get_hostname_data('www.newbalance.ca', region_data)
newbalanceca_month = aggregate_daily_data(newbalanceca)
newbalanceca_month = apply_index_freq(newbalanceca_month, 'MS')
newbalanceca_month

Unnamed: 0_level_0,Sessions,Pageviews
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-11-01,4002,31120
2016-12-01,107499,945851
2017-01-01,142985,1162558
2017-02-01,144891,1214112
2017-03-01,239750,1808862
...,...,...
2022-08-01,1121613,4362433
2022-09-01,1104203,4355085
2022-10-01,1121899,4191353
2022-11-01,1094205,4055384


In [33]:
# Load the EU + UK sheet from workbook 2 and summarise it.
# NOTE(review): this rebinds `region_data`, so cells that used the previous
# region's frame must be re-run in order to reproduce their outputs.
ts_file = '/opt/notebooks/datasets/NB_GA_Data_2.xlsx'
region_data = get_region_data('NB | GA | EU + UK', ts_file)
describe_region_data(region_data)

devices                                   desktop,mobile,tablet
channels      Email,Paid Search,Organic Search,Referral,Dire...
hostnames     at.newbalance.eu,www.newbalance.fr,www.newbala...
start_date                                  2018-09-25 00:00:00
end_date                                    2022-12-31 00:00:00
dtype: object

In [34]:
# Keep only www.newbalance.fr rows, sum Sessions/Pageviews per month, then
# conform the result to month-start ('MS') frequency.
newbalancefr = get_hostname_data('www.newbalance.fr', region_data)
newbalancefr_month = aggregate_daily_data(newbalancefr)
newbalancefr_month = apply_index_freq(newbalancefr_month, 'MS')
newbalancefr_month

Unnamed: 0_level_0,Sessions,Pageviews
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-01,67388,300965
2018-10-01,289161,1467829
2018-11-01,395531,1932762
2018-12-01,292351,1402871
2019-01-01,437445,2182022
2019-02-01,304855,1546504
2019-03-01,345297,1967805
2019-04-01,427892,2198442
2019-05-01,455582,2321955
2019-06-01,413931,2237647
