In [1]:
import pandas as pd
import numpy as np
import os.path
import os
import argparse

import extractor
from feeder import VarFeeder
import numba
from typing import Tuple, Dict, List

In [2]:
def read_cached(name) -> pd.DataFrame:
    """
    Reads csv file (maybe zipped) from data directory and caches it's content as a pickled DataFrame
    :param name: file name without extension
    :return: file content
    """
    cached = 'data/%s.pkl' % name
    sources = ['data/%s.csv' % name, 'data/%s.csv.zip' % name]
    if os.path.exists(cached):
        return pd.read_pickle(cached)
    else:
        for src in sources:
            if os.path.exists(src):
                df = pd.read_csv(src)
                df.to_pickle(cached)
                return df

In [3]:
def read_all() -> pd.DataFrame:
    """
    Reads source data for training/prediction
    """
    def read_file(file):
        df = read_cached(file).set_index('Page')
        df.columns = df.columns.astype('M8[D]')
        return df

    # Path to cached data
    path = os.path.join('data', 'all.pkl')
    if os.path.exists(path):
        df = pd.read_pickle(path)
    else:
        # Official data
        df = read_file('train_2')
        # Scraped data
        scraped = read_file('2017-08-15_2017-09-11_new')
        # Update last two days by scraped data
        df[pd.Timestamp('2017-09-10')] = scraped['2017-09-10']
        df[pd.Timestamp('2017-09-11')] = scraped['2017-09-11']

        df = df.sort_index()
        # Cache result
        df.to_pickle(path)
    return df

In [4]:
# todo:remove
def make_holidays(tagged, start, end) -> pd.DataFrame:
    def read_df(lang):
        result = pd.read_pickle('data/holidays/%s.pkl' % lang)
        return result[~result.dw].resample('D').size().rename(lang)

    holidays = pd.DataFrame([read_df(lang) for lang in ['de', 'en', 'es', 'fr', 'ja', 'ru', 'zh']])
    holidays = holidays.loc[:, start:end].fillna(0)
    result =tagged[['country']].join(holidays, on='country').drop('country', axis=1).fillna(0).astype(np.int8)
    result.columns = pd.DatetimeIndex(result.columns.values)
    return result

In [5]:
def read_x(start, end) -> pd.DataFrame:
    """
    Gets source data from start to end date. Any date can be None
    """
    df = read_all()
    # User GoogleAnalitycsRoman has really bad data with huge traffic spikes in all incarnations.
    # Wikipedia banned him, we'll ban it too
    bad_roman = df.index.str.startswith("User:GoogleAnalitycsRoman")
    df = df[~bad_roman]
    if start and end:
        return df.loc[:, start:end]
    elif end:
        return df.loc[:, :end]
    else:
        return df

In [6]:
@numba.jit(nopython=True)
def single_autocorr(series, lag):
    """
    Autocorrelation for single data series
    :param series: traffic series
    :param lag: lag, days
    :return:
    """
    s1 = series[lag:]
    s2 = series[:-lag]
    ms1 = np.mean(s1)
    ms2 = np.mean(s2)
    ds1 = s1 - ms1
    ds2 = s2 - ms2
    divider = np.sqrt(np.sum(ds1 * ds1)) * np.sqrt(np.sum(ds2 * ds2))
    return np.sum(ds1 * ds2) / divider if divider != 0 else 0

In [7]:
@numba.jit(nopython=True)
def batch_autocorr(data, lag, starts, ends, threshold, backoffset=0):
    """
    Calculate autocorrelation for batch (many time series at once)
    :param data: Time series, shape [n_pages, n_days]
    :param lag: Autocorrelation lag
    :param starts: Start index for each series
    :param ends: End index for each series
    :param threshold: Minimum support (ratio of time series length to lag) to calculate meaningful autocorrelation.
    :param backoffset: Offset from the series end, days.
    :return: autocorrelation, shape [n_series]. If series is too short (support less than threshold),
    autocorrelation value is NaN
    """
    n_series = data.shape[0]
    n_days = data.shape[1]
    max_end = n_days - backoffset
    corr = np.empty(n_series, dtype=np.float64)
    support = np.empty(n_series, dtype=np.float64)
    for i in range(n_series):
        series = data[i]
        end = min(ends[i], max_end)
        real_len = end - starts[i]
        support[i] = real_len/lag
        if support[i] > threshold:
            series = series[starts[i]:end]
            c_365 = single_autocorr(series, lag)
            c_364 = single_autocorr(series, lag-1)
            c_366 = single_autocorr(series, lag+1)
            # Average value between exact lag and two nearest neighborhs for smoothness
            corr[i] = 0.5 * c_365 + 0.25 * c_364 + 0.25 * c_366
        else:
            corr[i] = np.NaN
    return corr #, support

In [8]:
@numba.jit(nopython=True)
def find_start_end(data: np.ndarray):
    """
    Calculates start and end of real traffic data. Start is an index of first non-zero, non-NaN value,
     end is index of last non-zero, non-NaN value
    :param data: Time series, shape [n_pages, n_days]
    :return:
    """
    n_pages = data.shape[0]
    n_days = data.shape[1]
    start_idx = np.full(n_pages, -1, dtype=np.int32)
    end_idx = np.full(n_pages, -1, dtype=np.int32)
    for page in range(n_pages):
        # scan from start to the end
        for day in range(n_days):
            if not np.isnan(data[page, day]) and data[page, day] > 0:
                start_idx[page] = day
                break
        # reverse scan, from end to start
        for day in range(n_days - 1, -1, -1):
            if not np.isnan(data[page, day]) and data[page, day] > 0:
                end_idx[page] = day
                break
    return start_idx, end_idx

In [9]:
def prepare_data(start, end, valid_threshold) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray]:
    """
    Reads source data, calculates start and end of each series, drops bad series, calculates log1p(series)
    :param start: start date of effective time interval, can be None to start from beginning
    :param end: end date of effective time interval, can be None to return all data
    :param valid_threshold: minimal ratio of series real length to entire (end-start) interval. Series dropped if
    ratio is less than threshold
    :return: tuple(log1p(series), nans, series start, series end)
    """
    df = read_x(start, end)
    starts, ends = find_start_end(df.values)
    # boolean mask for bad (too short) series
    page_mask = (ends - starts) / df.shape[1] < valid_threshold
    print("Masked %d pages from %d" % (page_mask.sum(), len(df)))
    inv_mask = ~page_mask
    df = df[inv_mask]
    nans = pd.isnull(df)
    return np.log1p(df.fillna(0)), nans, starts[inv_mask], ends[inv_mask]


In [10]:
def lag_indexes(begin, end) -> List[pd.Series]:
    """
    Calculates indexes for 3, 6, 9, 12 months backward lag for the given date range
    :param begin: start of date range
    :param end: end of date range
    :return: List of 4 Series, one for each lag. For each Series, index is date in range(begin, end), value is an index
     of target (lagged) date in a same Series. If target date is out of (begin,end) range, index is -1
    """
    dr = pd.date_range(begin, end)
    # key is date, value is day index
    base_index = pd.Series(np.arange(0, len(dr)), index=dr)

    def lag(offset):
        dates = dr - offset
        return pd.Series(data=base_index.loc[dates].fillna(-1).astype(np.int16).values, index=dr)

    return [lag(pd.DateOffset(months=m)) for m in (3, 6, 9, 12)]

In [11]:
def make_page_features(pages: np.ndarray) -> pd.DataFrame:
    """
    Calculates page features (site, country, agent, etc) from urls
    :param pages: Source urls
    :return: DataFrame with features as columns and urls as index
    """
    tagged = extractor.extract(pages).set_index('page')
    # Drop useless features
    features = tagged.drop(['term', 'marker'], axis=1)
    return features

In [12]:
def uniq_page_map(pages):
    """
    Finds agent types (spider, desktop, mobile, all) for each unique url, i.e. groups pages by agents
    :param pages: all urls (must be presorted)
    :return: array[num_unique_urls, 4], where each column corresponds to agent type and each row corresponds to unique url.
     Value is an index of page in source pages array. If agent is missing, value is -1
    """
    import re
    result = np.full([len(pages), 4], -1, dtype=np.int32)
    pat = re.compile(
        '(.+(?:(?:wikipedia\.org)|(?:commons\.wikimedia\.org)|(?:www\.mediawiki\.org)))_([a-z_-]+?)')
    prev_page = None
    num_page = -1
    agents = {'all-access_spider': 0, 'desktop_all-agents': 1, 'mobile-web_all-agents': 2, 'all-access_all-agents': 3}
    for i, entity in enumerate(pages):
        match = pat.fullmatch(entity)
        assert match
        page = match.group(1)
        agent = match.group(2)
        if page != prev_page:
            prev_page = page
            num_page += 1
        result[num_page, agents[agent]] = i
    return result[:num_page+1]

In [13]:
def encode_page_features(df) -> Dict[str, pd.DataFrame]:
    """
    Applies one-hot encoding to page features and normalises result
    :param df: page features DataFrame (one column per feature)
    :return: dictionary feature_name:encoded_values. Encoded values is [n_pages,n_values] array
    """
    def encode(column) -> pd.DataFrame:
        one_hot = pd.get_dummies(df[column], drop_first=False)
        # noinspection PyUnresolvedReferences
        return (one_hot - one_hot.mean()) / one_hot.std()

    return {str(column): encode(column) for column in df}

In [14]:
def normalize(values: np.ndarray):
    return (values - values.mean()) / np.std(values)

In [15]:
# parser = argparse.ArgumentParser(description='Prepare data')
# parser.add_argument('data_dir')
# parser.add_argument('--valid_threshold', default=0.0, type=float, help="Series minimal length threshold (pct of data length)")
# parser.add_argument('--add_days', default=64, type=int, help="Add N days in a future for prediction")
# parser.add_argument('--start', help="Effective start date. Data before the start is dropped")
# parser.add_argument('--end', help="Effective end date. Data past the end is dropped")
# parser.add_argument('--corr_backoffset', default=0, type=int, help='Offset for correlation calculation')
# args = parser.parse_args()

In [16]:
# Get the data
df, nans, starts, ends = prepare_data(None, None, 0.0)
# df, nans, starts, ends = prepare_data(args.start, args.end, args.valid_threshold)

Masked 0 pages from 145036


## prepare_dataメソッド

In [17]:
df = read_x(None, None)

In [18]:
df.shape

(145036, 804)

In [19]:
starts, ends = find_start_end(df.values)

In [20]:
# boolean mask for bad (too short) series
valid_threshold = 0.0
page_mask = (ends - starts) / df.shape[1] < valid_threshold

In [21]:
page_mask

array([False, False, False, ..., False, False, False], dtype=bool)

In [22]:
inv_mask = ~page_mask
df = df[inv_mask]

In [23]:
nans = pd.isnull(df)

## read_allメソッド

In [24]:
def read_file(file):
    df = read_cached(file).set_index('Page')
    df.columns = df.columns.astype('M8[D]')
    return df

In [25]:
# Path to cached data
path = os.path.join('data', 'all.pkl')

In [26]:
df = pd.read_pickle(path)

In [27]:
# Official data
df = read_file('train_2')

In [28]:
# Scraped data
scraped = read_file('2017-08-15_2017-09-11')

In [29]:
# Update last two days by scraped data
df[pd.Timestamp('2017-09-10')] = scraped['2017-09-10']
df[pd.Timestamp('2017-09-11')] = scraped['2017-09-11']

In [30]:
df = df.sort_index()
# Cache result
df.to_pickle(path)

In [31]:
df.columns

DatetimeIndex(['2015-07-01', '2015-07-02', '2015-07-03', '2015-07-04',
               '2015-07-05', '2015-07-06', '2015-07-07', '2015-07-08',
               '2015-07-09', '2015-07-10',
               ...
               '2017-09-02', '2017-09-03', '2017-09-04', '2017-09-05',
               '2017-09-06', '2017-09-07', '2017-09-08', '2017-09-09',
               '2017-09-10', '2017-09-11'],
              dtype='datetime64[ns]', length=804, freq=None)

In [32]:
# User GoogleAnalitycsRoman has really bad data with huge traffic spikes in all incarnations.
# Wikipedia banned him, we'll ban it too
bad_roman = df.index.str.startswith("User:GoogleAnalitycsRoman")

In [33]:
df = df[~bad_roman]

## find_start_endメソッド

In [34]:
data = df.values

In [35]:
n_pages = data.shape[0]
n_days = data.shape[1]

In [36]:
start_idx = np.full(n_pages, -1, dtype=np.int32)
end_idx = np.full(n_pages, -1, dtype=np.int32)

In [37]:
for page in range(n_pages):
    # scan from start to the end
    for day in range(n_days):
        if not np.isnan(data[page, day]) and data[page, day] > 0:
            start_idx[page] = day
            break
    # reverse scan, from end to start
    for day in range(n_days - 1, -1, -1):
        if not np.isnan(data[page, day]) and data[page, day] > 0:
            end_idx[page] = day
            break

In [38]:
start_idx

array([0, 2, 0, ..., 0, 0, 0], dtype=int32)

In [39]:
end_idx

array([803, 803, 803, ..., 803, 803, 803], dtype=int32)

## uniq_page_mapメソッド

In [40]:
pages = df.index.values

In [41]:
import re

In [42]:
result = np.full([len(pages), 4], -1, dtype=np.int32)

In [43]:
pat = re.compile(
        '(.+(?:(?:wikipedia\.org)|(?:commons\.wikimedia\.org)|(?:www\.mediawiki\.org)))_([a-z_-]+?)')

In [44]:
# prev_page = None
num_page = -1
agents = {'all-access_spider': 0, 'desktop_all-agents': 1, 'mobile-web_all-agents': 2, 'all-access_all-agents': 3}

In [45]:
# for i, entity in enumerate(pages):
#     match = pat.fullmatch(entity)
#     assert match
#     page = match.group(1)
#     agent = match.group(2)
#     if page != prev_page:
#         prev_page = page
#         num_page += 1
#     result[num_page, agents[agent]] = i

NameError: name 'prev_page' is not defined

In [46]:
i = 3
entity = pages[3]

In [47]:
match = pat.fullmatch(entity)

In [48]:
page = match.group(1)
agent = match.group(2)

In [49]:
page

'"Awaken,_My_Love!"_en.wikipedia.org'

In [None]:
agent

In [None]:
prev_page = page

In [None]:
num_page += 1

In [None]:
num_page

In [None]:
result[num_page, agents[agent]] = i

In [None]:
result

In [None]:
result

## batch_autocorrメソッド

In [50]:
data, lag, starts, ends, threshold, backoffset = df.values, 365, starts, ends, 1.5, 0

In [51]:
n_series = data.shape[0]
n_days = data.shape[1]

In [52]:
max_end = n_days - backoffset

In [53]:
corr = np.empty(n_series, dtype=np.float64)
support = np.empty(n_series, dtype=np.float64)

In [54]:
np.empty(5)

array([  3.53878511e-316,   6.92698980e-310,   5.39497551e-317,
         5.67930167e-311,   2.37151510e-322])

In [None]:
# for i in range(n_series):
#     series = data[i]
#     end = min(ends[i], max_end)
#     real_len = end - starts[i]
#     support[i] = real_len/lag
#     if support[i] > threshold:
#         series = series[starts[i]:end]
#         c_365 = single_autocorr(series, lag)
#         c_364 = single_autocorr(series, lag-1)
#         c_366 = single_autocorr(series, lag+1)
#         # Average value between exact lag and two nearest neighborhs for smoothness
#         corr[i] = 0.5 * c_365 + 0.25 * c_364 + 0.25 * c_366
#     else:
#         corr[i] = np.NaN

In [74]:
i = 1

In [75]:
series = data[i]

In [76]:
end = min(ends[i], max_end)

In [77]:
real_len = end - starts[i]

In [78]:
support[i] = real_len/lag

In [79]:
support

array([  2.20000000e+000,   2.19452055e+000,   6.92688373e-310, ...,
         1.22655141e+266,   1.78495620e+161,   1.91084238e+214])

In [80]:
support[i] > threshold

True

In [81]:
series = series[starts[i]:end]

In [82]:
c_365 = single_autocorr(series, lag)
c_364 = single_autocorr(series, lag-1)
c_366 = single_autocorr(series, lag+1)

In [84]:
print(c_365, c_366, c_366)

nan nan nan


In [85]:
# Average value between exact lag and two nearest neighborhs for smoothness
corr[i] = 0.5 * c_365 + 0.25 * c_364 + 0.25 * c_366

In [86]:
starts

array([0, 2, 0, ..., 0, 0, 0], dtype=int32)

In [87]:
ends

array([803, 803, 803, ..., 803, 803, 803], dtype=int32)

## lag_indexesメソッド

In [89]:
begin, end = df.columns[0], df.columns[-1] + pd.Timedelta(63, unit='D')

In [90]:
begin

Timestamp('2015-07-01 00:00:00')

In [91]:
end

Timestamp('2017-11-13 00:00:00')

In [93]:
dr = pd.date_range(begin, end)

In [95]:
base_index = pd.Series(np.arange(0, len(dr)), index=dr)

In [97]:
def lag(offset):
    dates = dr - offset
    return pd.Series(data=base_index.loc[dates].fillna(-1).astype(np.int16).values, index=dr)

In [None]:
[lag(pd.DateOffset(months=m)) for m in (3, 6, 9, 12)]