In [1]:
from db_utils import *
% matplotlib inline

In [3]:
# Get Pre 2015-03 data from pentaho
query = """
SELECT
sum(pageviews) as n,
timestamp, 
country_iso, 
project, 
access_method
FROM staging.pentahoviews05
WHERE is_spider = 0
AND is_automata = 0
AND project RLIKE 'wikipedia'
group by timestamp, country_name, project, access_method
"""

d =  query_analytics_store(query, {})

In [4]:
d.head()

Unnamed: 0,access_method,country_iso,n,project,timestamp
0,mobile web,AF,1000,de.wikipedia,2013-04-01
1,desktop,AF,22000,en.wikipedia,2013-04-01
2,mobile web,AF,8000,en.wikipedia,2013-04-01
3,desktop,AF,8000,fa.wikipedia,2013-04-01
4,mobile web,AF,3000,fa.wikipedia,2013-04-01


In [5]:
# sampled data overlaps with hive data for 1 month (April 2015). We delete the month sampled data
d = d[d['timestamp'] != '2015-04-01']

In [6]:
# Get post 2015-03 data from hive

query = """
SET mapred.job.queue.name=priority;
SELECT
sum(view_count) as n,
year,
month,
country_code as country_iso, 
project, 
access_method
FROM wmf.projectview_hourly
WHERE agent_type = 'user'
AND project RLIKE 'wikipedia'
AND YEAR >= 2015
group by year, month, country_code, project, access_method;
"""

dn =  query_hive_ssh(query, 'forecasting_refresh')

In [7]:
dn['month'] = dn['month'].astype(str)
dn['year'] = dn['year'].astype(str)
dn['month'] = dn['month'].apply(lambda x: x if len(x) == 2 else '0' + x)
dn['month'].value_counts()
dn['timestamp'] = dn['year'] + '-' + dn['month'] + '-01' 

In [8]:
df = pd.concat([d, dn], axis=0)

In [9]:
df.index = df['timestamp']

In [10]:
# Add checkpoint in case data gets lost
# Last Checkpoint: Sept 8
df.to_csv('/Users/ellerywulczyn/wmf/pageview_forecasting/checkpoint.csv')
#df = pd.read_csv('/Users/ellerywulczyn/wmf/pageview_forecasting/checkpoint.csv')

In [11]:
group_dimensions = ['project', 'access_method', 'country_iso',]
groups = df.groupby(group_dimensions)

In [12]:
cube = {}
for group in groups:
    colname = '/'.join(group[0])
    dg = group[1]
    dg.index = dg['timestamp']
    data = pd.Series(dg['n'])
    if data.sum() > 1000000:
        cube[colname] = data

In [13]:
len(cube)

3109

In [14]:
df_cube = pd.DataFrame(cube)

In [15]:
df_cube.tail()

Unnamed: 0,ab.wikipedia/desktop/US,ace.wikipedia/desktop/CN,ace.wikipedia/desktop/DE,ace.wikipedia/desktop/US,af.wikipedia/desktop/--,af.wikipedia/desktop/CN,af.wikipedia/desktop/DE,af.wikipedia/desktop/FR,af.wikipedia/desktop/NL,af.wikipedia/desktop/US,...,zh.wikipedia/mobile web/NZ,zh.wikipedia/mobile web/PH,zh.wikipedia/mobile web/RU,zh.wikipedia/mobile web/SE,zh.wikipedia/mobile web/SG,zh.wikipedia/mobile web/TH,zh.wikipedia/mobile web/TW,zh.wikipedia/mobile web/US,zh.wikipedia/mobile web/Unknown,zh.wikipedia/mobile web/VN
2015-08-01,23292,9392,21199,46863,145978,11946,107698,101138,39297,478241,...,277718,76324,115577,81268,1837474,305238,54495466,8661747,,197986
2015-09-01,26047,7561,29509,44211,128668,129062,124972,54018,21061,549939,...,266727,67295,93935,80807,1772282,237343,46495456,6492104,,175296
2015-10-01,22177,7244,16606,36040,127969,139708,114990,39155,20297,382960,...,275893,68527,110216,81953,1955418,251200,50268768,6549152,,185900
2015-11-01,20980,5689,9975,26385,53435,120174,82942,33376,21821,297032,...,272518,75334,113915,83391,2001574,233239,50433435,7330562,,177688
2015-12-01,2949,444,375,4510,1125,11054,9382,7118,3315,41343,...,33703,8763,15157,11604,233096,31244,6163775,1023861,,23560


In [16]:
df_cube['YearMonth'] = pd.to_datetime(df_cube.index)

In [17]:
df_codes = pd.read_csv('./app/data/country_codes.csv')[['ISO 3166-1 2 Letter Code', 'Common Name' ]]
codes_dict = dict(tuple(x) for x in df_codes.values)
codes_dict['Unknown'] = 'Unknown'

df_valid_cols = [c for c in df_cube.columns if len(c.split('/')) == 3 and c.split('/')[2] in codes_dict]
df_valid_cols.append('YearMonth')
df_cube = df_cube[df_valid_cols]

def replace_ISO_with_country(c,codes_dict):
    if c == 'YearMonth':
        return c
    t = c.split('/')
    t[2] = codes_dict[t[2]]
    return '/'.join(t)

df_cube.columns = [replace_ISO_with_country(c,codes_dict) for c in df_cube.columns]

In [18]:
df_cube.to_csv('/Users/ellerywulczyn/wmf/pageview_forecasting/app/data/cube.csv', index = False)

In [19]:
df_cube.shape

(33, 3006)