# Import

In [1]:
from functools import reduce
from pathlib import Path
import time
import datetime

import pandas as pd
import requests
from wmfdata import hive
from wmfdata.utils import print_err, pd_display_all

# Parameters

In [2]:
# Where the metrics are (or will be) stored, as a tab-separated file
FILENAME = "metrics/metrics.tsv"

# The metrics month is the previous calendar month: stepping back one day from
# the first of the current month lands on its last day.
# The mediawiki_history snapshot must be from the metrics month or later.
last_month = (
    datetime.date.today().replace(day=1)
    - datetime.timedelta(days=1)
)

# Both values use the same "YYYY-MM" text form
METRICS_MONTH_TEXT = format(last_month, "%Y-%m")
MEDIAWIKI_HISTORY_SNAPSHOT = format(last_month, "%Y-%m")



In [3]:
# Sanity check: display the last day of the previous calendar month
datetime.date.today().replace(day=1) + datetime.timedelta(days=-1)

datetime.date(2020, 3, 31)

# Preparation

In [4]:
# Convert our metrics month to all the formats we need and provide them in a dict
# so we can easily use them to format strings
metrics_month = METRICS_MONTH_TEXT
date_params = {
    "mediawiki_history_snapshot": MEDIAWIKI_HISTORY_SNAPSHOT,
    "metrics_month": str(metrics_month),
    # Derive the first day from last_month itself. Going back a fixed 31 days
    # from today overshoots into the month before whenever the metrics month
    # is shorter than 31 days and the notebook runs early in the month
    # (e.g. 2020-03-01 minus 31 days is 2020-01-30).
    "metrics_month_first_day": str(last_month.replace(day=1)),
    "metrics_month_last_day": str(last_month),
    # Year and month number of the metrics month, as integers
    "metrics_year": last_month.year,
    "metrics_cur_month": last_month.month,
}

# Pick up the metrics saved by an earlier run, indexed by month.
# When the file does not exist yet, old_metrics is None.
try:
    old_metrics = pd.read_csv(
        FILENAME,
        sep="\t",
        parse_dates=["month"],
    ).set_index("month")
except FileNotFoundError:
    old_metrics = None
    
def prepare_query(filename):
    """Read a query template from disk and fill in the date parameters.

    The file's text is treated as a `str.format` template; every
    `{placeholder}` in it is substituted from the module-level `date_params`
    dict.

    Parameters
    ----------
    filename : str or path-like
        Path of the query file to read.

    Returns
    -------
    str
        The query text with all placeholders filled in.

    Raises
    ------
    KeyError
        If the template uses a placeholder that `date_params` does not provide.
    """
    template = Path(filename).read_text()
    return template.format(**date_params)

# MariaDB and Hive query metrics


In [5]:
# Every metric query: its name mapped to the query file and the engine that runs it.
# All of the queries listed here run on Hive.
queries = {
    name: {"file": query_file, "engine": "hive"}
    for name, query_file in (
        ("pageviews", "queries/pageviews.hql"),
        ("page_previews", "queries/page_previews.hql"),
        ("unique_devices", "queries/unique_devices.hql"),
        ("global_south_pageviews", "queries/global_south_pageviews.hql"),
        ("mobile-heavy_pageviews", "queries/mobile_heavy_pageviews.hql"),
        ("mobile-heavy_previews", "queries/mobile_heavy_previews.hql"),
        ("global_south_previews", "queries/global_south_previews.hql"),
    )
}


# Run every configured query on its engine and store the resulting dataframe
# under the "result" key of that query's entry.
for key, val in queries.items():
    query = prepare_query(val["file"])
    engine = val["engine"]
    print_err("Running {} on {}...".format(key, engine))

    if engine == "mariadb":
        # Only `hive` is imported at the top of the notebook, so this branch
        # used to fail with a NameError. Import the module on demand so that
        # Hive-only runs do not depend on it.
        from wmfdata import mariadb
        result = mariadb.run(query)
    elif engine == "hive":
        result = hive.run(query)
    else:
        raise ValueError("Unknown engine specified.")

    # Parse the "month" column into datetimes
    result = result.assign(month=lambda df: pd.to_datetime(df["month"]))
    val["result"] = result

Running global_north_previews on hive...
Running pageviews on hive...
Running page_previews on hive...
Running mobile-heavy_previews on hive...
Running unique_devices on hive...
Running global_south_previews on hive...
Running mobile-heavy_pageviews on hive...
Running global_south_pageviews on hive...
Running global_north_pageviews on hive...


# Combining and saving metrics

In [6]:
# Assemble list of result dataframes
results = [val["result"] for val in queries.values()]

# Merge them all, assuming that the month is the only common column
# (no `on=` is given, so pd.merge joins on every column the two frames share)
new_metrics = reduce(lambda left, right: pd.merge(left, right, how="outer"), results)

# Set the month as an index so combine_first works properly
new_metrics = new_metrics.set_index("month").sort_index()

# Add Metrics for Interactions: previews plus pageviews for each grouping.
# Plain column addition replaces the row-wise apply: it runs vectorized, does not
# upcast each row to a single dtype, and also works when the frame has no rows.
new_metrics["interactions"] = new_metrics["previews_seen"] + new_metrics["total_pageview"]
new_metrics["gs_interactions"] = new_metrics["gs_previews"] + new_metrics["gs_pageviews"]
new_metrics["mh_interactions"] = new_metrics["mh_previews"] + new_metrics["mh_pageviews"]


In [8]:
# Without saved metrics the new results stand alone; otherwise the new values
# take precedence and the saved metrics fill in whatever the new results lack.
metrics = (
    new_metrics
    if old_metrics is None
    else new_metrics.combine_first(old_metrics)
)

# Show the last ten rows with every column visible
pd_display_all(metrics.tail(10))

Unnamed: 0_level_0,desktop,gn_interactions,gn_pageviews,gn_previews,gs_interactions,gs_pageviews,gs_previews,interactions,mh_interactions,mh_pageviews,mh_previews,mobileweb,previews_seen,total_pageview,unique_devices
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-06-01,5790501000.0,12619470000.0,11357430000.0,1262031000.0,3785372000.0,3442175000.0,343196693.0,16449780000.0,617333603.0,588140293.0,29193310.0,8804954000.0,1609283000.0,14840500000.0,1506536000.0
2019-07-01,5894085000.0,13039280000.0,11762140000.0,1277139000.0,3913795000.0,3552436000.0,361359372.0,17021110000.0,687542306.0,655773508.0,31768798.0,9226142000.0,1643555000.0,15377560000.0,1502241000.0
2019-08-01,5718161000.0,13003680000.0,11734260000.0,1269425000.0,3986866000.0,3624499000.0,362366891.0,17063240000.0,723552995.0,692032091.0,31520904.0,9445959000.0,1636496000.0,15426740000.0,1509795000.0
2019-09-01,5889674000.0,13122220000.0,11743460000.0,1378752000.0,4131243000.0,3741501000.0,389742454.0,17318130000.0,769139828.0,735344494.0,33795334.0,9408033000.0,1773316000.0,15544820000.0,1695557000.0
2019-10-01,6714077000.0,14285260000.0,12758370000.0,1526889000.0,4311099000.0,3906008000.0,405090073.0,18658770000.0,825074721.0,788811634.0,36263087.0,9750824000.0,1936367000.0,16722400000.0,1811302000.0
2019-11-01,6404464000.0,13993690000.0,12479110000.0,1514581000.0,4096324000.0,3705914000.0,390410201.0,18140100000.0,787954594.0,755302852.0,32651742.0,9572732000.0,1909127000.0,16230970000.0,1694747000.0
2019-12-01,5988201000.0,13686320000.0,12243150000.0,1443172000.0,3724967000.0,3364769000.0,360197742.0,17456310000.0,758927001.0,723749087.0,35177914.0,9388243000.0,1807163000.0,15649150000.0,1576535000.0
2020-01-01,6598842000.0,15150850000.0,13455900000.0,1694953000.0,3998114000.0,3601104000.0,397009855.0,19199710000.0,819825311.0,782726990.0,37098321.0,10206930000.0,2097220000.0,17102490000.0,1639562000.0
2020-02-01,6144843000.0,13819090000.0,12265810000.0,1553277000.0,3805531000.0,3431799000.0,373731960.0,17670250000.0,765111783.0,731187027.0,33924756.0,9327580000.0,1931595000.0,15738660000.0,1576108000.0
2020-03-01,6869440000.0,15065830000.0,13385980000.0,1679843000.0,4306498000.0,3891817000.0,414681293.0,19418210000.0,850707475.0,811436650.0,39270825.0,10170820000.0,2098584000.0,17319630000.0,1663182000.0


In [9]:
# Create the target directory first: to_csv does not create missing
# directories, so a first run without one would fail at the very last step.
Path(FILENAME).parent.mkdir(parents=True, exist_ok=True)
metrics.to_csv(FILENAME, sep="\t")