# Wikipedia Clickstream

## Get data

In [3]:
import gzip
import os 
from pathlib import Path
from urllib import request
import zipfile

import pandas as pd

In [4]:
# Root of the bundle, taken from the LABS_BUNDLE_ROOT environment variable
# (raises KeyError when the variable is not set).
bundle_root = Path(os.environ['LABS_BUNDLE_ROOT'])
# Directory holding the raw data files: <bundle_root>/data/raw.
data_raw = bundle_root.joinpath('data', 'raw')

In [6]:
def maybe_download(url, filename):
    """Download ``url`` into the raw-data directory if it is not present.

    Args:
        url: Full URL of the resource to fetch.
        filename: Name under which the download is stored in ``data_raw``.

    Returns:
        Path of the local file, i.e. ``data_raw / filename``.
    """
    dest_filename = data_raw / filename
    if not dest_filename.exists():
        print("Attempting to download:", filename)
        # urlretrieve does not create missing parent directories.
        dest_filename.parent.mkdir(parents=True, exist_ok=True)
        # Fetch into a sibling ".part" file and rename it afterwards, so an
        # interrupted transfer never leaves a truncated file that the
        # existence check above would treat as a finished download.
        partial = dest_filename.with_name(dest_filename.name + ".part")
        request.urlretrieve(url, partial)
        partial.replace(dest_filename)
        print("Download complete!")
    return dest_filename

### All historical data

In [8]:
# Figshare archive for article 1305770 (version 22), stored locally as
# "clickstream.zip".
clickstream_filename = maybe_download(
    "https://ndownloader.figshare.com/articles/1305770/versions/22",
    "clickstream.zip",
)

In [9]:
def maybe_extract(filename):
    """Extract an archive next to itself (once) and list what it contains.

    The target directory is ``filename`` with its extension removed. If that
    directory already exists, nothing is extracted. Only ``.zip`` archives
    are supported; any other extension raises ``ValueError``.

    Returns:
        Sorted list of paths (strings) of the entries directly inside the
        target directory, excluding ``.DS_Store``.
    """
    target_dir, suffix = os.path.splitext(filename)

    already_extracted = os.path.isdir(target_dir)
    if not already_extracted:
        print("Extracting data for %s." % target_dir)
        if suffix != ".zip":
            raise ValueError("Extension %s not recognized." % suffix)
        with zipfile.ZipFile(filename, 'r') as archive:
            archive.extractall(target_dir)

    entries = os.listdir(target_dir)
    entries.sort()
    # Skip the Finder metadata file macOS may drop into the directory.
    return [
        os.path.join(target_dir, entry)
        for entry in entries
        if entry != '.DS_Store'
    ]

In [10]:
# Unpack the downloaded archive (skipped when its directory already exists)
# and collect the sorted paths of the files it contains.
clickstream_filenames = maybe_extract(clickstream_filename)

In [11]:
clickstream_filenames

['/Users/brandon/go/src/github.com/contiamo/labs/examples/wikipedia_clickstream/data/raw/clickstream/2015_01_en_clickstream.tsv.gz',
 '/Users/brandon/go/src/github.com/contiamo/labs/examples/wikipedia_clickstream/data/raw/clickstream/2015_02_en_clickstream.tsv.gz',
 '/Users/brandon/go/src/github.com/contiamo/labs/examples/wikipedia_clickstream/data/raw/clickstream/2016_02_en_clickstream.tsv.gz',
 '/Users/brandon/go/src/github.com/contiamo/labs/examples/wikipedia_clickstream/data/raw/clickstream/2016_03_en_clickstream.tsv.gz',
 '/Users/brandon/go/src/github.com/contiamo/labs/examples/wikipedia_clickstream/data/raw/clickstream/2016_04_ar_clickstream.tsv.gz',
 '/Users/brandon/go/src/github.com/contiamo/labs/examples/wikipedia_clickstream/data/raw/clickstream/2016_04_en_clickstream.tsv.gz',
 '/Users/brandon/go/src/github.com/contiamo/labs/examples/wikipedia_clickstream/data/raw/clickstream/2016_04_fa_clickstream.tsv.gz',
 '/Users/brandon/go/src/github.com/contiamo/labs/examples/wikipedia_c