![ga4](https://www.google-analytics.com/collect?v=2&tid=G-6VDTYWLKX6&cid=1&en=page_view&sid=1&dl=statmike%2Fvertex-ai-mlops%2Farchitectures%2Ftracking&dt=tracking_github.ipynb)

# GitHub Traffic For Repository

## Setup

In [18]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'statmike-mlops-349915'

In [19]:
REGION = 'us-central1'

In [20]:
github_user = 'statmike'
github_repo = 'vertex-ai-mlops'

In [21]:
import json
import os, shutil
import time
from datetime import datetime, timezone
from io import StringIO

import pandas as pd
import requests

from google.cloud import bigquery
from google.cloud import storage

In [22]:
bq = bigquery.Client()
gcs = storage.Client()

In [23]:
DIR = 'temp'
!rm -rf {DIR}
!mkdir -p {DIR}

---
## Get Secret

Create a secret in Secret Manager to hold the personal access token (PAT) used to access the GitHub API.  More information on [creating a secret](https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets#secretmanager-create-secret-console).


In [7]:
try:
    import google.cloud.secretmanager
except ImportError:
    print('You need to pip install google-cloud-secret-manager')
    !pip install google-cloud-secret-manager -q

In [8]:
from google.cloud import secretmanager

In [9]:
client = secretmanager.SecretManagerServiceClient()

In [10]:
# Fetch the latest version of the `github_api` secret stored in this project
secret_version_name = f'projects/{PROJECT_ID}/secrets/github_api/versions/latest'
secret = client.access_secret_version(request = {"name": secret_version_name})

In [11]:
pat = secret.payload.data.decode('utf-8')

---
## GitHub API

- GitHub [traffic API](https://docs.github.com/en/rest/metrics/traffic#about-the-repository-traffic-api)
- Permissions the PAT needs are listed under [administration](https://docs.github.com/en/rest/overview/permissions-required-for-fine-grained-personal-access-tokens#administration)

In [13]:
github_api_url = f'https://api.github.com/repos/{github_user}/{github_repo}'

In [14]:
## Three ways to authenticate with the PAT that all work.
## The bearer-token header is the one in use; the other two are kept for reference:
response = requests.get(
    f'{github_api_url}/traffic/clones',
    headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'},
)
#response = requests.get(f'{github_api_url}/traffic/clones', auth = ('statmike', f'{pat}'), headers = {'Accept': 'application/vnd.github+json'})
#response = requests.get(f'{github_api_url}/traffic/clones', auth = ('statmike', f'{pat}'), headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'})

In [15]:
response

<Response [200]>

In [16]:
response.text

'{"count":175,"uniques":83,"clones":[{"timestamp":"2023-02-04T00:00:00Z","count":10,"uniques":4},{"timestamp":"2023-02-05T00:00:00Z","count":1,"uniques":1},{"timestamp":"2023-02-06T00:00:00Z","count":4,"uniques":3},{"timestamp":"2023-02-07T00:00:00Z","count":16,"uniques":8},{"timestamp":"2023-02-08T00:00:00Z","count":6,"uniques":3},{"timestamp":"2023-02-09T00:00:00Z","count":20,"uniques":12},{"timestamp":"2023-02-10T00:00:00Z","count":28,"uniques":17},{"timestamp":"2023-02-11T00:00:00Z","count":10,"uniques":6},{"timestamp":"2023-02-12T00:00:00Z","count":9,"uniques":6},{"timestamp":"2023-02-13T00:00:00Z","count":6,"uniques":6},{"timestamp":"2023-02-14T00:00:00Z","count":29,"uniques":7},{"timestamp":"2023-02-15T00:00:00Z","count":13,"uniques":8},{"timestamp":"2023-02-16T00:00:00Z","count":20,"uniques":19},{"timestamp":"2023-02-17T00:00:00Z","count":3,"uniques":2}]}'

---
### Review metrics: Community Metrics

In [116]:
# community profile metrics for the repository
response = requests.get(
    f'{github_api_url}/community/profile',
    headers = {
        'Authorization': f'Bearer {pat}',
        'Accept': 'application/vnd.github+json',
    },
)
json.loads(response.text)

{'health_percentage': 42,
 'description': 'Google Cloud Platform Vertex AI end-to-end workflows for machine learning operations',
 'documentation': None,
 'files': {'code_of_conduct': None,
  'code_of_conduct_file': None,
  'contributing': None,
  'issue_template': None,
  'pull_request_template': None,
  'license': {'key': 'apache-2.0',
   'name': 'Apache License 2.0',
   'spdx_id': 'Apache-2.0',
   'url': 'https://api.github.com/licenses/apache-2.0',
   'node_id': 'MDc6TGljZW5zZTI=',
   'html_url': 'https://github.com/statmike/vertex-ai-mlops/blob/main/LICENSE'},
  'readme': {'url': 'https://api.github.com/repos/statmike/vertex-ai-mlops/contents/readme.md',
   'html_url': 'https://github.com/statmike/vertex-ai-mlops/blob/main/readme.md'}},
 'updated_at': None}

---
### Review metrics: Statistics - Weekly Code Activity

In [17]:
# timestamp (12AM Sunday each week for life of the repository), additions, deletions
# GitHub answers 202 while it is still computing the statistics, so poll until the
# data is ready -- but stop after a bounded number of attempts instead of looping forever.
stats_url = f'{github_api_url}/stats/code_frequency'
stats_headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'}

response = requests.get(stats_url, headers = stats_headers)
for attempt in range(10):
    if response.status_code != 202:
        break
    time.sleep(30)
    response = requests.get(stats_url, headers = stats_headers)

if response.status_code == 202:
    raise TimeoutError(f'GitHub is still computing statistics for {stats_url}')
# surface auth/permission/not-found errors instead of displaying an error body as data
response.raise_for_status()

json.loads(response.text)

[[1616889600, 2983, -547],
 [1617494400, 7461, -3499],
 [1618099200, 12394, -6314],
 [1618704000, 7904, -6179],
 [1619308800, 0, 0],
 [1619913600, 0, 0],
 [1620518400, 0, 0],
 [1621123200, 0, 0],
 [1621728000, 0, 0],
 [1622332800, 0, 0],
 [1622937600, 695, -899],
 [1623542400, 0, 0],
 [1624147200, 246, -359],
 [1624752000, 0, 0],
 [1625356800, 14349, -11041],
 [1625961600, 3422, -1542],
 [1626566400, 1582, -1716],
 [1627171200, 0, 0],
 [1627776000, 0, 0],
 [1628380800, 1693, -2389],
 [1628985600, 0, 0],
 [1629590400, 0, 0],
 [1630195200, 10777, -6390],
 [1630800000, 400423, -206756],
 [1631404800, 217627, -217787],
 [1632009600, 14216, -10514],
 [1632614400, 1370, -1244],
 [1633219200, 2047, -715],
 [1633824000, 2705, -2357],
 [1634428800, 37, -62],
 [1635033600, 207, -161],
 [1635638400, 0, 0],
 [1636243200, 0, 0],
 [1636848000, 0, 0],
 [1637452800, 134, -139],
 [1638057600, 3731, -3358],
 [1638662400, 287, -296],
 [1639267200, 0, 0],
 [1639872000, 0, 0],
 [1640476800, 0, 0],
 [164108

In [191]:
# An array of arrays is not allowed in json/bigquery, so convert each
# [week, additions, deletions] row into a dict (an array of structs):

[
    {'week': row[0], 'additions': row[1], 'deletions': row[2]}
    for row in json.loads(response.text)
]

[{'week': 1616889600, 'additions': 2983, 'deletions': -547},
 {'week': 1617494400, 'additions': 7461, 'deletions': -3499},
 {'week': 1618099200, 'additions': 12394, 'deletions': -6314},
 {'week': 1618704000, 'additions': 7904, 'deletions': -6179},
 {'week': 1619308800, 'additions': 0, 'deletions': 0},
 {'week': 1619913600, 'additions': 0, 'deletions': 0},
 {'week': 1620518400, 'additions': 0, 'deletions': 0},
 {'week': 1621123200, 'additions': 0, 'deletions': 0},
 {'week': 1621728000, 'additions': 0, 'deletions': 0},
 {'week': 1622332800, 'additions': 0, 'deletions': 0},
 {'week': 1622937600, 'additions': 695, 'deletions': -899},
 {'week': 1623542400, 'additions': 0, 'deletions': 0},
 {'week': 1624147200, 'additions': 246, 'deletions': -359},
 {'week': 1624752000, 'additions': 0, 'deletions': 0},
 {'week': 1625356800, 'additions': 14349, 'deletions': -11041},
 {'week': 1625961600, 'additions': 3422, 'deletions': -1542},
 {'week': 1626566400, 'additions': 1582, 'deletions': -1716},
 {'w

---
### Review metrics: Statistics - Weekly Commit Activity

In [38]:
# commits per day [sunday, ..., saturday], total, week (timestamp)
# GitHub answers 202 while it is still computing the statistics, so poll until the
# data is ready -- but stop after a bounded number of attempts instead of looping forever.
stats_url = f'{github_api_url}/stats/commit_activity'
stats_headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'}

response = requests.get(stats_url, headers = stats_headers)
for attempt in range(10):
    if response.status_code != 202:
        break
    time.sleep(30)
    response = requests.get(stats_url, headers = stats_headers)

if response.status_code == 202:
    raise TimeoutError(f'GitHub is still computing statistics for {stats_url}')
# surface auth/permission/not-found errors instead of displaying an error body as data
response.raise_for_status()

json.loads(response.text)

[{'days': [0, 1, 2, 1, 0, 1, 0], 'total': 5, 'week': 1638057600},
 {'days': [0, 0, 0, 3, 0, 0, 0], 'total': 3, 'week': 1638662400},
 {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1639267200},
 {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1639872000},
 {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1640476800},
 {'days': [0, 16, 3, 1, 0, 1, 1], 'total': 22, 'week': 1641081600},
 {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1641686400},
 {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1642291200},
 {'days': [0, 0, 0, 2, 0, 0, 2], 'total': 4, 'week': 1642896000},
 {'days': [0, 0, 0, 1, 1, 1, 0], 'total': 3, 'week': 1643500800},
 {'days': [0, 5, 0, 3, 2, 1, 1], 'total': 12, 'week': 1644105600},
 {'days': [0, 2, 1, 0, 0, 2, 0], 'total': 5, 'week': 1644710400},
 {'days': [0, 1, 0, 0, 0, 1, 0], 'total': 2, 'week': 1645315200},
 {'days': [0, 0, 0, 0, 0, 0, 0], 'total': 0, 'week': 1645920000},
 {'days': [0, 0, 0, 2, 1, 6, 1], 'total': 10, 'week': 1646524800},
 {'day

---
### Review metrics: Statistics - Weekly Contributor Activity (Code and Commit)

In [39]:
# list of dict for each author with 52 week data: total alltime, week list [week timestamp, additions, deletions, commits], author info
# GitHub answers 202 while it is still computing the statistics, so poll until the
# data is ready -- but stop after a bounded number of attempts instead of looping forever.
stats_url = f'{github_api_url}/stats/contributors'
stats_headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'}

response = requests.get(stats_url, headers = stats_headers)
for attempt in range(10):
    if response.status_code != 202:
        break
    time.sleep(30)
    response = requests.get(stats_url, headers = stats_headers)

if response.status_code == 202:
    raise TimeoutError(f'GitHub is still computing statistics for {stats_url}')
# surface auth/permission/not-found errors instead of displaying an error body as data
response.raise_for_status()

json.loads(response.text)

[{'total': 1,
  'weeks': [{'w': 1616889600, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1617494400, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1618099200, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1618704000, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1619308800, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1619913600, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1620518400, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1621123200, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1621728000, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1622332800, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1622937600, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1623542400, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1624147200, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1624752000, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1625356800, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1625961600, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1626566400, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1627171200, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1627776000, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1628380800, 'a': 0, 'd': 0, 'c': 0},
   {'w': 1628985600, 'a': 0, 'd': 0, 'c': 0},
   {'w': 16

---
### Review metrics: Statistics - Weekly Contributor Commit Activity 
- a highly summarized view of the above that looks at just commit counts for two contributor groups: owner and all.  Non-owner commits can be calculated by subtraction.

In [40]:
# total commit counts for all and owner as list for last 52 weeks (index=0 is oldest week, to most recent)
response = requests.get(
    f'{github_api_url}/stats/participation',
    headers = {
        'Authorization': f'Bearer {pat}',
        'Accept': 'application/vnd.github+json',
    },
)

# polling on a 202 status is left disabled for this endpoint:
#while response.status_code == 202:
#    time.sleep(30)
#    response = requests.get(f'{github_api_url}/stats/participation', headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'})

json.loads(response.text)

{'all': [5,
  3,
  0,
  0,
  0,
  21,
  1,
  0,
  2,
  6,
  11,
  6,
  2,
  0,
  9,
  14,
  9,
  0,
  13,
  4,
  2,
  0,
  1,
  0,
  1,
  12,
  3,
  0,
  0,
  2,
  3,
  3,
  0,
  0,
  10,
  9,
  15,
  10,
  9,
  12,
  2,
  22,
  34,
  30,
  29,
  45,
  17,
  11,
  13,
  13,
  7,
  6],
 'owner': [5,
  3,
  0,
  0,
  0,
  21,
  1,
  0,
  2,
  6,
  11,
  6,
  2,
  0,
  9,
  14,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  2,
  3,
  3,
  0,
  0,
  10,
  9,
  15,
  10,
  9,
  12,
  2,
  22,
  34,
  30,
  29,
  45,
  15,
  10,
  13,
  12,
  6,
  6]}

---
### Review metrics: Traffic - Clones

In [41]:
# clone stats for last 14 days
response = requests.get(
    f'{github_api_url}/traffic/clones',
    headers = {
        'Authorization': f'Bearer {pat}',
        'Accept': 'application/vnd.github+json',
    },
)
json.loads(response.text)

{'count': 49,
 'uniques': 39,
 'clones': [{'timestamp': '2022-11-12T00:00:00Z', 'count': 2, 'uniques': 2},
  {'timestamp': '2022-11-13T00:00:00Z', 'count': 2, 'uniques': 1},
  {'timestamp': '2022-11-14T00:00:00Z', 'count': 15, 'uniques': 11},
  {'timestamp': '2022-11-15T00:00:00Z', 'count': 3, 'uniques': 3},
  {'timestamp': '2022-11-16T00:00:00Z', 'count': 2, 'uniques': 2},
  {'timestamp': '2022-11-17T00:00:00Z', 'count': 2, 'uniques': 2},
  {'timestamp': '2022-11-18T00:00:00Z', 'count': 3, 'uniques': 3},
  {'timestamp': '2022-11-19T00:00:00Z', 'count': 3, 'uniques': 3},
  {'timestamp': '2022-11-20T00:00:00Z', 'count': 2, 'uniques': 2},
  {'timestamp': '2022-11-21T00:00:00Z', 'count': 7, 'uniques': 5},
  {'timestamp': '2022-11-22T00:00:00Z', 'count': 2, 'uniques': 1},
  {'timestamp': '2022-11-23T00:00:00Z', 'count': 1, 'uniques': 1},
  {'timestamp': '2022-11-24T00:00:00Z', 'count': 1, 'uniques': 1},
  {'timestamp': '2022-11-25T00:00:00Z', 'count': 4, 'uniques': 3}]}

---
### Review metrics: Traffic - Top 10 Content

In [42]:
# top 10 popular content for previous 14 days
response = requests.get(
    f'{github_api_url}/traffic/popular/paths',
    headers = {
        'Authorization': f'Bearer {pat}',
        'Accept': 'application/vnd.github+json',
    },
)
json.loads(response.text)

[{'path': '/statmike/vertex-ai-mlops',
  'title': 'statmike/vertex-ai-mlops: Google Cloud Platform Vertex AI end-to-end workflow...',
  'count': 435,
  'uniques': 185},
 {'path': '/statmike/vertex-ai-mlops/tree/main/02%20-%20Vertex%20AI%20AutoML',
  'title': 'vertex-ai-mlops/02 - Vertex AI AutoML at main · statmike/vertex-ai-mlops · Gi...',
  'count': 74,
  'uniques': 38},
 {'path': '/statmike/vertex-ai-mlops/tree/main/00%20-%20Setup',
  'title': 'vertex-ai-mlops/00 - Setup at main · statmike/vertex-ai-mlops · GitHub',
  'count': 57,
  'uniques': 33},
 {'path': '/statmike/vertex-ai-mlops/tree/main/05%20-%20TensorFlow',
  'title': 'vertex-ai-mlops/05 - TensorFlow at main · statmike/vertex-ai-mlops · GitHub',
  'count': 55,
  'uniques': 32},
 {'path': '/statmike/vertex-ai-mlops/blob/main/00%20-%20Setup/00%20-%20Environment%20Setup.ipynb',
  'title': 'vertex-ai-mlops/00 - Environment Setup.ipynb at main · statmike/vertex-ai-mlops',
  'count': 46,
  'uniques': 28},
 {'path': '/statmike/ver

---
### Review metrics: Traffic - Top 10 Referral Sources

In [43]:
# top 10 referral sources for last 14 days
response = requests.get(
    f'{github_api_url}/traffic/popular/referrers',
    headers = {
        'Authorization': f'Bearer {pat}',
        'Accept': 'application/vnd.github+json',
    },
)
json.loads(response.text)

[{'referrer': 'youtube.com', 'count': 325, 'uniques': 75},
 {'referrer': 'github.com', 'count': 221, 'uniques': 38},
 {'referrer': 'Google', 'count': 146, 'uniques': 54},
 {'referrer': 'notebooks.githubusercontent.com', 'count': 25, 'uniques': 13},
 {'referrer': 'art-analytics.appspot.com', 'count': 7, 'uniques': 2},
 {'referrer': 'statics.teams.cdn.office.net', 'count': 4, 'uniques': 2},
 {'referrer': 'm.facebook.com', 'count': 3, 'uniques': 1}]

---
### Review metrics: Traffic - Page Views

In [153]:
# number of views for last 14 days
response = requests.get(
    f'{github_api_url}/traffic/views',
    headers = {
        'Authorization': f'Bearer {pat}',
        'Accept': 'application/vnd.github+json',
    },
)
json.loads(response.text)

{'count': 1524,
 'uniques': 253,
 'views': [{'timestamp': '2022-11-13T00:00:00Z', 'count': 44, 'uniques': 12},
  {'timestamp': '2022-11-14T00:00:00Z', 'count': 123, 'uniques': 35},
  {'timestamp': '2022-11-15T00:00:00Z', 'count': 150, 'uniques': 33},
  {'timestamp': '2022-11-16T00:00:00Z', 'count': 128, 'uniques': 32},
  {'timestamp': '2022-11-17T00:00:00Z', 'count': 166, 'uniques': 41},
  {'timestamp': '2022-11-18T00:00:00Z', 'count': 74, 'uniques': 31},
  {'timestamp': '2022-11-19T00:00:00Z', 'count': 89, 'uniques': 13},
  {'timestamp': '2022-11-20T00:00:00Z', 'count': 147, 'uniques': 20},
  {'timestamp': '2022-11-21T00:00:00Z', 'count': 141, 'uniques': 29},
  {'timestamp': '2022-11-22T00:00:00Z', 'count': 78, 'uniques': 20},
  {'timestamp': '2022-11-23T00:00:00Z', 'count': 74, 'uniques': 21},
  {'timestamp': '2022-11-24T00:00:00Z', 'count': 156, 'uniques': 34},
  {'timestamp': '2022-11-25T00:00:00Z', 'count': 108, 'uniques': 19},
  {'timestamp': '2022-11-26T00:00:00Z', 'count': 29, 

---
### Review metrics: Activity - List Repository Events

In [366]:
# a list of dicts that represent events on the repository like commit, star, watch, fork, ..
response = requests.get(
    f'{github_api_url}/events',
    headers = {
        'Authorization': f'Bearer {pat}',
        'Accept': 'application/vnd.github+json',
    },
)
# display is left disabled for this response:
#json.loads(response.text)

---
## IDEA

- Raw Data To BigQuery
    - collect metrics first time
    - write to json file
        - avoid nested arrays (a list of lists)
        - include timestamp `collect_time`
    - store file in GCS
    - Create a BigQuery Dataset
    - Create/Replace a BigQuery Table from the file
        - partition by `collect_time`
    - collect metrics again, save to file, store in GCS
    - append new metrics to BigQuery Table


- Automate Data To BigQuery (Daily)
    - Cloud Scheduler > PubSub > Cloud Function
        - Get Secret for PAT
        - Fetch from GitHub API
        - Store in BigQuery
        - Create Pub/Sub Topic
        - Create a Cloud Scheduler job for each night at 4AM to trigger the Pub/Sub Topic
        - Write Cloud Function subscribed to Pub/Sub Topic that updates tables each night: insert, append
        - Trigger DataForm ELT process

---
## Raw Data To BigQuery

---
### Collect Metrics

Create a json object that stores the returns from the api calls:

In [222]:
# Timestamp the collection in UTC: the 'Z' suffix written below declares UTC,
# so a naive local-time `datetime.now()` would mislabel the time on a non-UTC machine.
current_datetime = datetime.now(timezone.utc)

# GitHub API endpoints (relative to the repository URL) to collect
paths = ['community/profile', 'stats/code_frequency', 'stats/commit_activity', 'stats/contributors', 'stats/participation', 'traffic/clones', 'traffic/popular/paths', 'traffic/popular/referrers', 'traffic/views']

# One record per collection run; each endpoint's response is added to it under its own key
responses = {
    'collect_time': current_datetime.strftime("%Y-%m-%dT%H:%M:%SZ"),
    'github_user': github_user,
    'github_repo': github_repo
}

In [223]:
# Collect every endpoint listed in `paths` into `responses`, keyed by the path with '/' -> '_'.
api_headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'}
MAX_POLLS = 10  # upper bound on 202 retries per endpoint so the cell cannot hang forever

for path in paths:
    url = f'{github_api_url}/{path}'
    response = requests.get(url, headers = api_headers)

    # 202 means GitHub is still computing the statistics: wait and ask again, a bounded number of times
    polls = 0
    while response.status_code == 202 and polls < MAX_POLLS:
        time.sleep(30)
        response = requests.get(url, headers = api_headers)
        polls += 1
    if response.status_code == 202:
        raise TimeoutError(f'GitHub is still computing statistics for {url}')

    # fail fast on auth/permission/not-found errors instead of storing an error body as metrics
    response.raise_for_status()

    payload = json.loads(response.text)
    if path == 'stats/code_frequency':
        # an array of arrays cannot be loaded, so convert each row to a dict
        payload = [{'week': v[0], 'additions': v[1], 'deletions': v[2]} for v in payload]
    responses[path.replace('/', '_')] = {'response': payload}

---
### Write to File

In [224]:
# Serialize the collected record as a single JSON line in the local working folder
local_json_path = f"{DIR}/responses_{current_datetime.strftime('%Y%m%d%H%M%S')}.json"
with open(local_json_path, 'w') as handle:
    handle.write(json.dumps(responses))

---
### Store in GCS

In [225]:
# Copy the local JSON file to the bucket named after the project, under architectures/tracking/
responses_filename = f"responses_{current_datetime.strftime('%Y%m%d%H%M%S')}.json"
bucket = gcs.bucket(PROJECT_ID)
blob = bucket.blob(f"architectures/tracking/{responses_filename}")
blob.upload_from_filename(f"{DIR}/{responses_filename}")

In [226]:
list(bucket.list_blobs(prefix = 'architectures/tracking'))

[<Blob: statmike-mlops-349915, architectures/tracking/responses_20221127032906.json, 1669519749193090>]

In [227]:
bucket.name

'statmike-mlops-349915'

In [228]:
blob.name

'architectures/tracking/responses_20221127032906.json'

---
### Create BigQuery Dataset

In [229]:
# Describe the `github_api` dataset, then create it (an existing dataset is accepted via exists_ok)
dataset_spec = bigquery.Dataset(f"{PROJECT_ID}.github_api")
dataset_spec.location = 'us' #REGION
ds = bq.create_dataset(dataset = dataset_spec, exists_ok = True)

In [230]:
ds.dataset_id

'github_api'

In [231]:
ds.full_dataset_id

'statmike-mlops-349915:github_api'

In [259]:
ds_fetch = bq.get_dataset(f"{PROJECT_ID}.github_api")

In [262]:
ds_fetch.full_dataset_id

'statmike-mlops-349915:github_api'

---
### Create/Replace BigQuery Table

With time partition on collect_time

In [237]:
# Initial load: truncate/replace the `raw` table with the uploaded file,
# autodetecting the schema and partitioning on `collect_time`.
job_config = bigquery.LoadJobConfig()
job_config.write_disposition = 'WRITE_TRUNCATE' # WRITE_EMPTY, WRITE_APPEND
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
job_config.autodetect = True
job_config.time_partitioning = bigquery.table.TimePartitioning(field='collect_time')

destination = bigquery.TableReference.from_string(f"{PROJECT_ID}.{ds.dataset_id}.raw")
job = bq.load_table_from_uri(f"gs://{bucket.name}/{blob.name}", destination, job_config = job_config)
job.result()

LoadJob<project=statmike-mlops-349915, location=US, id=22e62465-4065-45d5-b81d-af638eab1d68>

In [238]:
job.errors

---
### Collect Metrics Again + Store in GCS (directly)

In [367]:
# Timestamp the collection in UTC: the 'Z' suffix written below declares UTC,
# so a naive local-time `datetime.now()` would mislabel the time on a non-UTC machine.
current_datetime = datetime.now(timezone.utc)

# GitHub API endpoints (relative to the repository URL) to collect; this run also includes `events`
paths = ['community/profile', 'stats/code_frequency', 'stats/commit_activity', 'stats/contributors', 'stats/participation', 'traffic/clones', 'traffic/popular/paths', 'traffic/popular/referrers', 'traffic/views', 'events']

# One record per collection run; each endpoint's response is added to it under its own key
responses = {
    'collect_time': current_datetime.strftime("%Y-%m-%dT%H:%M:%SZ"),
    'github_user': github_user,
    'github_repo': github_repo
}

In [368]:
# Collect every endpoint listed in `paths` into `responses`, keyed by the path with '/' -> '_'.
api_headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'}
MAX_POLLS = 10  # upper bound on 202 retries per endpoint so the cell cannot hang forever

for path in paths:
    url = f'{github_api_url}/{path}'
    response = requests.get(url, headers = api_headers)

    # 202 means GitHub is still computing the statistics: wait and ask again, a bounded number of times
    polls = 0
    while response.status_code == 202 and polls < MAX_POLLS:
        time.sleep(30)
        response = requests.get(url, headers = api_headers)
        polls += 1
    if response.status_code == 202:
        raise TimeoutError(f'GitHub is still computing statistics for {url}')

    # fail fast on auth/permission/not-found errors instead of storing an error body as metrics
    response.raise_for_status()

    payload = json.loads(response.text)
    if path == 'stats/code_frequency':
        # an array of arrays cannot be loaded, so convert each row to a dict
        payload = [{'week': v[0], 'additions': v[1], 'deletions': v[2]} for v in payload]
    responses[path.replace('/', '_')] = {'response': payload}

In [369]:
# Upload the serialized record straight from memory (no local file this time)
bucket = gcs.bucket(PROJECT_ID)
object_name = f"architectures/tracking/responses_{current_datetime.strftime('%Y%m%d%H%M%S')}.json"
blob = bucket.blob(object_name)
blob.upload_from_string(json.dumps(responses))

---
### Append Metrics to BigQuery Table

In [371]:
# Append the newly uploaded file to the `raw` table; new fields found by
# schema autodetection are allowed to be added to the table.
job_config = bigquery.LoadJobConfig()
job_config.write_disposition = 'WRITE_APPEND' # WRITE_EMPTY, WRITE_APPEND
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
job_config.autodetect = True
job_config.schema_update_options = [bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION]

destination = bigquery.TableReference.from_string(f"{PROJECT_ID}.{ds.dataset_id}.raw")
job = bq.load_table_from_uri(f"gs://{bucket.name}/{blob.name}", destination, job_config = job_config)
job.result()

LoadJob<project=statmike-mlops-349915, location=US, id=a09d992a-b9c8-4e1c-98ac-7afe38a210cd>

---
## Automate Data Collection: Daily

### Package Installs (if needed)

This notebook uses the Python Clients for
- Google Service Usage
    - to enable APIs
- Cloud Pub/Sub
- Cloud Functions
- Cloud Scheduler

The cells below check whether the required Python libraries are installed.  If any are missing, the cell prints a message and installs the package with pip.  These installs must be completed before continuing this notebook.

In [244]:
try:
    import google.cloud.service_usage_v1
except ImportError:
    print('You need to pip install google-cloud-service-usage')
    !pip install google-cloud-service-usage -q

In [245]:
try:
    import google.cloud.pubsub
except ImportError:
    print('You need to pip install google-cloud-pubsub')
    !pip install google-cloud-pubsub -q

In [246]:
try:
    import google.cloud.functions
except ImportError:
    print('You need to pip install google-cloud-functions')
    !pip install google-cloud-functions -q

In [247]:
try:
    import google.cloud.scheduler
except ImportError:
    print('You need to pip install google-cloud-scheduler')
    !pip install google-cloud-scheduler -q

In [251]:
from google.cloud import service_usage_v1
from google.cloud import pubsub_v1
from google.cloud import functions_v1
from google.cloud import scheduler_v1

In [252]:
su_client = service_usage_v1.ServiceUsageClient()
pubsub_pubclient = pubsub_v1.PublisherClient() 
functions_client = functions_v1.CloudFunctionsServiceClient()
scheduler_client = scheduler_v1.CloudSchedulerClient()

---
### Pub/Sub

Use a Pub/Sub topic to trigger a Cloud Function to run.  The topic can receive messages manually or on a schedule from Cloud Scheduler.

The main concepts:
- Topic - a feed of messages
     - Publish - send a new message to a topic
     - Subscription - receive messages that arrive on topic
          - Push - the subscriber has new messages pushed to it
          - Pull - the subscriber requests new messages by pulling them
          
In this example, a topic will be set up for daily runs of metric functions.  Publishing a new message to this topic will trigger one or more Cloud Functions to run, like the one set up below.  The Cloud Function will have a push subscription to the topic.

In [253]:
PUBSUB_TOPIC = 'github_api'

In [254]:
# Look the topic up by its full resource name.  `topic` ends up as the matching
# Topic, or '' when it does not exist -- including when the project has no topics
# at all.  An exact comparison avoids picking a different topic whose name merely
# ends with the same text.
topic_path = pubsub_pubclient.topic_path(PROJECT_ID, PUBSUB_TOPIC)
topic = next(
    (t for t in pubsub_pubclient.list_topics(project = f'projects/{PROJECT_ID}') if t.name == topic_path),
    ''
)

In [255]:
# Reuse the topic when it already exists and create it only when it is missing;
# calling create_topic unconditionally fails once the topic exists.
topic_path = pubsub_pubclient.topic_path(PROJECT_ID, PUBSUB_TOPIC)
topic = next(
    (t for t in pubsub_pubclient.list_topics(project = f'projects/{PROJECT_ID}') if t.name == topic_path),
    None
)
if topic is None:
    topic = pubsub_pubclient.create_topic(name = topic_path)
print(topic)

name: "projects/statmike-mlops-349915/topics/github_api"



---
### Cloud Functions

#### Create Files for Function

In [257]:
# Start from an empty `function` folder: remove any previous copy, then recreate it
function_dir = f'{DIR}/function'
if os.path.exists(function_dir):
    shutil.rmtree(function_dir)
os.makedirs(function_dir)

In [258]:
%%writefile {DIR}/function/requirements.txt
# requests is imported directly by main.py, so declare it instead of relying on a transitive install
pandas
requests
google-cloud-bigquery
google-cloud-storage

Writing temp/function/requirements.txt


In [372]:
%%writefile {DIR}/function/main.py

# packages
import base64
import requests
import json
import time
from datetime import datetime
import pandas as pd
from io import StringIO
import os
from google.cloud import bigquery
from google.cloud import storage



# clients
bq = bigquery.Client()
gcs = storage.Client()



# parameters and defined objects
github_user = 'statmike'
github_repo = 'vertex-ai-mlops'
github_api_url = f'https://api.github.com/repos/{github_user}/{github_repo}'
pat = os.getenv('GITHUB_PAT')
current_datetime = datetime.now()
paths = ['community/profile', 'stats/code_frequency', 'stats/commit_activity', 'stats/contributors', 'stats/participation', 'traffic/clones', 'traffic/popular/paths', 'traffic/popular/referrers', 'traffic/views', '/events']
responses = {
    'collect_time': current_datetime.strftime("%Y-%m-%dT%H:%M:%SZ"),
    'github_user': github_user,
    'github_repo': github_repo
}



def collect(event, context):
    # print inputs to Cloud Function
    function_inputs = json.loads(base64.b64decode(event['data']).decode('utf-8'))
    print(function_inputs)
    PROJECT_ID = function_inputs['PROJECT_ID']
    
    # collect responses from GitHub API:
    for path in paths:
        response = requests.get(f'{github_api_url}/{path}', headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'})

        while response.status_code == 202:
            time.sleep(30)
            response = requests.get(f'{github_api_url}/{path}', headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'})

        if path == 'stats/code_frequency':
            responses[path.replace('/', '_')] = {'response': [{'week': v[0], 'additions': v[1], 'deletions': v[2]} for v in json.loads(response.text)]}
        else:
            responses[path.replace('/', '_')] = {'response': json.loads(response.text)}

    # save to GCS Bucket
    bucket = gcs.bucket(PROJECT_ID)
    blob = bucket.blob(f"architectures/tracking/data/responses_{current_datetime.strftime('%Y%m%d%H%M%S')}.json")
    blob.upload_from_string(json.dumps(responses))
    
    # append to BigQuery Table
    ds = bq.get_dataset(f"{PROJECT_ID}.github_api")
    destination = bigquery.TableReference.from_string(f"{PROJECT_ID}.{ds.dataset_id}.raw")
    job_config = bigquery.LoadJobConfig(
        write_disposition = 'WRITE_APPEND', # WRITE_EMPTY, WRITE_APPEND
        source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        autodetect = True,
        schema_update_options = [bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION]
    )
    job = bq.load_table_from_uri(f"gs://{bucket.name}/{blob.name}", destination, job_config = job_config)
    job.result()

Overwriting temp/function/main.py


#### Store Files in Cloud Storage

Copy from local folder (`DIR/function`) to GCS at the path `architectures/tracking/function`:

In [373]:
!ls {DIR}/function

function.zip  main.py  requirements.txt


In [374]:
import zipfile

# Package the function source; each file is stored at the root of the archive
with zipfile.ZipFile(f'{DIR}/function/function.zip', mode = 'w') as archive:
    for filename in ('main.py', 'requirements.txt'):
        archive.write(f'{DIR}/function/{filename}', filename)

In [375]:
!ls {DIR}/function

function.zip  main.py  requirements.txt


In [376]:
# List the archive contents to confirm what was packaged.
# The handle is deliberately not named `zip`: that would shadow the builtin
# `zip()` for every cell run afterwards in this kernel.
with zipfile.ZipFile(f'{DIR}/function/function.zip', mode = 'r') as archive:
    archive.printdir()

File Name                                             Modified             Size
main.py                                        2022-11-28 01:52:38         2671
requirements.txt                               2022-11-27 15:31:40           78


In [377]:
SOURCEPATH = f'architectures/tracking/function'

In [378]:
blob = bucket.blob(f'{SOURCEPATH}/function.zip')
blob.upload_from_filename(f'{DIR}/function/function.zip')

In [379]:
list(bucket.list_blobs(prefix = f'{SOURCEPATH}'))

[<Blob: statmike-mlops-349915, architectures/tracking/function/function.zip, 1669600363956588>]

In [380]:
print(f"View the bucket directly here:\nhttps://console.cloud.google.com/storage/browser/{PROJECT_ID}/{SOURCEPATH};tab=objects&project={PROJECT_ID}")

View the bucket directly here:
https://console.cloud.google.com/storage/browser/statmike-mlops-349915/architectures/tracking/function;tab=objects&project=statmike-mlops-349915


#### Create (or Update) Cloud Function

In [381]:
SERVICE_ACCOUNT = !gcloud config list --format='value(core.account)' 
SERVICE_ACCOUNT = SERVICE_ACCOUNT[0]
SERVICE_ACCOUNT

'1026793852137-compute@developer.gserviceaccount.com'

In [382]:
function_name = 'github_api'

In [383]:
# Find the deployed function by its exact resource name; '' means it does not exist yet.
# An exact comparison avoids selecting a different function whose name merely
# ends with the same text.
function_path = f'projects/{PROJECT_ID}/locations/{REGION}/functions/{function_name}'
function = next(
    (
        fn for fn in functions_client.list_functions(
            request = functions_v1.ListFunctionsRequest(parent = f'projects/{PROJECT_ID}/locations/{REGION}')
        )
        if fn.name == function_path
    ),
    ''
)

In [384]:
function

name: "projects/statmike-mlops-349915/locations/us-central1/functions/github_api"
source_archive_url: "gs://statmike-mlops-349915/architectures/tracking/function/function.zip"
event_trigger {
  event_type: "providers/cloud.pubsub/eventTypes/topic.publish"
  resource: "projects/statmike-mlops-349915/topics/github_api"
  service: "pubsub.googleapis.com"
  failure_policy {
  }
}
status: ACTIVE
entry_point: "collect"
timeout {
  seconds: 360
}
available_memory_mb: 256
service_account_email: "1026793852137-compute@developer.gserviceaccount.com"
update_time {
  seconds: 1669572349
  nanos: 679000000
}
version_id: 3
runtime: "python310"
ingress_settings: ALLOW_ALL
build_id: "9e4f7407-8109-4796-a0ed-caa75d499793"
secret_environment_variables {
  key: "GITHUB_PAT"
  project_id: "1026793852137"
  secret: "github_api"
  version: "latest"
}
build_name: "projects/1026793852137/locations/us-central1/builds/9e4f7407-8109-4796-a0ed-caa75d499793"
docker_registry: CONTAINER_REGISTRY

In [385]:
from google.protobuf.duration_pb2 import Duration

# Desired state of the Cloud Function: source zip in GCS, triggered by messages
# published to the Pub/Sub topic, running the `collect` entry point on Python 3.10
# with the GitHub PAT injected from the `github_api` secret.
functionDef = functions_v1.CloudFunction(
    name = f'projects/{PROJECT_ID}/locations/{REGION}/functions/{function_name}',
    source_archive_url = f"gs://{PROJECT_ID}/{SOURCEPATH}/function.zip",
    event_trigger = functions_v1.EventTrigger(
        event_type = 'providers/cloud.pubsub/eventTypes/topic.publish',
        resource = topic.name
    ),
    runtime = 'python310',
    entry_point = 'collect',
    service_account_email = SERVICE_ACCOUNT,
    secret_environment_variables = [
        functions_v1.SecretEnvVar(key = 'GITHUB_PAT', secret = 'github_api')
    ]
)
# Allow each invocation up to 360 seconds.
functionDef.timeout = Duration(seconds = 360)

In [386]:
functionDef

name: "projects/statmike-mlops-349915/locations/us-central1/functions/github_api"
source_archive_url: "gs://statmike-mlops-349915/architectures/tracking/function/function.zip"
event_trigger {
  event_type: "providers/cloud.pubsub/eventTypes/topic.publish"
  resource: "projects/statmike-mlops-349915/topics/github_api"
}
entry_point: "collect"
timeout {
  seconds: 360
}
service_account_email: "1026793852137-compute@developer.gserviceaccount.com"
runtime: "python310"
secret_environment_variables {
  key: "GITHUB_PAT"
  secret: "github_api"
}

In [387]:
function

name: "projects/statmike-mlops-349915/locations/us-central1/functions/github_api"
source_archive_url: "gs://statmike-mlops-349915/architectures/tracking/function/function.zip"
event_trigger {
  event_type: "providers/cloud.pubsub/eventTypes/topic.publish"
  resource: "projects/statmike-mlops-349915/topics/github_api"
  service: "pubsub.googleapis.com"
  failure_policy {
  }
}
status: ACTIVE
entry_point: "collect"
timeout {
  seconds: 360
}
available_memory_mb: 256
service_account_email: "1026793852137-compute@developer.gserviceaccount.com"
update_time {
  seconds: 1669572349
  nanos: 679000000
}
version_id: 3
runtime: "python310"
ingress_settings: ALLOW_ALL
build_id: "9e4f7407-8109-4796-a0ed-caa75d499793"
secret_environment_variables {
  key: "GITHUB_PAT"
  project_id: "1026793852137"
  secret: "github_api"
  version: "latest"
}
build_name: "projects/1026793852137/locations/us-central1/builds/9e4f7407-8109-4796-a0ed-caa75d499793"
docker_registry: CONTAINER_REGISTRY

In [388]:
# Choose between updating the existing function and creating a new one, based on
# whether the earlier lookup found a deployed function. Either call returns an
# operation object that is stored in `operation`.
if function:
    request = functions_v1.UpdateFunctionRequest(function = functionDef)
    submit = functions_client.update_function
else:
    request = functions_v1.CreateFunctionRequest(
        location = f"projects/{PROJECT_ID}/locations/{REGION}",
        function = functionDef
    )
    submit = functions_client.create_function

operation = submit(request = request)

In [389]:
# Retrieve the result of the create/update operation and print the returned
# function description (status, version_id, build info, ...).
response = operation.result()
print(response)

name: "projects/statmike-mlops-349915/locations/us-central1/functions/github_api"
source_archive_url: "gs://statmike-mlops-349915/architectures/tracking/function/function.zip"
event_trigger {
  event_type: "providers/cloud.pubsub/eventTypes/topic.publish"
  resource: "projects/statmike-mlops-349915/topics/github_api"
  service: "pubsub.googleapis.com"
  failure_policy {
  }
}
status: ACTIVE
entry_point: "collect"
timeout {
  seconds: 360
}
available_memory_mb: 256
service_account_email: "1026793852137-compute@developer.gserviceaccount.com"
update_time {
  seconds: 1669600531
  nanos: 4000000
}
version_id: 4
runtime: "python310"
ingress_settings: ALLOW_ALL
build_id: "f724ceba-a914-498c-b473-43f671448e73"
secret_environment_variables {
  key: "GITHUB_PAT"
  project_id: "1026793852137"
  secret: "github_api"
  version: "latest"
}
build_name: "projects/1026793852137/locations/us-central1/builds/f724ceba-a914-498c-b473-43f671448e73"
docker_registry: CONTAINER_REGISTRY



In [390]:
print(f'Review the Cloud Function in the console here:\nhttps://console.cloud.google.com/functions/list?env=gen1&project={PROJECT_ID}')

Review the Cloud Function in the console here:
https://console.cloud.google.com/functions/list?env=gen1&project=statmike-mlops-349915


---
### Manual Run of Cloud Function

Publish a message to the Pub/Sub topic that will cause the Cloud Function to run the GitHub traffic collection.  The code below could be run from anywhere you want to trigger a collection!

The function will receive the message as `event` in the format:
```
{
    '@type': 'type.googleapis.com/google.pubsub.v1.PubsubMessage',
    'attributes': {'key' : 'value', ...},
    'data': <base64 encoded string>
}
```

To handle the `event` and retrieve the inputs of the message three things need to happen:
1. reference the 'data' value as `event['data']`
2. decode the 'data' value with `base64.b64decode(<1>).decode('utf-8')`
3. convert the decoded string into a Python dictionary with `json.loads(<2>)`

This looks like:
```
function_inputs = json.loads(base64.b64decode(event['data']).decode('utf-8'))
```

In [361]:
# Payload handed to the Cloud Function through the Pub/Sub message body.
function_input = dict(PROJECT_ID = PROJECT_ID)

In [362]:
# Serialize the payload to JSON and encode it as UTF-8 bytes for publishing.
message = json.dumps(function_input).encode('utf-8')

In [363]:
future = pubsub_pubclient.publish(topic.name, message, trigger = 'manual')

In [364]:
future.result()

'6335322892776834'

---
### Scheduled Run with Cloud Scheduler

Use Cloud Scheduler to publish a message to the topic at any defined interval, which will cause the Cloud Function to run the GitHub traffic collection.

Resources:
- List of Time zones - [TZ Database Names](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones)
- Job Frequency - [unix-cron format guide](https://man7.org/linux/man-pages/man5/crontab.5.html)
    - minute hour day_of_month month day_of_week
    - 0 23 * * tue = 11PM every Tuesday



In [331]:
schedule_name = 'github_api'

In [332]:
# Look for an existing Cloud Scheduler job with this exact job id in the target
# project/region. `schedule` is left as '' (falsy) when nothing matches so the
# next cell can decide whether the job still has to be created.
# The match is anchored on '/jobs/<id>' so that a job whose id merely ends with
# the same text is not mistaken for this one.
schedule = next(
    (
        candidate
        for candidate in scheduler_client.list_jobs(parent = f'projects/{PROJECT_ID}/locations/{REGION}')
        if candidate.name.endswith(f'/jobs/{schedule_name}')
    ),
    ''
)

In [333]:
# Create the scheduler job only when the lookup did not find one, then print the
# job (existing or newly created) either way.
if not schedule:
    # Publishes `message` to the topic with a 'trigger' attribute of 'scheduled',
    # on the cron schedule '0 6 * * *' in the America/New_York time zone.
    job_spec = scheduler_v1.Job(
        name = f'projects/{PROJECT_ID}/locations/{REGION}/jobs/{schedule_name}',
        pubsub_target = scheduler_v1.PubsubTarget(
            topic_name = topic.name,
            data = message,
            attributes = {'trigger': 'scheduled'}
        ),
        schedule = '0 6 * * *',
        time_zone = 'America/New_York'
    )
    request = scheduler_v1.CreateJobRequest(
        parent = f'projects/{PROJECT_ID}/locations/{REGION}',
        job = job_spec
    )
    schedule = scheduler_client.create_job(request = request)

print(schedule)

name: "projects/statmike-mlops-349915/locations/us-central1/jobs/github_api"
pubsub_target {
  topic_name: "projects/statmike-mlops-349915/topics/github_api"
  data: "{\"PROJECT_ID\": \"statmike-mlops-349915\"}"
  attributes {
    key: "trigger"
    value: "scheduled"
  }
}
user_update_time {
  seconds: 1669567409
}
state: ENABLED
schedule: "0 6 * * *"
time_zone: "America/New_York"



In [334]:
print(f'Review the Cloud Scheduler in the console here:\nhttps://console.cloud.google.com/cloudscheduler?&project={PROJECT_ID}')

Review the Cloud Scheduler in the console here:
https://console.cloud.google.com/cloudscheduler?&project=statmike-mlops-349915


---
## ELT

Two phases
- initial build
- incremental build

In [397]:
# One row per collection run: the snapshot-level view/clone totals plus the
# single daily entry dated one day before the collection date ("yesterday").
# Both daily arrays are unnested and filtered to that one-day offset, so a run
# only appears when it has a matching entry in BOTH the views and clones arrays.
# The table reference is built from PROJECT_ID rather than a hard-coded project
# so the cell works in whichever project the notebook is run.
query = f"""
SELECT
    # repo info
    github_user, github_repo, collect_time,

    # traffic views info: rolling 2 weeks, yesterday
    traffic_views.response.uniques as traffic_views_unique_14day,
    traffic_views.response.count as traffic_views_count_14day,
    traffic_views_un.uniques as traffic_views_unique_1day,
    traffic_views_un.count as traffic_views_count_1day,

    # traffic clones info: rolling 2 weeks, yesterday
    traffic_clones.response.uniques as traffic_clones_unique_14day,
    traffic_clones.response.count as traffic_clones_count_14day,
    traffic_clones_un.uniques as traffic_clones_unique_1day,
    traffic_clones_un.count as traffic_clones_count_1day

FROM `{PROJECT_ID}.github_api.raw`,
    UNNEST(traffic_views.response.views) AS traffic_views_un,
    UNNEST(traffic_clones.response.clones) AS traffic_clones_un
WHERE
    DATE_DIFF(DATE(collect_time), DATE(traffic_views_un.timestamp), DAY) = 1
    AND
    DATE_DIFF(DATE(collect_time), DATE(traffic_clones_un.timestamp), DAY) = 1
ORDER BY collect_time
"""
results = bq.query(query = query).to_dataframe()
results

Unnamed: 0,github_user,github_repo,collect_time,traffic_views_unique_14day,traffic_views_count_14day,traffic_views_unique_1day,traffic_views_count_1day,traffic_clones_unique_14day,traffic_clones_count_14day,traffic_clones_unique_1day,traffic_clones_count_1day
0,statmike,vertex-ai-mlops,2022-11-27 03:29:06+00:00,253,1524,10,29,43,57,6,10
1,statmike,vertex-ai-mlops,2022-11-27 03:38:03+00:00,253,1524,10,29,43,57,6,10
2,statmike,vertex-ai-mlops,2022-11-27 17:57:41+00:00,246,1686,10,29,42,59,6,10
3,statmike,vertex-ai-mlops,2022-11-27 18:05:16+00:00,248,1689,10,29,42,59,6,10
4,statmike,vertex-ai-mlops,2022-11-28 01:43:35+00:00,249,1703,13,241,43,61,3,6
5,statmike,vertex-ai-mlops,2022-11-28 11:00:13+00:00,242,1693,13,241,41,61,3,6
6,statmike,vertex-ai-mlops,2022-11-29 11:00:11+00:00,239,1713,25,114,44,66,10,16
7,statmike,vertex-ai-mlops,2022-11-30 11:00:11+00:00,236,1708,36,222,43,67,5,8
8,statmike,vertex-ai-mlops,2022-12-02 11:00:10+00:00,239,1770,31,142,46,79,4,5
9,statmike,vertex-ai-mlops,2022-12-03 11:00:09+00:00,224,1726,24,80,41,75,2,2


---
# END OF LIFE STEPS

Now that the full ETL process for this data is set up, the collected data can be used to create an initial history, since the function has been running for several months logging raw daily data.

The section below will use the logic developed in the official setup to create initial tables for `github_metrics`. 

Then:
 - [ ] Remove Cloud Function
 - [ ] Remove Cloud Scheduler
 - [ ] Remove Pub/Sub
 - [ ] Delete Bucket Folder path
 - [ ] Delete BQ Dataset/Tables
 
 
 


In [24]:
# BigQuery location of the collected GitHub data used by the queries below:
# dataset `github_api` inside the notebook's own project.
BQ_DATASET = 'github_api'
BQ_PROJECT = PROJECT_ID

### RAW Get Last Collection Per Day

Sometimes the script ran multiple times per day; keep only the last retrieval (the latest `collect_time`) for each day.

In [77]:
# Build `raw_last`: add a `collect_date` column (the date part of `collect_time`)
# to `raw`, then keep exactly one row per `collect_date` -- the one with the
# latest `collect_time` (ARRAY_AGG ... ORDER BY collect_time DESC LIMIT 1).
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.raw_last` AS
    WITH
        ADDDATE AS (SELECT *, DATE(collect_time) as collect_date FROM `{BQ_PROJECT}.{BQ_DATASET}.raw`),
        REDUCER AS (
            SELECT AS VALUE ARRAY_AGG(t ORDER BY collect_time DESC LIMIT 1)[OFFSET(0)]
            FROM ADDDATE t
            GROUP BY collect_date)
    SELECT *
    FROM REDUCER
    ORDER BY collect_date, collect_time
"""
# The statement creates a table, so the job is only waited on; no rows are
# pulled into a DataFrame.
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f55df6bf190>

## Traffic/Clones

In [97]:
BQ_TABLE_PREFIX = 'traffic_clones'

In [98]:
# Build `{BQ_TABLE_PREFIX}_raw`: unnest the `clones` array of every `raw_last`
# row into one row per (collect_date, clone entry). Each row carries the entry's
# own count/uniques, the snapshot-level `uniques` (aliased `uniques_last14days`),
# and the repo as 'user/repo'. Entry timestamps are formatted as ISO-style strings.
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw` AS
SELECT
    collect_date,
    FORMAT_TIMESTAMP('%Y-%m-%dT%H:%M:%SZ', traffic_clones_un.timestamp) as timestamp,
    traffic_clones_un.count as count,
    traffic_clones_un.uniques as uniques,
    traffic_clones.response.uniques as uniques_last14days,
    CONCAT(github_user, '/', github_repo) as repo
FROM `{BQ_PROJECT}.{BQ_DATASET}.raw_last`, 
    UNNEST(traffic_clones.response.clones) AS traffic_clones_un
ORDER BY collect_date, timestamp
"""
# The statement creates a table, so the job is only waited on; no rows are
# pulled into a DataFrame.
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f55df4d4250>

In [106]:
# Build `{BQ_TABLE_PREFIX}_prep` from `{BQ_TABLE_PREFIX}_raw` by joining three
# aggregates on (timestamp, repo):
#   COL_U14     - MAX(uniques_last14days) per collect_date, keyed by that date
#                 formatted as a midnight timestamp string
#   COL_COUNT   - MAX(count) per entry timestamp
#   COL_UNIQUES - MAX(uniques) per entry timestamp
# COL_U14 is the left side of both LEFT JOINs, so the result has one row per
# collection date; count/uniques are NULL where no entry has that timestamp.
# NOTE(review): presumably the same day appears in several daily snapshots,
# which is why MAX is used to pick one value per day -- confirm.
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_prep` AS
WITH
    COL_U14 AS (
        SELECT
            FORMAT_TIMESTAMP('%Y-%m-%dT%H:%M:%SZ', TIMESTAMP(collect_date)) as timestamp,
            repo, MAX(uniques_last14days) as uniques_last14days
        FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw`
        GROUP BY collect_date, repo
    ),
    COL_COUNT AS (
        SELECT timestamp, repo, MAX(count) as count
        FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw`
        GROUP BY timestamp, repo
    ),
    COL_UNIQUES AS (
        SELECT timestamp, repo, MAX(uniques) as uniques
        FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw`
        GROUP BY timestamp, repo
    )
SELECT *
FROM COL_U14
LEFT OUTER JOIN COL_UNIQUES USING(timestamp, repo)
LEFT OUTER JOIN COL_COUNT USING(timestamp, repo)
ORDER BY timestamp
"""
# The statement creates a table, so the job is only waited on; no rows are
# pulled into a DataFrame.
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f55ca978290>

## Traffic/popular/paths

In [113]:
BQ_TABLE_PREFIX = 'traffic_popular_paths'

In [114]:
# Build `{BQ_TABLE_PREFIX}_raw`: unnest the `traffic_popular_paths.response`
# array of every `raw_last` row into one row per popular path, with its count,
# uniques, title and path, the collection day as a midnight timestamp string,
# and the repo as 'user/repo'.
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw` AS
SELECT
    traffic_un.count as count,
    traffic_un.uniques as uniques,
    traffic_un.title as title,
    traffic_un.path as path,
    FORMAT_TIMESTAMP('%Y-%m-%dT00:00:00Z', collect_time) as timestamp,
    CONCAT(github_user, '/', github_repo) as repo
FROM `{BQ_PROJECT}.{BQ_DATASET}.raw_last`, 
    UNNEST(traffic_popular_paths.response) AS traffic_un
ORDER BY collect_time, count DESC
"""
# The statement creates a table, so the job is only waited on; no rows are
# pulled into a DataFrame.
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f55df4df9d0>

It is not a best practice to extract (BQ -> Pandas), then transform, then load (Pandas -> BQ).  Since this is a one-time operation, it takes less syntax to parse the `path` and `title` columns using Pandas.

In [115]:
# Pull the unnested popular-path rows (same SELECT as the table built in the
# previous cell) into a pandas DataFrame named `extract` so the `path` column
# can be transformed in Python. Only the first rows are displayed.
query = f"""
SELECT
    traffic_un.count as count,
    traffic_un.uniques as uniques,
    traffic_un.title as title,
    traffic_un.path as path,
    FORMAT_TIMESTAMP('%Y-%m-%dT00:00:00Z', collect_time) as timestamp,
    CONCAT(github_user, '/', github_repo) as repo
FROM `{BQ_PROJECT}.{BQ_DATASET}.raw_last`, 
    UNNEST(traffic_popular_paths.response) AS traffic_un
ORDER BY collect_time, count DESC
"""
extract = bq.query(query = query).to_dataframe()
extract.head()

Unnamed: 0,count,uniques,title,path,timestamp,repo
0,429,179,statmike/vertex-ai-mlops: Google Cloud Platfor...,/statmike/vertex-ai-mlops,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops
1,73,37,vertex-ai-mlops/02 - Vertex AI AutoML at main ...,/statmike/vertex-ai-mlops/tree/main/02%20-%20V...,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops
2,55,31,vertex-ai-mlops/00 - Setup at main · statmike/...,/statmike/vertex-ai-mlops/tree/main/00%20-%20S...,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops
3,55,31,vertex-ai-mlops/05 - TensorFlow at main · stat...,/statmike/vertex-ai-mlops/tree/main/05%20-%20T...,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops
4,47,28,vertex-ai-mlops/00 - Environment Setup.ipynb a...,/statmike/vertex-ai-mlops/blob/main/00%20-%20S...,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops


In [116]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee that
# `urllib.parse` is loaded (it only works if some other library imported it first).
import urllib.parse

def parse_path(p):
    """Convert a GitHub traffic `path` value into a repository file path.

    The path is URL-decoded and the 'blob/main/' and 'tree/main/' segments are
    removed. If the result does not end in a file name with an extension (there
    is no '.', or the last '.' comes before the last '/'), it is treated as a
    folder and '/readme.md' is appended.

    Args:
        p: URL-encoded path string, e.g.
            '/statmike/vertex-ai-mlops/tree/main/00%20-%20Setup'.

    Returns:
        The decoded file path, e.g.
        '/statmike/vertex-ai-mlops/00 - Setup/readme.md'.
    """
    p = urllib.parse.unquote(p).replace('blob/main/', '').replace('tree/main/', '')
    last_dot = p.rfind('.')
    # No extension in the final path segment -> the page shown was a folder view.
    if last_dot == -1 or last_dot < p.rfind('/'):
        p += '/readme.md'
    return p

# Derive the `file` column from `path` with a column-wise map (one call per
# value, always yields a Series -- unlike row-wise DataFrame.apply, which
# returns a DataFrame when the frame is empty), then drop the columns that
# are no longer needed before loading back to BigQuery.
extract['file'] = extract['path'].map(parse_path)
extract = extract.drop(columns = ['title', 'path'])
extract.head()

Unnamed: 0,count,uniques,timestamp,repo,file
0,429,179,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops,/statmike/vertex-ai-mlops/readme.md
1,73,37,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops,/statmike/vertex-ai-mlops/02 - Vertex AI AutoM...
2,55,31,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops,/statmike/vertex-ai-mlops/00 - Setup/readme.md
3,55,31,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops,/statmike/vertex-ai-mlops/05 - TensorFlow/read...
4,47,28,2022-11-27T00:00:00Z,statmike/vertex-ai-mlops,/statmike/vertex-ai-mlops/00 - Setup/00 - Envi...


In [117]:
# Write the transformed DataFrame back to BigQuery, replacing the contents of
# the `{BQ_TABLE_PREFIX}_raw` table.
destination_table = bigquery.TableReference.from_string(f"{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw")
truncate_config = bigquery.LoadJobConfig(
    autodetect = True, # detect schema
    # WRITE_TRUNCATE = replace if exists, WRITE_APPEND = append if exists,
    # WRITE_EMPTY = write new but dont overwrite
    write_disposition = 'WRITE_TRUNCATE',
)
load_job = bq.load_table_from_dataframe(
    dataframe = extract,
    destination = destination_table,
    job_config = truncate_config
)
load_job.result()

LoadJob<project=statmike-mlops-349915, location=US, id=1323f4fd-0b61-40c6-930e-249e65153db5>

In [118]:
# Build `{BQ_TABLE_PREFIX}_prep` as a straight copy of `{BQ_TABLE_PREFIX}_raw`
# (all columns), selected in timestamp order with the highest counts first.
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_prep` AS
SELECT *
FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw`
ORDER BY timestamp, count DESC 
"""
# The statement creates a table, so the job is only waited on; no rows are
# pulled into a DataFrame.
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f55c9d8edd0>

## Traffic/popular/referrers

In [110]:
BQ_TABLE_PREFIX = 'traffic_popular_referrers'

In [111]:
# Build `{BQ_TABLE_PREFIX}_raw`: unnest the `traffic_popular_referrers.response`
# array of every `raw_last` row into one row per referrer, with its count and
# uniques, the collection day as a midnight timestamp string, and the repo as
# 'user/repo'.
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw` AS
SELECT
    traffic_un.referrer as referrer,
    traffic_un.count as count,
    traffic_un.uniques as uniques,
    FORMAT_TIMESTAMP('%Y-%m-%dT00:00:00Z', collect_time) as timestamp,
    CONCAT(github_user, '/', github_repo) as repo
FROM `{BQ_PROJECT}.{BQ_DATASET}.raw_last`, 
    UNNEST(traffic_popular_referrers.response) AS traffic_un
ORDER BY collect_time, count DESC
"""
# The statement creates a table, so the job is only waited on; no rows are
# pulled into a DataFrame.
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f55df4c22d0>

In [112]:
# Build `{BQ_TABLE_PREFIX}_prep` as a straight copy of `{BQ_TABLE_PREFIX}_raw`
# (all columns), selected in timestamp order with the highest counts first.
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_prep` AS
SELECT *
FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw`
ORDER BY timestamp, count DESC
"""
# The statement creates a table, so the job is only waited on; no rows are
# pulled into a DataFrame.
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f55f4317550>

## Traffic/Views

In [107]:
BQ_TABLE_PREFIX = 'traffic_views'

In [108]:
# Build `{BQ_TABLE_PREFIX}_raw`: unnest the `views` array of every `raw_last`
# row into one row per (collect_date, view entry). Each row carries the entry's
# own count/uniques, the snapshot-level `uniques` (aliased `uniques_last14days`),
# and the repo as 'user/repo'. Entry timestamps are formatted as ISO-style strings.
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw` AS
SELECT
    collect_date,
    FORMAT_TIMESTAMP('%Y-%m-%dT%H:%M:%SZ', traffic_views_un.timestamp) as timestamp,
    traffic_views_un.count as count,
    traffic_views_un.uniques as uniques,
    traffic_views.response.uniques as uniques_last14days,
    CONCAT(github_user, '/', github_repo) as repo
FROM `{BQ_PROJECT}.{BQ_DATASET}.raw_last`, 
    UNNEST(traffic_views.response.views) AS traffic_views_un
ORDER BY collect_date, timestamp
"""
# The statement creates a table, so the job is only waited on; no rows are
# pulled into a DataFrame.
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f55c9d64e50>

In [109]:
# Build `{BQ_TABLE_PREFIX}_prep` from `{BQ_TABLE_PREFIX}_raw` by joining three
# aggregates on (timestamp, repo):
#   COL_U14     - MAX(uniques_last14days) per collect_date, keyed by that date
#                 formatted as a midnight timestamp string
#   COL_COUNT   - MAX(count) per entry timestamp
#   COL_UNIQUES - MAX(uniques) per entry timestamp
# COL_U14 is the left side of both LEFT JOINs, so the result has one row per
# collection date; count/uniques are NULL where no entry has that timestamp.
# NOTE(review): presumably the same day appears in several daily snapshots,
# which is why MAX is used to pick one value per day -- confirm.
query = f"""
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_prep` AS
WITH
    COL_U14 AS (
        SELECT
            FORMAT_TIMESTAMP('%Y-%m-%dT%H:%M:%SZ', TIMESTAMP(collect_date)) as timestamp,
            repo, MAX(uniques_last14days) as uniques_last14days
        FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw`
        GROUP BY collect_date, repo
    ),
    COL_COUNT AS (
        SELECT timestamp, repo, MAX(count) as count
        FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw`
        GROUP BY timestamp, repo
    ),
    COL_UNIQUES AS (
        SELECT timestamp, repo, MAX(uniques) as uniques
        FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE_PREFIX}_raw`
        GROUP BY timestamp, repo
    )
SELECT *
FROM COL_U14
LEFT OUTER JOIN COL_UNIQUES USING(timestamp, repo)
LEFT OUTER JOIN COL_COUNT USING(timestamp, repo)
ORDER BY timestamp
"""
# The statement creates a table, so the job is only waited on; no rows are
# pulled into a DataFrame.
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f55df556890>

## REMOVE RESOURCES