In [1]:
import boto3

In [2]:
import os
# Default to the 'gh-activity' AWS profile; an AWS_PROFILE value that is
# already set in the environment is left unchanged.
os.environ.setdefault('AWS_PROFILE', 'gh-activity')

'gh-activity'

In [3]:
def get_job_details(job_name, table_name='jobs'):
    """Fetch the record for one job from a DynamoDB table.

    Args:
        job_name: Value of the ``job_id`` key to look up.
        table_name: Name of the table to read from; defaults to ``'jobs'``.

    Returns:
        The stored item as a dict.

    Raises:
        KeyError: If the response contains no item for that ``job_id``.
    """
    table = boto3.resource('dynamodb').Table(table_name)
    response = table.get_item(Key={'job_id': job_name})
    # When the key is not found the response has no 'Item' entry; raise a
    # KeyError that names the missing job instead of a bare KeyError('Item').
    if 'Item' not in response:
        raise KeyError(f"no job with job_id {job_name!r} in table {table_name!r}")
    return response['Item']

In [4]:
# Look up the 'ghactivity_ingest' job record and display it.
job_details = get_job_details('ghactivity_ingest')
job_details

{'job_description': 'Ingest ghactivity data to s3',
 'is_active': 'Y',
 'job_id': 'ghactivity_ingest',
 'baseline_days': Decimal('3')}

In [5]:
# Pull the 'baseline_days' value out of the job record and display it.
baseline_days = job_details['baseline_days']
baseline_days

Decimal('3')

In [6]:
from datetime import datetime as dt
from datetime import timedelta as td

In [7]:
# Start date: today's date (from the local clock, via dt.now()) minus
# baseline_days days. baseline_days is cast to int before building the
# timedelta.
# NOTE(review): the events loaded later carry UTC timestamps, so a UTC-based
# "today" may be what is intended here — confirm.
start_time = dt.now().date() - td(days=int(baseline_days))
start_time

datetime.date(2022, 11, 14)

In [8]:
# Build the file name for the start date with a '-0' suffix before the extension.
start_file = f"{start_time:%Y-%m-%d}-0.json.gz"

In [9]:
start_file

'2022-11-14-0.json.gz'

In [10]:
# Extend the record fetched earlier with a run bookmark instead of retyping
# its fields by hand. Every attribute already stored for the job is carried
# over unchanged, so the put_item call that follows cannot silently drop one;
# only 'job_run_bookmark_details' is added (or replaced on a re-run).
job_details = {
    **job_details,
    'job_run_bookmark_details': {
        'last_run_file_name': start_file,
    },
}

In [11]:
# Create the boto3 resource handle for DynamoDB.
dynamodb = boto3.resource('dynamodb')

In [12]:
# Get a handle on the 'jobs' table.
table = dynamodb.Table('jobs')

In [13]:
# Write the job record, including its bookmark details, to the 'jobs' table.
table.put_item(Item=job_details)

{'ResponseMetadata': {'RequestId': 'RV6BQDF76O8S74K4KJIOVNGIM3VV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Thu, 17 Nov 2022 05:53:13 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'RV6BQDF76O8S74K4KJIOVNGIM3VV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '2745614147'},
  'RetryAttempts': 0}}

In [14]:
# Read the item back to confirm that the write went through.
table.get_item(Key={'job_id': 'ghactivity_ingest'})

{'Item': {'job_description': 'Ingest ghactivity data to s3',
  'is_active': 'Y',
  'job_id': 'ghactivity_ingest',
  'baseline_days': Decimal('3'),
  'job_run_bookmark_details': {'last_run_file_name': '2022-11-14-0.json.gz'}},
 'ResponseMetadata': {'RequestId': 'FLHDIR64MUNNRR1UUT6P4DD8N7VV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Thu, 17 Nov 2022 05:53:16 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '233',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'FLHDIR64MUNNRR1UUT6P4DD8N7VV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '2271273004'},
  'RetryAttempts': 0}}

In [15]:
# Fetch the job item again and keep only its bookmark details.
job_run_bookmark_details = (
    table
    .get_item(Key={'job_id': 'ghactivity_ingest'})
    ['Item']['job_run_bookmark_details']
)

In [16]:
job_run_bookmark_details

{'last_run_file_name': '2022-11-14-0.json.gz'}

In [17]:
# Keep the text before the first '.', i.e. the file name without '.json.gz'.
dt_part = job_run_bookmark_details['last_run_file_name'].partition('.')[0]

In [18]:
dt_part

'2022-11-14-0'

**Creating the next file name with a one-hour increment**

In [19]:
# Parse the bookmark's date-hour stamp, step forward one hour, and rebuild the
# file name. The hour is written without zero padding (e.g. '-1', not '-01').
# It is formatted from .hour rather than with strftime's '%-H', which is a
# platform-specific extension that not every C library accepts.
next_dt = dt.strptime(dt_part, '%Y-%m-%d-%H') + td(hours=1)
next_file = f"{next_dt:%Y-%m-%d}-{next_dt.hour}.json.gz"

In [20]:
next_file

'2022-11-14-1.json.gz'

In [21]:
import requests

In [22]:
# Download the next archive file. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() stops here on a
# 4xx/5xx reply so an error body is never saved as if it were the .json.gz file.
res = requests.get(f'https://data.gharchive.org/{next_file}', timeout=60)
res.raise_for_status()

In [23]:
# Save the downloaded bytes under data/. The with-block closes the file even
# if the write fails, and makedirs lets the cell run when data/ does not exist.
os.makedirs('data', exist_ok=True)
with open(f'data/{next_file}', 'wb') as file:
    file.write(res.content)

In [26]:
# List data/ (long format, oldest first) to confirm the new file is there.
!ls -ltr data/

total 163160
-rw-r--r-- 1 edward edward 33989636 Nov 14 17:17 2022-06-05-0.json.gz
-rw-r--r-- 1 edward edward 47534740 Nov 16 17:34 2022-11-13-0.json.gz
-rw-r--r-- 1 edward edward 85539087 Nov 17 13:16 2022-11-14-1.json.gz


In [27]:
import pandas as pd

In [28]:
# Load the downloaded file into a DataFrame; lines=True reads one JSON object per line.
df = pd.read_json(f'data/{next_file}', orient='records', lines=True)

In [29]:
df

Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,25209166750,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 458311390, 'name': 'viniciuscorreialour...","{'push_id': 11649029048, 'size': 1, 'distinct_...",True,2022-11-14 01:00:00+00:00,
1,25209166753,PushEvent,"{'id': 106945582, 'login': 'katoushiki', 'disp...","{'id': 563682326, 'name': 'yusaku-uema/GFF2-4B...","{'push_id': 11649029040, 'size': 7, 'distinct_...",True,2022-11-14 01:00:00+00:00,
2,25209166756,WatchEvent,"{'id': 11931030, 'login': 'tmartin8080', 'disp...","{'id': 14453572, 'name': 'ericmj/decimal', 'ur...",{'action': 'started'},True,2022-11-14 01:00:00+00:00,
3,25209166757,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 488716101, 'name': 'michelly-alves/mich...","{'push_id': 11649029052, 'size': 1, 'distinct_...",True,2022-11-14 01:00:00+00:00,
4,25209166759,CreateEvent,"{'id': 14252925, 'login': 'maheshpeddigithub',...","{'id': 565628071, 'name': 'maheshpeddigithub/n...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2022-11-14 01:00:00+00:00,
...,...,...,...,...,...,...,...,...
149899,25209748711,PushEvent,"{'id': 95552879, 'login': 'gabflag', 'display_...","{'id': 465407496, 'name': 'gabflag/projects_in...","{'push_id': 11649348560, 'size': 1, 'distinct_...",True,2022-11-14 01:59:59+00:00,
149900,25209748713,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 448735812, 'name': 'Handowsblack/Handow...","{'push_id': 11649348576, 'size': 1, 'distinct_...",True,2022-11-14 01:59:59+00:00,
149901,25209748723,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 415468661, 'name': 'joaopedrozand/joaop...","{'push_id': 11649348583, 'size': 1, 'distinct_...",True,2022-11-14 01:59:59+00:00,
149902,25209748726,PushEvent,"{'id': 9512067, 'login': 'simonsmh', 'display_...","{'id': 81063788, 'name': 'simonsmh/openwrt-dis...","{'push_id': 11649348566, 'size': 1, 'distinct_...",True,2022-11-14 01:59:59+00:00,


In [30]:
df.columns

Index(['id', 'type', 'actor', 'repo', 'payload', 'public', 'created_at',
       'org'],
      dtype='object')

In [31]:
df.dtypes

id                          int64
type                       object
actor                      object
repo                       object
payload                    object
public                       bool
created_at    datetime64[ns, UTC]
org                        object
dtype: object

In [32]:
df.shape

(149904, 8)