In [22]:
import calendar
import datetime
import json
import time

import boto3
import pandas as pd
import requests

In [3]:
# Parse a day/month/year string and show its epoch value; time.mktime reads the
# parsed fields as local time, so the displayed number depends on the machine's timezone.
s = "01/12/2011"
time.mktime(time.strptime(s, "%d/%m/%Y"))

1322726400.0

Parameters for scraping
- Start date and end date
    - Should be in UTC (the timestamp helper below treats its input as UTC)
    - Input should include: year, month, day, and time of day (to the second)
- Interval to scrape (every hour, every minute, every second, etc)

In [2]:
def get_unix_timestamp(time):
    '''
    Convert a UTC date/time, given as a list of its components, into a unix timestamp.

    INPUT: time -- list in the form [year, month, day, hours (24), minutes, seconds], in UTC
    OUTPUT: Int unix timestamp (seconds since the epoch)
    '''
    # Only the first six entries are read. The parameter name hides the `time` module
    # inside this function, which is harmless because the module is not used here.
    components = [time[position] for position in range(6)]
    moment = datetime.datetime(*components)
    # calendar.timegm interprets the time tuple as UTC (no local-timezone adjustment).
    return calendar.timegm(moment.timetuple())

def get_venmo_url(start_unix_timestamp, end_unix_timestamp, limit=1000000):
    '''
    Build the public Venmo API URL for one time window.

    INPUT: start_unix_timestamp, end_unix_timestamp -- values for the `since` and `until` query parameters
           limit -- value for the `limit` query parameter
    OUTPUT: Str URL
    '''
    url_template = 'https://venmo.com/api/v5/public?since={since}&until={until}&limit={limit}'
    return url_template.format(
        since=start_unix_timestamp,
        until=end_unix_timestamp,
        limit=limit,
    )

def get_venmo_data(url, timeout=30):
    '''
    Fetch a URL and return its decoded JSON body.

    INPUT: url -- Str URL to request (e.g. the output of get_venmo_url)
           timeout -- seconds to wait for the server before giving up; without a timeout
                      requests.get can block indefinitely on an unresponsive server
    OUTPUT: the parsed JSON response
    RAISES: requests.exceptions.HTTPError for 4xx/5xx responses,
            requests.exceptions.Timeout when the server does not answer in time
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error payload to the caller.
    response.raise_for_status()
    return response.json()

def scrape(start_date, end_date, interval, limit=1000000):
    '''
    Scrape the public Venmo API in consecutive time windows and upload each window to s3.

    INPUT: start_date and end_date, each as list in format: [year, month, day, hours (24), minutes, seconds]
               (converted with get_unix_timestamp)
           interval -- Int of min 10, the length in seconds of each window to request
           limit -- Int sent to the API as the `limit` query parameter
    OUTPUT: list holding the 'data' payload of every window, in chronological order.
            Each payload is also uploaded to the 'transaction-data-2018' s3 bucket under
            the key '<since>_<until>_<limit>.json'.
            Prints a message and returns None when interval is below 10.
    '''
    # Validate before touching AWS or the network.
    if interval < 10:
        print('Interval must be at least 10 seconds')
        return None

    start_unix_ts = get_unix_timestamp(start_date)
    end_unix_ts = get_unix_timestamp(end_date)

    # One request per window start; len() of the range is the exact number of requests,
    # including a trailing partial window when the span is not a multiple of interval.
    window_starts = range(start_unix_ts, end_unix_ts, interval)
    scrape_count = len(window_starts)
    print(f'With your parameters, this function will scrape the public venmo API {scrape_count} times.')

    s3 = boto3.resource('s3')
    scraped = []
    for since in window_starts:
        # Clamp the final window so nothing past end_date is requested.
        until = min(since + interval, end_unix_ts)
        url = get_venmo_url(since, until, limit)
        data = get_venmo_data(url)['data']
        scraped.append(data)
        file_name = f'{since}_{until}_{limit}'
        obj = s3.Object('transaction-data-2018', f'{file_name}.json')
        obj.put(Body=json.dumps(data))
        # Pause between requests (presumably to go easy on the API).
        time.sleep(1)
    return scraped

https://venmo.com/api/v5/public?since=1476921600&until=1476921660&limit=1000000


In [382]:
# Smoke test: scrape one minute of data as two 30-second windows (the output below
# reports 2 requests). scrape also uploads each window to the s3 bucket.
practice = scrape([2018, 2, 24, 2, 0, 0], [2018, 2, 24, 2, 1, 0], 30)

With your parameters, this function will scrape the public venmo API 2 times.


In [383]:
# Read back one of the uploaded windows from s3 and load it into a DataFrame.
s3 = boto3.client('s3')
response = s3.get_object(
    Bucket='transaction-data-2018',
    Key='1519437600_1519437630_1000000.json',
)
# response['Body'] is a stream; read it fully, then decode the JSON payload.
body = json.loads(response['Body'].read())
df = pd.DataFrame(body)

In [4]:
# NOTE(review): this rebinds `time`, shadowing the `time` module imported at the top of the
# notebook; `time.sleep` / `time.mktime` raise AttributeError until the import cell is re-run.
time = [2017, 7, 11, 0, 0, 0]

In [6]:
# Build a datetime from the first four fields of the `time` list (year, month, day, hour)
# and show its epoch value with the fields interpreted as UTC.
dt = datetime.datetime(time[0], time[1], time[2], time[3])
calendar.timegm(dt.timetuple())

1499731200

In [9]:
# Epoch seconds for 2017-07-11 00:00:00, with the time tuple interpreted as UTC by calendar.timegm.
calendar.timegm(datetime.datetime(2017, 7, 11, 0, 0, 0).timetuple())

1499731200

In [10]:
# Display the datetime object itself; the repr below drops the zero seconds field.
datetime.datetime(2017, 7, 11, 0, 0, 0)

datetime.datetime(2017, 7, 11, 0, 0)

In [None]:
# NOTE(review): calendar.timegm requires a time tuple argument, so this call raises TypeError
# as written. The cell was never executed (In [None]) -- looks like leftover scratch; remove it
# or pass a time tuple.
calendar.timegm()

In [5]:
# Sample naive datetime for the timestamp round-trip check in the following cells.
dt = datetime.datetime(2014, 11, 22, 15, 0, 1)

In [7]:
# Convert to epoch seconds, treating the naive datetime's fields as UTC.
unix = calendar.timegm(dt.utctimetuple())

In [8]:
# Convert back to a naive UTC datetime; the output below equals the original dt, so the round trip holds.
datetime.datetime.utcfromtimestamp(unix)

datetime.datetime(2014, 11, 22, 15, 0, 1)

In [9]:
def get_unix_timestamp(time):
    '''
    Turn a UTC date/time list into a unix timestamp.

    INPUT: time -- list laid out as [year, month, day, hours (24), minutes, seconds], expressed in UTC
    OUTPUT: Int number of seconds since the unix epoch

    NOTE(review): this redefines get_unix_timestamp from an earlier cell (that version calls
    timetuple() instead of utctimetuple()); whichever cell ran last wins -- keep only one.
    '''
    year, month, day = time[0], time[1], time[2]
    hours, minutes, seconds = time[3], time[4], time[5]
    as_datetime = datetime.datetime(year, month, day, hours, minutes, seconds)
    utc_fields = as_datetime.utctimetuple()
    # calendar.timegm interprets the tuple as UTC.
    return calendar.timegm(utc_fields)

In [10]:
# Check the helper against the value computed by hand in an earlier cell (1499731200).
get_unix_timestamp([2017, 7, 11, 0, 0, 0])

1499731200

In [20]:
# Two dates exactly one year apart, in [year, month, day, hours, minutes, seconds] form.
time1 = [2017, 7, 11, 0, 0, 0]
time2 = [2018, 7, 11, 0, 0, 0]

In [23]:
# Build naive datetimes from the first six fields of each date list.
dt1 = datetime.datetime(*time1[:6])
dt2 = datetime.datetime(*time2[:6])

In [24]:
# time.mktime interprets the time tuple as *local* time (calendar.timegm, used by
# get_unix_timestamp, interprets it as UTC), so these values depend on the machine's timezone.
unix1 = int(time.mktime(dt1.timetuple()))
unix2 = int(time.mktime(dt2.timetuple()))

In [25]:
# Show both local-time timestamps.
unix1, unix2

(1499756400, 1531292400)

In [27]:
# Number of 1800-second (30-minute) windows between the two timestamps.
(unix2-unix1)/1800

17520.0

In [14]:
# NOTE(review): relies on whichever `dt` is currently bound in the kernel; the output of the
# next cell (2017-07-11) suggests it was not the 2014 value assigned above -- verify execution order.
t = calendar.timegm(dt.utctimetuple())

In [15]:
# Convert t back to a naive UTC datetime to inspect it.
datetime.datetime.utcfromtimestamp(t)

datetime.datetime(2017, 7, 11, 0, 0)

In [19]:
# Same dt converted with time.mktime, i.e. its fields are read as local time rather than UTC.
int(time.mktime(dt.timetuple()))

1499756400