In [1]:
import requests
import pandas as pd
import boto3
import time
import datetime
import calendar
from sys import argv
import json

In [68]:
def get_unix_timestamp(input_time):
    '''
    Convert a UTC date/time, given as a list of fields, into a unix timestamp.

    NOTE: a second function with this same name is defined further down in
    this cell; because it is defined later, it is the one bound to the name
    once the cell has run.

    INPUT: list in following format: [year, month, day, hours (24), minutes, seconds] **for UTC**
    OUTPUT: Int of unix timestamp
    '''
    dt = datetime.datetime(input_time[0], input_time[1], input_time[2],
                           input_time[3], input_time[4], input_time[5])
    # calendar.timegm reads the time tuple as UTC. time.mktime would read it
    # as local time, so the result would shift with the machine's timezone
    # and contradict the "for UTC" contract above.
    return calendar.timegm(dt.timetuple())


def get_venmo_url(start_unix_timestamp, end_unix_timestamp, limit=1000000):
    '''
    Build the public-feed API URL for one time window.

    INPUT: start_unix_timestamp, end_unix_timestamp: values placed in the
               `since` and `until` query parameters
           limit: value placed in the `limit` query parameter
    OUTPUT: String, the full request URL
    '''
    query = f'since={start_unix_timestamp}&until={end_unix_timestamp}&limit={limit}'
    return f'https://venmo.com/api/v5/public?{query}'


def get_venmo_data(url, timeout=30):
    '''
    Fetch a URL and return its JSON body, decoded.

    INPUT: url: String, the address to request
           timeout: seconds to wait on the server before giving up (default 30)
    OUTPUT: the decoded JSON response
    RAISES: requests.exceptions.Timeout when the server does not answer in time
            requests.exceptions.HTTPError when the response status is 4xx/5xx
    '''
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail on an error status here instead of handing an error payload
    # to the caller as if it were data.
    response.raise_for_status()
    return response.json()


def get_unix_timestamp(time):
    '''
    Turn a UTC date/time list into seconds since the unix epoch.

    Note: the parameter name hides the imported `time` module inside this
    function body; the module is not used here.

    INPUT: list in following format: [year, month, day, hours (24), minutes, seconds] **for UTC**
    OUTPUT: Int of unix timestamp
    '''
    # Read the six fields by position, then let datetime validate them.
    fields = [time[index] for index in range(6)]
    moment = datetime.datetime(*fields)
    # timegm interprets the tuple as UTC, independent of the local timezone.
    return calendar.timegm(moment.utctimetuple())

def get_s3_keys(bucket):
    """Get a list of every key in an S3 bucket.

    A single list_objects_v2 call returns at most 1000 keys, so the listing
    is read through a paginator to collect all pages.

    INPUT: bucket: String, name of the S3 bucket
    OUTPUT: list of key strings; empty list when the bucket holds no objects
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket):
        # 'Contents' is omitted from the response when there are no objects.
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    return keys

def get_df_from_aws_keys(bucket, key_list):
    """Download the given S3 objects and combine their JSON content into one DataFrame.

    Each object's body is decoded with json.loads and its items are appended
    to a single list, which becomes the DataFrame.
    (Assumes each object holds a JSON array of records; confirm against the writer.)

    INPUT: bucket: String, name of the S3 bucket
           key_list: list of object keys to download
    OUTPUT: pandas DataFrame built from all collected items
    """
    client = boto3.client('s3')
    total = len(key_list)
    records = []
    for position, key in enumerate(key_list):
        # Progress indicator: print how many keys remain, on every 20th key.
        if position % 20 == 0:
            print(total - position)
        payload = client.get_object(Bucket=bucket, Key=key)['Body'].read()
        records.extend(json.loads(payload))
    return pd.DataFrame(records)

def scrape(start_date, end_date, interval, limit=1000000):
    '''
    Scrape the public venmo API in consecutive time windows and upload each
    window's 'data' payload to the 'transaction-data-2018' S3 bucket.
    Windows whose URL matches an object already in the bucket are skipped.

    INPUT: start_date and end_date, each as list in format: [year, month, day, hours (24), minutes, seconds]
           interval: Int of min 10, the length in seconds of each scraped window
           limit: Int, passed through as the `limit` query parameter
    OUTPUT: none, will upload to s3 bucket
            (one '<since>_<until>_<limit>.json' object per scraped window)
    '''
    # Validate first so that a bad interval costs no S3 round-trips.
    if interval < 10:
        print('Interval must be at least 10 seconds')
        return

    bucket_name = 'transaction-data-2018'

    # Rebuild the URL that each stored object was scraped from. A set keeps
    # the "already scraped" check constant-time per window.
    already_scraped = set()
    for key in get_s3_keys(bucket_name):
        parts = key.split('_')
        if len(parts) < 3:
            # Not a '<since>_<until>_<limit>.json' key; it cannot match any URL.
            continue
        # The third part still carries '.json'; strip it off the built URL.
        already_scraped.add(
            get_venmo_url(parts[0], parts[1], parts[2]).split('.json')[0])

    s3 = boto3.resource('s3')
    start_unix_ts = get_unix_timestamp(start_date)
    end_unix_ts = get_unix_timestamp(end_date)
    window_starts = range(start_unix_ts, end_unix_ts, interval)
    # len(range) counts the trailing partial window too, which plain
    # division by the interval would drop.
    scrape_count = len(window_starts)
    print(
        f'With your parameters, this function will scrape the public venmo API {scrape_count} times.')
    for i in window_starts:
        url = get_venmo_url(i, i + interval, limit)
        if url in already_scraped:
            print(f'already scraped: {url}')
            continue
        print(f'scraping: {url}')
        data = get_venmo_data(url)['data']
        print(len(data))
        file_name = f'{i}_{i + interval}_{limit}'
        obj = s3.Object(bucket_name, f'{file_name}.json')
        obj.put(Body=json.dumps(data))
        # Pause between requests to avoid hammering the API.
        time.sleep(1)
                
def get_s3_data_on_link(bucket, link):
    '''
    Load the S3 object that corresponds to a venmo API URL into a DataFrame.

    The object key is '<since>_<until>_<limit>.json', built from the URL's
    query parameters. The parameters are parsed by name, so timestamps of
    any length and any parameter order are handled.

    INPUT: bucket: String, name of the S3 bucket
           link: String, URL carrying `since`, `until` and `limit` query parameters
    OUTPUT: pandas DataFrame, as returned by get_df_from_aws_keys
    RAISES: ValueError when one of the three query parameters is missing
    '''
    from urllib.parse import parse_qs, urlparse

    params = parse_qs(urlparse(link).query)
    try:
        since = params['since'][0]
        until = params['until'][0]
        limit = params['limit'][0]
    except KeyError as missing:
        raise ValueError(
            f'link is missing query parameter {missing}: {link!r}') from None
    s3_link = f'{since}_{until}_{limit}.json'
    return get_df_from_aws_keys(bucket, [s3_link])

In [69]:
# NOTE(review): `word` is not assigned in any cell shown in this notebook, so
# this call depends on leftover kernel state and would fail on a fresh run.
get_s3_data_on_link('transaction-data-2018', word)

1


Unnamed: 0,action_links,actor,audience,comments,created_time,likes,mentions,message,payment_id,permalink,story_id,transactions,type,updated_time,via
0,{},"{'username': 'Michael-Fowlkes', 'picture': 'ht...",public,[],2018-07-23T21:56:48Z,"{'count': 0, 'data': []}",[],Cost of smelling good,1112755914,/story/5b564f20e1f606dc2f137490,5b564f20e1f606dc2f137490,"[{'target': {'username': 'carolzhang', 'pictur...",payment,2018-07-23T21:56:48Z,
1,{},"{'username': 'Stephanie-Clark-112', 'picture':...",public,[],2018-07-23T21:56:48Z,"{'count': 0, 'data': []}",[],🐾,1112755915,/story/5b564f20e1f606dc2f13773d,5b564f20e1f606dc2f13773d,"[{'target': {'username': 'Zachary-McKenney', '...",payment,2018-07-23T21:56:48Z,
2,{},"{'username': 'Brittany-Nelson-38', 'picture': ...",public,[],2018-07-23T21:56:48Z,"{'count': 0, 'data': []}",[],Don’t know,1112755917,/story/5b564f20e1f606dc2f13799a,5b564f20e1f606dc2f13799a,"[{'target': {'username': 'AnthonyFreni', 'pict...",payment,2018-07-23T21:56:48Z,
3,{},"{'username': 'rbilotti35', 'picture': 'https:/...",public,[],2018-07-23T21:56:48Z,"{'count': 0, 'data': []}",[],Thai Food,1112755918,/story/5b564f20e1f606dc2f137ad4,5b564f20e1f606dc2f137ad4,"[{'target': {'username': 'TrentLS', 'picture':...",payment,2018-07-23T21:56:48Z,
4,{},"{'username': 'Richard-Gerstein', 'picture': 'h...",public,[],2018-07-23T21:56:48Z,"{'count': 0, 'data': []}",[],Ramen,1112755921,/story/5b564f20e1f606dc2f137d99,5b564f20e1f606dc2f137d99,"[{'target': {'username': 'faithlking', 'pictur...",payment,2018-07-23T21:56:48Z,
5,{},"{'username': 'Logan-Pascoe', 'picture': 'https...",public,[],2018-07-23T21:56:48Z,"{'count': 0, 'data': []}",[],🙂,1112755923,/story/5b564f20e1f606dc2f1380ef,5b564f20e1f606dc2f1380ef,"[{'target': {'username': 'haydenpascoe', 'pict...",payment,2018-07-23T21:56:48Z,
6,{},"{'username': 'David-Earle-11', 'picture': 'htt...",public,[],2018-07-23T21:56:48Z,"{'count': 0, 'data': []}",[],Funny liquid,1112755925,/story/5b564f20e1f606dc2f138241,5b564f20e1f606dc2f138241,"[{'target': {'username': 'Robert-Jozic', 'pict...",payment,2018-07-23T21:56:48Z,
7,{},"{'username': 'LELA-BIEBUYCK', 'picture': 'http...",public,[],2018-07-23T21:56:48Z,"{'count': 0, 'data': []}",[],Patterson laundry,1112755926,/story/5b564f20e1f606dc2f138332,5b564f20e1f606dc2f138332,"[{'target': {'username': 'April-Elmer', 'pictu...",payment,2018-07-23T21:56:48Z,
8,{},"{'username': 'Drew-Kersh', 'picture': 'https:/...",public,[],2018-07-23T21:56:47Z,"{'count': 0, 'data': []}",[],💸,1112755881,/story/5b564f1fe1f606dc2f134952,5b564f1fe1f606dc2f134952,"[{'target': {'username': 'barberx', 'picture':...",payment,2018-07-23T21:56:47Z,
9,{},"{'username': 'Mitchell-LaScola', 'picture': 'h...",public,[],2018-07-23T21:56:47Z,"{'count': 0, 'data': []}",[],Lmao I’ve forgot forever,1112755882,/story/5b564f1fe1f606dc2f134ad3,5b564f1fe1f606dc2f134ad3,"[{'target': {'username': 'IanNapier', 'picture...",payment,2018-07-23T21:56:47Z,


In [71]:
# Load the stored object for the window since=1501405200, until=1501407000
# (1800 seconds) into `test`.
# NOTE(review): those timestamps fall in July 2017, yet the rows displayed for
# `test` below have created_time values of 2018-07-23. Check whether the API
# actually honoured the requested since/until window.
test = get_s3_data_on_link('transaction-data-2018', 'https://venmo.com/api/v5/public?since=1501405200&until=1501407000&limit=1000000')

1


In [72]:
# Display the DataFrame assigned to `test` in the previous cell.
test

Unnamed: 0,action_links,actor,audience,comments,created_time,likes,mentions,message,payment_id,permalink,story_id,transactions,type,updated_time,via
0,{},"{'username': 'Kiera-ONeill-1', 'picture': 'htt...",public,[],2018-07-23T22:28:18Z,"{'count': 0, 'data': []}",[],Camping supplies,1112814089,/story/5b565682e1f606dc2f5c52ea,5b565682e1f606dc2f5c52ea,"[{'target': {'username': 'Jasmine-Duchene', 'p...",payment,2018-07-23T22:28:18Z,
1,{},"{'username': 'falonnewton', 'picture': 'https:...",public,[],2018-07-23T22:28:18Z,"{'count': 0, 'data': []}",[],Mirror,1112814092,/story/5b565682e1f606dc2f5c58a8,5b565682e1f606dc2f5c58a8,"[{'target': {'username': 'Kristen-Gottwald', '...",payment,2018-07-23T22:28:18Z,
2,{},"{'username': 'Debbie-Peete', 'picture': 'https...",public,[],2018-07-23T22:28:18Z,"{'count': 0, 'data': []}",[],Dinne,1112814094,/story/5b565682e1f606dc2f5c5db8,5b565682e1f606dc2f5c5db8,"[{'target': {'username': 'Katrina-Joubert', 'p...",payment,2018-07-23T22:28:18Z,
3,{},"{'username': 'Bethany-Gartner', 'picture': 'ht...",public,[],2018-07-23T22:28:18Z,"{'count': 0, 'data': []}",[],Ice house and parking bc ily,1112814097,/story/5b565682e1f606dc2f5c6269,5b565682e1f606dc2f5c6269,"[{'target': {'username': 'Jennifer-Chickola', ...",payment,2018-07-23T22:28:18Z,
4,{},"{'username': 'Mariah-Nolasea', 'picture': 'htt...",public,[],2018-07-23T22:28:18Z,"{'count': 0, 'data': []}",[],🎨,1112814103,/story/5b565682e1f606dc2f5c6f02,5b565682e1f606dc2f5c6f02,"[{'target': {'username': 'Kevin-Suarez-72', 'p...",payment,2018-07-23T22:28:18Z,
5,{},"{'username': 'Haylee-Barker', 'picture': 'http...",public,[],2018-07-23T22:28:18Z,"{'count': 0, 'data': []}",[],👸🏻🎊,1112814104,/story/5b565682e1f606dc2f5c71ab,5b565682e1f606dc2f5c71ab,"[{'target': {'username': 'Rachel-Munos', 'pict...",payment,2018-07-23T22:28:18Z,
6,{},"{'username': 'Kelli-Floyd-1', 'picture': 'http...",public,[],2018-07-23T22:28:18Z,"{'count': 0, 'data': []}",[],Vance bday!!,1112814108,/story/5b565683e1f606dc2f5c792c,5b565683e1f606dc2f5c792c,"[{'target': {'username': 'Courtney-Reid-8', 'p...",payment,2018-07-23T22:28:18Z,
7,{},"{'username': 'JoeOmlor', 'picture': 'https://v...",public,[],2018-07-23T22:28:17Z,"{'count': 0, 'data': []}",[],🐷🐥,1112814054,/story/5b565681e1f606dc2f5c2708,5b565681e1f606dc2f5c2708,"[{'target': {'username': 'Erika-Simshauser', '...",payment,2018-07-23T22:28:17Z,
8,{},"{'username': 'JakeHoyer', 'picture': 'https://...",public,[],2018-07-23T22:28:17Z,"{'count': 0, 'data': []}",[],Stuff,1112814055,/story/5b565681e1f606dc2f5c29a5,5b565681e1f606dc2f5c29a5,"[{'target': {'username': 'Joshua-Mueller-3', '...",payment,2018-07-23T22:28:17Z,
9,{},"{'username': 'LaurieSmith2086', 'picture': 'ht...",public,[],2018-07-23T22:28:17Z,"{'count': 0, 'data': []}",[],Booze for Vanessa at Oxbow.,1112814057,/story/5b565681e1f606dc2f5c2dca,5b565681e1f606dc2f5c2dca,"[{'target': {'username': 'Katie-Shields-5', 'p...",payment,2018-07-23T22:28:17Z,


In [34]:
# NOTE(review): this cell's execution count (34) predates the In [71] cell that
# assigns `test`, and the columns displayed for that DataFrame do not include
# 'data'. Presumably this ran against an earlier value of `test`; verify or
# remove before relying on a top-to-bottom run.
len(test['data'])

50

In [24]:
# Display the list of S3 keys. `keys` is assigned by the get_s3_keys cell
# (In [3]) that appears further down in this notebook.
keys

['1499731200_1499731230_1000000.json',
 '1499731200_1499733000_1000000.json',
 '1499731230_1499731260_1000000.json',
 '1499731260_1499731290_1000000.json',
 '1499731290_1499731320_1000000.json',
 '1499731320_1499731350_1000000.json',
 '1499731350_1499731380_1000000.json',
 '1499731380_1499731410_1000000.json',
 '1499731410_1499731440_1000000.json',
 '1499731440_1499731470_1000000.json',
 '1499731470_1499731500_1000000.json',
 '1499731500_1499731530_1000000.json',
 '1499731530_1499731560_1000000.json',
 '1499731560_1499731590_1000000.json',
 '1499731590_1499731620_1000000.json',
 '1499731620_1499731650_1000000.json',
 '1499731650_1499731680_1000000.json',
 '1499731680_1499731710_1000000.json',
 '1499731710_1499731740_1000000.json',
 '1499731740_1499731770_1000000.json',
 '1499731770_1499731800_1000000.json',
 '1499731800_1499731830_1000000.json',
 '1499731830_1499731860_1000000.json',
 '1499731860_1499731890_1000000.json',
 '1499731890_1499731920_1000000.json',
 '1499731920_1499731950_1

In [None]:
# Scrape one year (2017-07-11 to 2018-07-11) in 1800-second windows:
# 365 days * 48 windows per day = 17520 windows. scrape() sleeps one second
# after every window it actually downloads, so a full run takes hours.
scrape([2017, 7, 11, 0, 0, 0], [2018, 7, 11, 0, 0, 0], 1800)

In [3]:
# List the object keys currently stored in the bucket.
keys = get_s3_keys('transaction-data-2018')

In [19]:
# Rebuild the API URL each stored key was scraped from. The third key part
# still ends in '.json', so that suffix is split off the finished URL.
# This is the same comprehension that scrape() runs internally.
links = [get_venmo_url(key.split('_')[0], key.split('_')[1], key.split('_')[2]).split('.json')[0] for key in keys]

In [38]:
# NOTE(review): keys[-3:-4] is always an empty slice (the stop index comes
# before the start), so this call downloads nothing and returns an empty
# DataFrame. A single-key slice such as keys[-4:-3] may have been intended;
# confirm.
get_df_from_aws_keys('transaction-data-2018', keys[-3:-4])

In [60]:
# Display the full list of S3 keys held in `keys`.
keys

['1499731200_1499731230_1000000.json',
 '1499731200_1499733000_1000000.json',
 '1499731230_1499731260_1000000.json',
 '1499731260_1499731290_1000000.json',
 '1499731290_1499731320_1000000.json',
 '1499731320_1499731350_1000000.json',
 '1499731350_1499731380_1000000.json',
 '1499731380_1499731410_1000000.json',
 '1499731410_1499731440_1000000.json',
 '1499731440_1499731470_1000000.json',
 '1499731470_1499731500_1000000.json',
 '1499731500_1499731530_1000000.json',
 '1499731530_1499731560_1000000.json',
 '1499731560_1499731590_1000000.json',
 '1499731590_1499731620_1000000.json',
 '1499731620_1499731650_1000000.json',
 '1499731650_1499731680_1000000.json',
 '1499731680_1499731710_1000000.json',
 '1499731710_1499731740_1000000.json',
 '1499731740_1499731770_1000000.json',
 '1499731770_1499731800_1000000.json',
 '1499731800_1499731830_1000000.json',
 '1499731830_1499731860_1000000.json',
 '1499731860_1499731890_1000000.json',
 '1499731890_1499731920_1000000.json',
 '1499731920_1499731950_1