In [None]:
'''
Many of us use Venmo as a convenient way to send cash to our friends. 
Venmo, however, is a social network with public data and an API, with everything that entails. 
Your task for this interview is to take publicly available Venmo data, anonymize it, 
and perform some simple visualizations or analyses on the data. 

Some project details.
-A publicly available Venmo dataset can be found here: https://github.com/sa7mon/venmo-data
--The data comes in formats compatible with either MongoDB or SQLite. 
Choose either to do your initial download / pre-processing.
--This is a relatively large amount of data, so feel free to take a reasonable subset 
(perhaps on the order of 100,000 to 500,000 transactions. Make sure to subset in a reasonable manner)
-Once you have subsetted the data in your database of choice, 
use Python or another language you are comfortable with to perform a basic analysis. 
Create one or two charts that provide some kind of summary of a feature of the 
subsetted, anonymized data. 
--For example, here is a chart showing the logarithm of the distribution of the 
number of payments made per actor. 
--Note: the analysis is not as important here to us as showing that you 1) 
feel comfortable working with databases and 2) can use Python as a scripting language.
--We’ll do a high level review of your approach and your code in the interview.

mongod --dbpath C:\3_Mongo\Data
'''

In [26]:
import pymongo

# Count transactions per originating app (the `app.name` field) over the
# whole `venmo` collection of the local `test` database.
#
# The client is used as a context manager so the connection is closed even
# when the aggregation or the cursor iteration raises; the original only
# closed it on the success path.
with pymongo.MongoClient("mongodb://localhost:27017/") as client:
    db = client["test"]
    coll = db["venmo"]

    query = coll.aggregate([{
        "$group": {
            "_id": "$app.name",
            "count": {
                "$sum": 1
            }
        }
    }])

    # Drain the cursor while the connection is still open. Each row has the
    # form {'_id': <app name>, 'count': <number of transactions>}.
    plat_list = list()
    for x in query:
        print(x)
        plat_list.append(x)

{'_id': 'Workflow', 'count': 6}
{'_id': 'Kasisto KAI', 'count': 3}
{'_id': 'Developer Settings', 'count': 2}
{'_id': 'Alexa for PayPal', 'count': 2}
{'_id': 'BottleRocketUtility', 'count': 6}
{'_id': 'Venmo for iPhone', 'count': 6156264}
{'_id': 'splitwise', 'count': 25621}
{'_id': 'Georgetown University Alumni & Student FCU', 'count': 1}
{'_id': 'Venmo Developer', 'count': 61}
{'_id': 'Venmo for Android', 'count': 884654}
{'_id': 'Venmo.com', 'count': 9271}
{'_id': 'Pay.mo', 'count': 3}
{'_id': 'tab', 'count': 201}
{'_id': 'drupe', 'count': 6}
{'_id': 'venmo payouts', 'count': 484}


In [27]:
import plotly.graph_objects as gobj

# Split the per-platform rows gathered in `plat_list` into two parallel
# sequences: the platform name for each slice and its transaction count.
labels = [row["_id"] for row in plat_list]
values = [row["count"] for row in plat_list]

# One pie slice per platform, sized by its share of all transactions.
fig = gobj.Figure(
    data=[
        gobj.Pie(labels=labels, values=values),
    ]
)
fig.show()

In [28]:
import pymongo

# Number of documents fed into the pipeline. No `$sort` precedes the
# `$limit`, so which documents are picked is whatever order the server
# returns them in.
SAMPLE_LIMIT = 350000

# Build, for each platform, the list of (date, time) pairs of its
# transactions within the sampled subset.
#
# The client is used as a context manager so the connection is closed even
# when the aggregation or the cursor iteration raises; the original only
# closed it on the success path.
with pymongo.MongoClient("mongodb://localhost:27017/") as client:
    db = client["test"]
    coll = db["venmo"]

    query = coll.aggregate([
        {
            "$limit": SAMPLE_LIMIT
        },
        {
            # Stage 1: one row per distinct (platform, id, date_created)
            # combination. `count1` is how many documents shared that key;
            # it is not read by the next stage.
            "$group": {
                "_id": {
                    "platform": "$app.name",
                    "_id_": "$id",
                    "date": "$date_created"
                },
                "count1": {
                    "$sum": 1
                }
            },
        },
        {
            # Stage 2: regroup by platform, collecting a {date, time} pair
            # for every stage-1 row and counting those rows.
            # NOTE(review): the substring offsets assume `date_created`
            # renders as 'YYYY-MM-DDTHH:MM:SS...' -- confirm against the data.
            "$group": {
                "_id": "$_id.platform",
                "time": {
                    "$push": {
                        "date": {
                            "$substr": ["$_id.date", 0, 10]
                        },
                        "time": {
                            "$substr": ["$_id.date", 11, 8]
                        }
                    }
                },
                "count": {
                    "$sum": 1
                }
            }
        }
    ],
        allowDiskUse=True
    )

    # Drain the cursor while the connection is still open: one row per
    # platform, shaped {'_id': <platform>, 'time': [...], 'count': <int>}.
    plat_list2 = list(query)

# Number of distinct platforms found in the sample.
print(len(plat_list2))

8


In [29]:
import plotly.graph_objects as go

fig = go.Figure()

# One line per platform: number of sampled transactions on each calendar day.
for platform in plat_list2:
    # Tally transactions per day. Walking the entries in date order means the
    # dict's insertion order -- and therefore the plotted x axis -- is
    # chronological.
    per_day = {}
    for entry in sorted(platform['time'], key=lambda rec: rec['date']):
        day = entry['date']
        per_day[day] = per_day.get(day, 0) + 1

    fig.add_trace(
        go.Scatter(
            x=list(per_day.keys()),
            y=list(per_day.values()),
            mode='lines+markers',
            name=platform['_id'],
        )
    )

fig.show()

In [30]:
import pymongo

# Collect one row per distinct (platform, id, date, time) combination across
# the entire `venmo` collection (no `$limit` stage here, so every document
# is scanned and the full result is held in memory).
#
# The client is used as a context manager so the connection is closed even
# when the aggregation or the cursor iteration raises; the original only
# closed it on the success path.
with pymongo.MongoClient("mongodb://localhost:27017/") as client:
    db = client["test"]
    coll = db["venmo"]

    query = coll.aggregate([
        {
            "$group": {
                "_id": {
                    "platform": "$app.name",
                    "_id_": "$id",
                    # NOTE(review): the substring offsets assume
                    # `date_created` renders as 'YYYY-MM-DDTHH:MM:SS...'
                    # -- confirm against the data.
                    "date": {
                        "$substr": ["$date_created", 0, 10]
                    },
                    "time": {
                        "$substr": ["$date_created", 11, 8]
                    }
                },
            },
        },
    ],
        allowDiskUse=True
    )

    # Drain the cursor while the connection is still open. Each row is
    # {'_id': {'platform': ..., '_id_': ..., 'date': ..., 'time': ...}}.
    plat_list3 = list(query)

# Number of distinct grouped rows returned.
print(len(plat_list3))

7076585


In [31]:
import plotly.graph_objects as go

#format data
data = {}
sorted_time = sorted(plat_list3, key = lambda i: i['_id']['date'])
for x in plat_list3:
    if x['_id']['platform'] not in data.keys():
        data[x['_id']['platform']] = {
            'time' : list(),
            'date' : list()
        }
    data[x['_id']['platform']]['time'].append(x['_id']['time'])
    data[x['_id']['platform']]['date'].append(x['_id']['date'])

#print(data['tab'])

fig = go.Figure()
#iterate through every platform

data2 = {}
#iterate through every transaction time
for y in data:
    #print(data[y])
    for z in data[y]['date']:
        if z in data2.keys():
            data2[z] = data2[z] + 1
        else:
            data2[z] = 1
    date_list = list()
    count_list = list()
    for z in data2:
        date_list.append(z)
        count_list.append(data2[z])
    fig.add_trace(go.Scatter(x=date_list, y=count_list,
                     mode='markers',
                     name=y))

fig.show()