In [65]:
from pymongo import MongoClient
import folium
import numpy as np
import pandas as pd

In [2]:
def connectCollection(database, collection):
    """Return ``(db, coll)`` handles for ``collection`` inside ``database``.

    A new ``MongoClient`` is created with its default connection settings,
    the database is looked up by name on it, and the collection is looked up
    by name on that database.
    """
    mongo_db = MongoClient()[database]
    return mongo_db, mongo_db[collection]

In [3]:
# Handles to the `companies` collection of the `companies` database;
# `coll` is what the aggregation below queries.
db, coll = connectCollection('companies','companies')

In [4]:
def getLocation(comp,i):
    """Build a GeoJSON-style Point dict for the ``i``-th office of ``comp``.

    ``comp['offices'][i]`` must provide ``'latitude'`` and ``'longitude'``.
    The coordinates are returned in ``[longitude, latitude]`` order.
    """
    office = comp['offices'][i]
    lat, lon = office['latitude'], office['longitude']
    return {'type': 'Point', 'coordinates': [lon, lat]}

In [5]:
def getName(comp,i):
    """Return the ``'name'`` field of the company document ``comp``.

    ``i`` is accepted but not used.
    """
    return comp['name']

I am going to keep only the companies that were founded after 2003, that have raised more than 1M dollars in a funding round, and whose category code is close to that of a gaming company (web, software, games_video, hardware, mobile, music, photo_video or design).

In [6]:
# Category codes considered close enough to a gaming company.
category_codes = [
    "web", "software", "games_video", "hardware",
    "mobile", "music", "photo_video", "design",
]

pipeline = [
    # One output document per office, so every office becomes its own row later.
    {"$unwind": "$offices"},
    # Fields listed in a single $match document are ANDed together, so no
    # explicit $and is needed; $in replaces the chain of $or equality tests.
    {"$match": {
        "founded_year": {"$gt": 2003},
        # NOTE(review): presumably funding_rounds is an array, in which case
        # this matches when any single round raised more than 1M - confirm.
        "funding_rounds.raised_amount": {"$gt": 1000000},
        "category_code": {"$in": category_codes},
    }},
]
results = list(coll.aggregate(pipeline))

In [7]:
# Parallel lists, one entry per unwound (company, office) document.
name = []
city = []
country_code = []
longitude = []
latitude = []

for e in results:
    office = e['offices']
    name.append(e["name"])
    # .get() appends exactly one value (None when the key is absent) to every
    # list, so the lists can never drift out of alignment. Scanning
    # office.items() for the coordinate keys appended nothing when a key was
    # missing, which would have shifted every later row.
    city.append(office.get("city"))
    country_code.append(office.get("country_code"))
    longitude.append(office.get("longitude"))
    latitude.append(office.get("latitude"))

In [8]:
# Sanity check: the five lists are used as DataFrame columns below,
# so they must all have the same length.
print(len(name),len(city),len(country_code),len(longitude),len(latitude))

1211 1211 1211 1211 1211


In [9]:
# Column name -> list of values; "country" holds the office country codes.
myData = {
    "name": name,
    "city": city,
    "country": country_code,
    "latitude": latitude,
    "longitude": longitude,
}

In [10]:
# One row per (company, office) pair, columns in the order declared in myData.
startup_df = pd.DataFrame(myData)

In [11]:
startup_df

Unnamed: 0,name,city,country,latitude,longitude
0,Wetpaint,Seattle,USA,47.603122,-122.333253
1,Wetpaint,New York,USA,40.723731,-73.996431
2,Geni,West Hollywood,USA,34.090368,-118.393064
3,Slacker,San Diego,USA,33.022176,-117.081406
4,Joost,New York,USA,40.746497,-74.009447
...,...,...,...,...,...
1206,Antix Labs,Berkshire,GBR,,
1207,Incuity Software,Mission Viejo,USA,33.664087,-117.632519
1208,Bump Technologies,Mountain View,USA,,
1209,Tiny Speck,Vancouver,CAN,49.282455,-123.109217


Here I could use geocoding to recover the missing coordinates from the address of each office, but to keep things simple I am going to drop those rows instead.

In [70]:
# Drops every row that has a missing value in ANY column, not only in the
# coordinates. The original index labels are kept, so gaps appear in them.
startup_clean = startup_df.dropna(axis=0, how="any")

In [71]:
startup_clean

Unnamed: 0,name,city,country,latitude,longitude
0,Wetpaint,Seattle,USA,47.603122,-122.333253
1,Wetpaint,New York,USA,40.723731,-73.996431
2,Geni,West Hollywood,USA,34.090368,-118.393064
3,Slacker,San Diego,USA,33.022176,-117.081406
4,Joost,New York,USA,40.746497,-74.009447
...,...,...,...,...,...
1195,Clavis Technology,Cambridge,USA,42.362663,-71.084153
1197,Equiendo,Nangor Road,IRL,53.337074,-6.251891
1205,Continuity Engine,New Haven,USA,41.327918,-73.243552
1207,Incuity Software,Mission Viejo,USA,33.664087,-117.632519


In [72]:
# Renumber rows 0..n-1 so the positional loops below can use .loc[i, ...].
startup_clean = startup_clean.reset_index(drop=True)

In [73]:
startup_clean.head(30)

Unnamed: 0,name,city,country,latitude,longitude
0,Wetpaint,Seattle,USA,47.603122,-122.333253
1,Wetpaint,New York,USA,40.723731,-73.996431
2,Geni,West Hollywood,USA,34.090368,-118.393064
3,Slacker,San Diego,USA,33.022176,-117.081406
4,Joost,New York,USA,40.746497,-74.009447
5,Babelgum,London,GBR,53.344104,-6.267494
6,Mahalo,Culver City,USA,34.017606,-118.487267
7,Kyte,San Francisco,USA,37.788482,-122.409173
8,Veoh,San Diego,USA,32.902266,-117.20834
9,Wesabe,San Francisco,USA,37.793148,-122.402567


In [14]:
import folium

In [15]:
# Map initially centred on New York; one red marker is added per office.
map_city = folium.Map(location=[40.7221, -73.9712], zoom_start=12)

# `radius` is a CircleMarker option and has no effect on a pin Marker, so it
# is not passed. The tooltip shows the company name on hover, which makes the
# otherwise identical markers distinguishable.
for company, lat, lon in zip(startup_clean['name'],
                             startup_clean['latitude'],
                             startup_clean['longitude']):
    folium.Marker(
        location=(lat, lon),
        tooltip=company,
        icon=folium.Icon(icon='cloud', color='red'),
    ).add_to(map_city)
    


In [16]:
map_city

## APIs

In [17]:
import os
from dotenv import load_dotenv
import requests
load_dotenv()


True

In [26]:
def googleRequestAuthorized(lat,lon,r,keyword,timeout=10):
    """Query the Google Places "nearby search" endpoint and return the parsed JSON.

    Parameters
    ----------
    lat, lon : value formatted into the ``location=<lat>,<lon>`` query field.
    r : value formatted into the ``radius`` query field.
    keyword : search term. It is percent-encoded before being placed in the
        URL; ``+`` is left untouched so callers that already pass
        ``"vegan+restaurant"`` keep working.
    timeout : seconds passed to ``requests.get`` so a stalled connection
        cannot hang the notebook indefinitely (default 10).

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    ValueError
        If the ``GOOGLE_API_TOKEN`` environment variable is missing or empty.
    """
    from urllib.parse import quote

    authToken = os.getenv("GOOGLE_API_TOKEN")
    if not authToken:
        raise ValueError("NECESITAS UN TOKEN")

    # Encode spaces, '&', '#', ... so the keyword cannot break the query string.
    safe_keyword = quote(str(keyword), safe="+")
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={},{}&radius={}&keyword={}&key={}".format(lat,lon,r,safe_keyword,authToken)

    res = requests.get(url, timeout=timeout)
    return res.json()

In [47]:
# Trial request around one fixed point (the coordinates of row 1 in the table above).
starbucks = googleRequestAuthorized(lat="40.723731", lon="-73.996431", r="200", keyword="starbucks")

In [48]:
# Same fixed point; "+" joins the two words of the search term.
vegan = googleRequestAuthorized(lat="40.723731", lon="-73.996431", r="200", keyword="vegan+restaurant")

In [49]:
# Same fixed point, wider search radius.
club = googleRequestAuthorized(lat="40.723731", lon="-73.996431", r="500", keyword="club")

In [50]:
# Same fixed point; airports are searched with a much larger radius.
airport = googleRequestAuthorized(lat="40.723731", lon="-73.996431", r="30000", keyword="airport")

In [76]:
# Same fixed point; this response is inspected in the next cells.
kindergarten = googleRequestAuthorized(lat="40.723731", lon="-73.996431", r="600", keyword="kindergarten")

In [77]:
# List of places found by the kindergarten request; indexed in the next cell.
l = kindergarten['results']

In [78]:
# Show the three fields of the first result that the loops below extract.
first_hit = l[0]
first_hit_location = first_hit['geometry']['location']
print(first_hit['name'])
print(first_hit_location['lat'])
print(first_hit_location['lng'])

Neighborhood School
40.7244039
-73.9863598


In [86]:
# For every office, store the name and coordinates of the first Starbucks
# returned by a 200-radius search around it, or NaN when there is none.
for i in range(len(startup_clean)):
    lat = startup_clean.loc[i, "latitude"]
    lon = startup_clean.loc[i, "longitude"]
    starbucks = googleRequestAuthorized(str(lat),str(lon),"200","starbucks")
    try:
        nearest = starbucks['results'][0]
        nearest_loc = nearest['geometry']['location']
        stbk_name, stbk_lat, stbk_lon = nearest['name'], nearest_loc['lat'], nearest_loc['lng']
    except (KeyError, IndexError, TypeError):
        # Only "no usable result" is treated as missing data. A bare except
        # would also hide KeyboardInterrupt and real bugs in this cell.
        stbk_name = stbk_lat = stbk_lon = np.nan

    startup_clean.loc[i, 'Starbucks'] = stbk_name
    startup_clean.loc[i, 'stbk_lat'] = stbk_lat
    startup_clean.loc[i, 'stbk_lon'] = stbk_lon

In [87]:
# Show every row for this one display only. option_context restores the
# previous setting afterwards instead of changing it for the whole session.
with pd.option_context('display.max_rows', None):
    display(startup_clean)

Unnamed: 0,name,city,country,latitude,longitude,Starbucks,stbk_lat,stbk_lon
0,Wetpaint,Seattle,USA,47.603122,-122.333253,Starbucks,47.604156,-122.330827
1,Wetpaint,New York,USA,40.723731,-73.996431,Starbucks,40.722553,-73.997942
2,Geni,West Hollywood,USA,34.090368,-118.393064,,,
3,Slacker,San Diego,USA,33.022176,-117.081406,Starbucks,33.023491,-117.081837
4,Joost,New York,USA,40.746497,-74.009447,,,
5,Babelgum,London,GBR,53.344104,-6.267494,,,
6,Mahalo,Culver City,USA,34.017606,-118.487267,Starbucks,34.01724,-118.489752
7,Kyte,San Francisco,USA,37.788482,-122.409173,Starbucks,37.787288,-122.407432
8,Veoh,San Diego,USA,32.902266,-117.20834,,,
9,Wesabe,San Francisco,USA,37.793148,-122.402567,Starbucks,37.792846,-122.404304


In [88]:
# Snapshot that still includes offices without a Starbucks hit. The index is
# written as the first, unnamed column.
# NOTE(review): assumes the ./input directory already exists - confirm.
startup_clean.to_csv(path_or_buf=r'./input/starbucks_with_nan.csv')

In [89]:
# Keep only offices for which a Starbucks was found (any NaN drops the row).
starbucks_clean = startup_clean.dropna(axis=0, how="any")

In [90]:
# Renumber rows 0..n-1 so the airport loop below can use .loc[i, ...].
starbucks_clean = starbucks_clean.reset_index(drop=True)

In [91]:
# Show every row for this one display only. option_context restores the
# previous setting afterwards instead of changing it for the whole session.
with pd.option_context('display.max_rows', None):
    display(starbucks_clean)

Unnamed: 0,name,city,country,latitude,longitude,Starbucks,stbk_lat,stbk_lon
0,Wetpaint,Seattle,USA,47.603122,-122.333253,Starbucks,47.604156,-122.330827
1,Wetpaint,New York,USA,40.723731,-73.996431,Starbucks,40.722553,-73.997942
2,Slacker,San Diego,USA,33.022176,-117.081406,Starbucks,33.023491,-117.081837
3,Mahalo,Culver City,USA,34.017606,-118.487267,Starbucks,34.01724,-118.489752
4,Kyte,San Francisco,USA,37.788482,-122.409173,Starbucks,37.787288,-122.407432
5,Wesabe,San Francisco,USA,37.793148,-122.402567,Starbucks,37.792846,-122.404304
6,Jangl SMS,Pleasanton,USA,37.697805,-121.907768,Starbucks,37.700953,-121.910039
7,AdaptiveBlue,NYC,USA,40.801358,-74.3372,Starbucks,40.798594,-74.340772
8,Pando Networks,New York,USA,40.722655,-73.99873,Starbucks,40.722553,-73.997942
9,Livestream,New York,USA,40.726155,-73.995625,Starbucks,40.727212,-73.995485


In [92]:
# Snapshot of the offices that have a Starbucks hit. The index is written as
# the first, unnamed column.
# NOTE(review): assumes the ./input directory already exists - confirm.
starbucks_clean.to_csv(path_or_buf=r'./input/starbucks_without_nan.csv')

In [93]:
# For every remaining office, store the name and coordinates of the first
# airport returned by a 30000-radius search around it, or NaN when there is none.
for i in range(len(starbucks_clean)):
    lat = starbucks_clean.loc[i, "latitude"]
    lon = starbucks_clean.loc[i, "longitude"]
    airport = googleRequestAuthorized(str(lat),str(lon),"30000","airport")
    try:
        nearest_airport = airport['results'][0]
        airport_loc = nearest_airport['geometry']['location']
        air_name, air_lat, air_lon = nearest_airport['name'], airport_loc['lat'], airport_loc['lng']
    except (KeyError, IndexError, TypeError):
        # Only "no usable result" is treated as missing data. A bare except
        # would also hide KeyboardInterrupt and real bugs in this cell.
        air_name = air_lat = air_lon = np.nan

    starbucks_clean.loc[i, 'airport'] = air_name
    starbucks_clean.loc[i, 'air_lat'] = air_lat
    starbucks_clean.loc[i, 'air_lon'] = air_lon

In [94]:
# Show every row for this one display only. option_context restores the
# previous setting afterwards instead of changing it for the whole session.
with pd.option_context('display.max_rows', None):
    display(starbucks_clean)

Unnamed: 0,name,city,country,latitude,longitude,Starbucks,stbk_lat,stbk_lon,airport,air_lat,air_lon
0,Wetpaint,Seattle,USA,47.603122,-122.333253,Starbucks,47.604156,-122.330827,Seattle-Tacoma International Airport,47.45025,-122.308817
1,Wetpaint,New York,USA,40.723731,-73.996431,Starbucks,40.722553,-73.997942,John F. Kennedy International Airport,40.641311,-73.778139
2,Slacker,San Diego,USA,33.022176,-117.081406,Starbucks,33.023491,-117.081837,McClellan-Palomar Airport,33.126822,-117.279241
3,Mahalo,Culver City,USA,34.017606,-118.487267,Starbucks,34.01724,-118.489752,Los Angeles International Airport,33.941589,-118.40853
4,Kyte,San Francisco,USA,37.788482,-122.409173,Starbucks,37.787288,-122.407432,San Francisco International Airport,37.621313,-122.378955
5,Wesabe,San Francisco,USA,37.793148,-122.402567,Starbucks,37.792846,-122.404304,San Francisco International Airport,37.621313,-122.378955
6,Jangl SMS,Pleasanton,USA,37.697805,-121.907768,Starbucks,37.700953,-121.910039,Oakland International Airport,37.712569,-122.219743
7,AdaptiveBlue,NYC,USA,40.801358,-74.3372,Starbucks,40.798594,-74.340772,Newark Liberty International Airport,40.689531,-74.174462
8,Pando Networks,New York,USA,40.722655,-73.99873,Starbucks,40.722553,-73.997942,John F. Kennedy International Airport,40.641311,-73.778139
9,Livestream,New York,USA,40.726155,-73.995625,Starbucks,40.727212,-73.995485,John F. Kennedy International Airport,40.641311,-73.778139
