In [1]:
import pandas as pd
from pymongo import MongoClient
from pandas.io.json import json_normalize

client = MongoClient('mongodb://localhost:27017/')
db = client.companies



In [17]:
#Starting consuting time on Mongo DB

one_office = db.companies.find({    #Companies with at least 1 office
    "offices":{
        "$not":{
            "$size":0
        }
    }
},{"name":1, "offices":1})

one_office_df = pd.DataFrame(one_office)
#display(one_office_df)
print(one_office_df.shape)
one_office_df.columns

# With this query we know that there are at least 13,744 registers with offices

(13744, 3)


Index(['_id', 'name', 'offices'], dtype='object')

In [27]:
# We have some criteria when it comes to get the data from our database. The minimums are:
# - There must be some nearby companies that also do design.
# - Nobody in the company likes to have companies with more than 10 years in a radius of 2 KM.
# - Developers like to be near successful tech startups with that have raised at least 1 Million dollars.
# - Account managers need to travel a lot


In [72]:
# Filtering with mongo
# Therefore, we ask mongodb (politetly) about some of the criteria we are considering.
 
mongodata = db.companies.find( {"$and":[
                                {"offices.latitude": {"$exists": True}},
                                {"offices.latitude": {"$ne": None}},   
                                {"offices.longitude": {"$exists": True}},
                                {"offices.longitude": {"$ne": None}},
                                {"category_code": {"$exists": True}},
                                {"category_code": {"$ne": None}},   
                                {"founded_year": {"$exists": True}}, 
                                {"founded_year": {"$gte": 2000}},
                                {"total_money_raised": {"$exists": True}},
                                {"total_money_raised": {"$not":{"$size":0}}}, 
                                {"$or": [
                                {"total_money_raised": {"$gte": 1000000}},
                                {"category_code":  "design" } ,
                                {"category_code":  "nanotech" } , 
                                {"category_code":  "web" } , 
                                {"category_code":  "software" } , 
                                {"category_code":  "games_video" } , 
                                {"category_code":  "mobile" } , 
                                {"category_code":  "ecommerce" } ,
                                {"category_code":  "advertising" } ,
                                {"category_code":  "enterprise" } ,   
                                {"category_code":  "analytics" } ,   
                                {"category_code":  "photo_video" } ,   
                                {"category_code":  "biotech" } ]} ,
                                      
                                ]
                                },
                                {
                               "_id": 0, "category_code": 1,"founded_year": 1, "name": 1, 
                               "offices.country_code": 1, "offices.latitude": 1, "offices.longitude": 1, 
                                "total_money_raised": 1   
                                }
                            )

In [48]:
#df = pd.DataFrame(mongodata)
#display(df.head())
#print(df["offices"][0])
#print(df["name"][0])

Unnamed: 0,category_code,founded_year,name,offices,total_money_raised
0,web,2005,Wetpaint,"[{'country_code': 'USA', 'latitude': 47.603122...",$39.8M
1,software,2005,Zoho,"[{'country_code': 'USA', 'latitude': 37.692934...",$0
2,web,2006,Geni,"[{'country_code': 'USA', 'latitude': 34.090368...",$16.5M
3,web,2003,Gizmoz,"[{'country_code': 'USA', 'latitude': 37.48413,...",$18.1M
4,mobile,2005,Helio,"[{'country_code': 'USA', 'latitude': 34.057498...",$0


Wetpaint


In [73]:
# However, the serie "offices" it's a list with embed dicts. The serie contains information about location and number
# of offices.
# We use json_normalize to extract this information

In [74]:
ydataframe = json_normalize(data = mongodata, record_path = "offices", 
                             meta = ["category_code", "founded_year", "name",
                                    "total_money_raised"])

In [75]:
display(ydataframe.head())
print(ydataframe.shape)
print(ydataframe.columns)

Unnamed: 0,country_code,latitude,longitude,category_code,founded_year,name,total_money_raised
0,USA,47.603122,-122.333253,web,2005,Wetpaint,$39.8M
1,USA,40.723731,-73.996431,web,2005,Wetpaint,$39.8M
2,USA,37.692934,-121.904945,software,2005,Zoho,$0
3,USA,34.090368,-118.393064,web,2006,Geni,$16.5M
4,USA,37.48413,-122.169472,web,2003,Gizmoz,$18.1M


(4835, 7)
Index(['country_code', 'latitude', 'longitude', 'category_code',
       'founded_year', 'name', 'total_money_raised'],
      dtype='object')


In [76]:
df = ydataframe.reindex(columns=['country_code', 'name', 'founded_year', 'category_code', 'latitude', 'longitude', 
                                 'total_money_raised'])

In [77]:
display(df.head())

Unnamed: 0,country_code,name,founded_year,category_code,latitude,longitude,total_money_raised
0,USA,Wetpaint,2005,web,47.603122,-122.333253,$39.8M
1,USA,Wetpaint,2005,web,40.723731,-73.996431,$39.8M
2,USA,Zoho,2005,software,37.692934,-121.904945,$0
3,USA,Geni,2006,web,34.090368,-118.393064,$16.5M
4,USA,Gizmoz,2003,web,37.48413,-122.169472,$18.1M


In [78]:
df["total_offices"] = df["name"].map(df["name"].value_counts())

In [79]:
display(df.head())
# On this stage, we've disclouse all the data inside "offices" and added another column with the number of offices for 
# each company. We have the coordinates of each office but need to proceed to generate geopoints.

Unnamed: 0,country_code,name,founded_year,category_code,latitude,longitude,total_money_raised,total_offices
0,USA,Wetpaint,2005,web,47.603122,-122.333253,$39.8M,2
1,USA,Wetpaint,2005,web,40.723731,-73.996431,$39.8M,2
2,USA,Zoho,2005,software,37.692934,-121.904945,$0,1
3,USA,Geni,2006,web,34.090368,-118.393064,$16.5M,1
4,USA,Gizmoz,2003,web,37.48413,-122.169472,$18.1M,1


In [86]:
def get_offices_locat(data):
    #for data["latitude"] & data["longitude"]:
    offices_locat = {
            "type":"Point",
            "coordinates":[data["longitude"], data["latitude"]]
        }
    return offices_locat

def get_offices_locat2(lon, lat):
    print(lon, lat)
    #for data["latitude"] & data["longitude"]:
    offices_locat = {
            "type":"Point",
            "coordinates":[lon, lat]
        }
    return offices_locat

In [87]:
#df.apply(lambda x: get_offices_locat2(x['longitude'], x['latitude']), axis=1)
# With this function we apply the points coordinate conversion

In [88]:
%%df["geoloc"] = df.apply(lambda x: get_offices_locat2(x['longitude'], x['latitude']), axis=1)


UsageError: Cell magic `%%df["geoloc"]` not found.


In [89]:
display(df.head())

Unnamed: 0,country_code,name,founded_year,category_code,latitude,longitude,total_money_raised,total_offices,geoloc
0,USA,Wetpaint,2005,web,47.603122,-122.333253,$39.8M,2,"{'type': 'Point', 'coordinates': [-122.333253,..."
1,USA,Wetpaint,2005,web,40.723731,-73.996431,$39.8M,2,"{'type': 'Point', 'coordinates': [-73.9964312,..."
2,USA,Zoho,2005,software,37.692934,-121.904945,$0,1,"{'type': 'Point', 'coordinates': [-121.904945,..."
3,USA,Geni,2006,web,34.090368,-118.393064,$16.5M,1,"{'type': 'Point', 'coordinates': [-118.393064,..."
4,USA,Gizmoz,2003,web,37.48413,-122.169472,$18.1M,1,"{'type': 'Point', 'coordinates': [-122.169472,..."
