In [19]:
import pandas as pd
from pymongo import MongoClient
from pandas.io.json import json_normalize
import folium
from folium.plugins import HeatMap

# Connect to the local MongoDB server and select the "companies" database.
client = MongoClient('mongodb://localhost:27017/')
db = client["companies"]

In [2]:
# Start querying MongoDB

In [None]:
# We have some criteria when it comes to getting the data from our database. The minimum requirements are:
# - There must be some nearby companies that also do design.
# - Nobody in the company wants companies more than 10 years old within a radius of 2 km.
# - Developers like to be near successful tech startups that have raised at least 1 million dollars.
# - Account managers need to travel a lot.


In [None]:
# Filtering with mongo
# We ask MongoDB (politely) to apply some of the criteria we are considering.

# Categories accepted by the "$or" branch below.
wanted_categories = [
    "design", "nanotech", "web", "software", "games_video", "mobile",
    "ecommerce", "advertising", "enterprise", "analytics", "search",
    "network_hosting", "photo_video", "biotech",
]

# All top-level keys are ANDed together by MongoDB; the operators for one field
# are grouped in a single sub-document.
company_filter = {
    "offices.latitude": {"$exists": True, "$ne": None},
    "offices.longitude": {"$exists": True, "$ne": None},
    "category_code": {"$exists": True, "$ne": None},
    "founded_year": {"$exists": True, "$gte": 2003},
    "deadpooled_year": None,
    "number_of_employees": {"$exists": True},
    "total_money_raised": {"$exists": True, "$ne": None, "$not": {"$size": 0}},
    "$or": [
        # NOTE(review): the sample output shows total_money_raised as strings such
        # as "$16.5M"; if every document stores it that way, this numeric
        # comparison never matches and only the category branch selects rows.
        # TODO confirm and decide how the 1M threshold should be enforced.
        {"total_money_raised": {"$gte": 1000000}},
        {"category_code": {"$in": wanted_categories}},
    ],
}

# Fields returned for each matching company ("_id" is excluded).
wanted_fields = {
    "_id": 0,
    "category_code": 1,
    "founded_year": 1,
    "name": 1,
    "offices.country_code": 1,
    "offices.latitude": 1,
    "offices.longitude": 1,
    "total_money_raised": 1,
    "number_of_employees": 1,
}

mongodata = db.companies.find(company_filter, wanted_fields)


In [3]:
# However, the "offices" field is a list of embedded dicts. It holds the location of each of the company's
# offices (and, implicitly, how many offices there are).
# We use json_normalize to extract this information and flatten the field.

In [4]:
# Flatten the embedded "offices" list into rows, carrying the company-level
# fields listed in `company_fields` along as extra columns.
company_fields = ["category_code", "founded_year", "name",
                  "total_money_raised", "number_of_employees"]
ydataframe = json_normalize(mongodata, record_path="offices", meta=company_fields)

In [5]:
# Inspect the flattened frame: first rows, dimensions, and column labels.
display(ydataframe.head())
for summary in (ydataframe.shape, ydataframe.columns):
    print(summary)

Unnamed: 0,country_code,latitude,longitude,category_code,founded_year,name,total_money_raised,number_of_employees
0,USA,34.090368,-118.393064,web,2006,Geni,$16.5M,18.0
1,USA,37.48413,-122.169472,web,2003,Gizmoz,$18.1M,
2,USA,34.057498,-118.446596,mobile,2005,Helio,$0,
3,USA,40.746497,-74.009447,games_video,2006,Joost,$45M,0.0
4,USA,37.778613,-122.395289,search,2006,Powerset,$22.5M,60.0


(4207, 8)
Index(['country_code', 'latitude', 'longitude', 'category_code',
       'founded_year', 'name', 'total_money_raised', 'number_of_employees'],
      dtype='object')


In [6]:
# Rearrange the columns: identifying fields first, location next, metrics last.
column_order = ['country_code', 'name', 'founded_year', 'category_code',
                'latitude', 'longitude', 'total_money_raised', 'number_of_employees']
df = ydataframe.reindex(columns=column_order)
display(df.head())


Unnamed: 0,country_code,name,founded_year,category_code,latitude,longitude,total_money_raised,number_of_employees
0,USA,Geni,2006,web,34.090368,-118.393064,$16.5M,18.0
1,USA,Gizmoz,2003,web,37.48413,-122.169472,$18.1M,
2,USA,Helio,2005,mobile,34.057498,-118.446596,$0,
3,USA,Joost,2006,games_video,40.746497,-74.009447,$45M,0.0
4,USA,Powerset,2006,search,37.778613,-122.395289,$22.5M,60.0


In [8]:
# Count how many rows carry each company name and store that count on every row
# as "total_offices". Companies are matched by name only, so two different
# companies sharing a name would be counted together.
offices_per_name = df["name"].value_counts()
df["total_offices"] = df["name"].map(offices_per_name)

In [9]:
# At this stage we have unpacked all the data inside "offices" and added a column with
# the number of offices of each company. We have the coordinates of each office, but we
# still need to turn them into geopoints.
display(df.head(5))
print(df.shape)

Unnamed: 0,country_code,name,founded_year,category_code,latitude,longitude,total_money_raised,number_of_employees,total_offices
0,USA,Geni,2006,web,34.090368,-118.393064,$16.5M,18.0,1
1,USA,Gizmoz,2003,web,37.48413,-122.169472,$18.1M,,1
2,USA,Helio,2005,mobile,34.057498,-118.446596,$0,,1
3,USA,Joost,2006,games_video,40.746497,-74.009447,$45M,0.0,1
4,USA,Powerset,2006,search,37.778613,-122.395289,$22.5M,60.0,1


(4207, 9)


In [10]:
def get_offices_locat2(lon, lat):
    """Build a GeoJSON-style ``Point`` dict (for MongoDB) from one coordinate pair.

    The coordinates are stored in ``[lon, lat]`` order.
    """
    return {"type": "Point", "coordinates": [lon, lat]}

#df.apply(lambda x: get_offices_locat2(x['longitude'], x['latitude']), axis=1)
# With this function we apply the points coordinate conversion

In [11]:
# Build one Point dict per row from that row's longitude and latitude.
df["geoloc"] = df.apply(
    lambda office: get_offices_locat2(lon=office["longitude"], lat=office["latitude"]),
    axis="columns",
)


In [12]:
# Preview the frame with the new "geoloc" column and print its dimensions.
display(df.head(n=5))
print(df.shape)

Unnamed: 0,country_code,name,founded_year,category_code,latitude,longitude,total_money_raised,number_of_employees,total_offices,geoloc
0,USA,Geni,2006,web,34.090368,-118.393064,$16.5M,18.0,1,"{'type': 'Point', 'coordinates': [-118.393064,..."
1,USA,Gizmoz,2003,web,37.48413,-122.169472,$18.1M,,1,"{'type': 'Point', 'coordinates': [-122.169472,..."
2,USA,Helio,2005,mobile,34.057498,-118.446596,$0,,1,"{'type': 'Point', 'coordinates': [-118.446596,..."
3,USA,Joost,2006,games_video,40.746497,-74.009447,$45M,0.0,1,"{'type': 'Point', 'coordinates': [-74.0094471,..."
4,USA,Powerset,2006,search,37.778613,-122.395289,$22.5M,60.0,1,"{'type': 'Point', 'coordinates': [-122.395289,..."


(4207, 10)


In [13]:
# Clean number_of_employees: missing values become 0 and the column is cast to int.
# A single assignment does both steps; calling fillna(..., inplace=True) on a
# column selection is chained mutation, which pandas warns about and which does
# not modify the frame under copy-on-write.
df["number_of_employees"] = df["number_of_employees"].fillna(0).astype(int)
print(df.shape)

(4207, 10)


In [14]:
# Keep only the rows with more than 50 employees (remember that we are looking
# for companies with dynamic teams and a lot of activity).
has_large_team = df["number_of_employees"] > 50
df = df[has_large_team]


In [17]:
# We have also been asked to use the value of the company as a condition, so we
# drop the rows whose total_money_raised is "$0".
# .copy() makes the result an independent frame: a column is assigned into it
# further down, and doing that on a filtered slice triggers SettingWithCopyWarning.
df = df[df["total_money_raised"] != "$0"].copy()


def capital_search(m):
    """Keep a funding value only when it is expressed in dollars.

    Parameters
    ----------
    m : str
        Raw ``total_money_raised`` value, e.g. ``"$16.5M"``.

    Returns
    -------
    str or None
        ``m`` unchanged when it is a string containing ``"$"``; ``None``
        otherwise (other currencies, or non-string values such as None/NaN).
    """
    # Non-string values cannot be searched for a currency symbol; treat them as
    # "no dollar amount" instead of raising TypeError.
    if not isinstance(m, str):
        return None
    # Only the dollar sign decides. The magnitude suffixes (M, k, K, B) must not
    # be accepted on their own, otherwise amounts in any other currency that
    # carry a suffix would be kept as well.
    return m if "$" in m else None


# Run every amount through capital_search (rejected values become None), then
# show the tail of the value counts.
checked_amounts = df["total_money_raised"].apply(capital_search)
df["total_money_raised"] = checked_amounts
df["total_money_raised"].value_counts().tail()


$91.5M    1
$40.3M    1
$18.4M    1
$39.3M    1
$1M       1
Name: total_money_raised, dtype: int64

In [18]:
# Check how many rows and columns are left in the DataFrame after the filters:
df.shape

(239, 10)

In [16]:
# Let's check how many rows we have left per category_code.
# Note: the frame was built with record_path="offices", so each row is an office,
# not a company.
df["category_code"].value_counts()

software           51
advertising        49
enterprise         40
web                35
games_video        15
analytics          11
ecommerce          11
mobile              8
network_hosting     8
search              6
design              3
photo_video         1
biotech             1
Name: category_code, dtype: int64

In [24]:
# To see where the companies that meet our conditions are concentrated, we draw a map.
# The map works on its own copy of the filtered frame.
for_map = df.copy(deep=True)

def hotspots(col_lat, col_lon):
    """Draw a heat map of the given coordinates.

    Parameters
    ----------
    col_lat, col_lon : iterable of float
        Latitudes and longitudes, paired by position, e.g.
        ``hotspots(df["latitude"], df["longitude"])``.

    Returns
    -------
    folium.Map
        Map holding one ``HeatMap`` layer (radius 15) built from the points.
    """
    # Use the arguments instead of reading the global `for_map`, so the function
    # plots whatever columns the caller passes. Plain [lat, lon] lists also avoid
    # DataFrame.as_matrix(), which was removed in pandas 1.0.
    points = [[lat, lon] for lat, lon in zip(col_lat, col_lon)]
    statmap = folium.Map(zoom_start=16)
    # add_child is the supported name; add_children is a deprecated alias.
    statmap.add_child(HeatMap(points, radius=15))
    return statmap

In [25]:
# As the map shows, most of our companies are located in North America & Europe (such a surprise!)
hotspots(col_lat=for_map['latitude'], col_lon=for_map['longitude'])

  import sys
  if __name__ == '__main__':


In [None]:
# Now we move to mongodb to find their proximity by launching geoqueries using its geopoints and the new index.

In [26]:
# Write the filtered frame to afines.json using the "records" orientation.
df.to_json(path_or_buf='afines.json', orient='records')