In [87]:
from pymongo import MongoClient
import numpy as np
import pandas as pd
import time
import geopandas as gdp
from cartoframes.viz import Map, Layer, popup_element
from dotenv import load_dotenv
from getpass import getpass


In [51]:
client = MongoClient("localhost:27017")

db = client["ironhack"]


In [52]:
db.list_collection_names()

['nyrest', 'Crunchbase', 'restaurants', 'nyneigh']

In [53]:
c = db.get_collection("Crunchbase")

In [6]:
token = getpass()

In [54]:
company_category = set()

In [78]:
tech_name_regex = {"name":{"$regex":"^(tech|frontend|backend|web develop|ux develop|ui develop|gaming|gamer)", "$options":"i"}}
tech_category_regex ={"category_code":{"$regex":"^(tech|frontend|backend|web develop|ux develop|ui develop|gaming|gamer)", "$options":"i"}}
tech_description_regex = {"description":{"$regex":"^(tech|frontend|backend|web develop|ux develop|ui develop|gaming|gamer)", "$options":"i"}}
tech_tags_regex = {"tag_list":{"$regex":"^(tech|frontend|backend|web develop|ux develop|ui develop|gaming|gamer)", "$options":"i"}}

tech_cat_or_descr_or_tags = {"$or": [tech_name_regex, tech_category_regex, tech_description_regex, tech_tags_regex]}

projection_name_category_tags = {"_id" : 0, 
                                "name" : 1, 
                                "category_code": 1, 
                                "tag_list": 1,
                                "total_money_raised": 1,
                                "offices.country_code" : 1, 
                                "offices.city" : 1, 
                                "offices.zip_code" : 1, 
                                "offices.latitude" : 1, 
                                "offices.longitude" : 1}

tech_companies_list = list(c.find(tech_cat_or_descr_or_tags, projection_name_category_tags).sort("offices.city", 1))

In [79]:
df_tech_general = pd.DataFrame(tech_companies_list)

In [97]:
df_tech_general.sample(5)

Unnamed: 0,name,category_code,tag_list,total_money_raised,offices
203,Atlantic BT,web,"web-development, web-design, internet-marketin...",$0,"[{'zip_code': '27613', 'city': 'Raleigh', 'cou..."
123,Burt,advertising,"techcrunch50, tc50, advertising, reactive-adve...",$3.5M,"[{'zip_code': '41125', 'city': 'Gothenburg', '..."
19,MIXTT,web,"techcrunch50, tc50, social-networking, social-...",$0,[]
85,qcue,software,"technology, consulting, software, pricing, pri...",$1.13M,"[{'zip_code': '78759', 'city': 'Austin', 'coun..."
234,TechWeb,other,,$0,"[{'zip_code': '94107', 'city': 'San Francisco'..."


In [99]:
def clean_monetary_values(dataframe, column):
    
    """
    This function works with vectorized operations instead of iterrows() looping.
    It takes a DataFrame and a column name as input, and converts the monetary values
    listed in the format of "$X.XM" or "$XXXk" to integers. Then, it multiplies according
    to the letter indicator.
    It drops the temporary column for letters, and returns the updated clean DataFrame.
    """


    dataframe[['value', 'letter']] = dataframe[column].str.extract('(\d+\.\d+|\d+)([MmKk])')
    dataframe['value'] = dataframe['value'].astype(int)
    dataframe['value'] = dataframe['value'].mul(1000000) if dataframe['letter'] == 'M' else dataframe['value'].mul(1000)
    dataframe.drop('letter', axis=1, inplace=True)
    dataframe.rename(columns={'value': column}, inplace=True)
    return dataframe

In [81]:
design_name_regex = {"name":{"$regex":"design", "$options":"i"}}
design_category_regex ={"category_code":{"$regex":"design", "$options":"i"}}
design_description_regex = {"description":{"$regex":"design", "$options":"i"}}
design_tags_regex = {"tag_list":{"$regex":"design", "$options":"i"}}

design_cat_or_descr_or_tags = {"$or": [design_name_regex, design_category_regex, design_description_regex, design_tags_regex]}

design_projection_name_category_tags = {"_id" : 0, 
                                        "name" : 1, 
                                        "category_code": 1, 
                                        "tag_list": 1,
                                        "offices.country_code" : 1, 
                                        "offices.city" : 1, 
                                        "offices.zip_code" : 1, 
                                        "offices.latitude" : 1, 
                                        "offices.longitude" : 1}

design_companies_list = list(c.find(design_cat_or_descr_or_tags, design_projection_name_category_tags).sort("offices.city", 1))

In [82]:
len(design_companies_list)

779

In [83]:
df_design = pd.DataFrame(design_companies_list)

In [84]:
df_design.sample()

Unnamed: 0,name,category_code,tag_list,offices
523,n3w media,consulting,"web-design, digital, marketing, social-media","[{'zip_code': 'MK9 2AH', 'city': 'Milton Keyne..."


In [88]:
def split_dic_df_columns(dataframe, dataframe_column):
    
    """This funcion takes 2 parameters:
    1. name of given dataframe
    2. name of column that contains dictionaries as values

    It will iterate over the rows, then the specified column, checking for missing values.
    If the values is not missing, then iterate over each 
    dictionary item, and chek if the column to be created already exists.
    Then initialize the column values to zero, and then update values
    with the ones corresponding in the dictionary.
    return the original dataframe updated with a new column per each 
    key in the dictionaries, and its values.
    """
    for i, row in dataframe.iterrows():
        for j in row[dataframe_column]:
            if not pd.isnull(j):
                for x, y in j.items():
                    if x not in dataframe.columns:
                        dataframe[str(x)] = np.nan
                    dataframe.at[i, x] = y
    return dataframe



In [None]:
split_dic_df_columns(df_design, "offices")

In [95]:
df_design.sample(3)

Unnamed: 0,name,category_code,tag_list,offices,zip_code,city,country_code,latitude,longitude
605,Mobclix,mobile,"techcrunch50, tc50, iphone, analytics, free-de...","[{'zip_code': '94301', 'city': 'Palo Alto', 'c...",94301.0,Palo Alto,USA,37.448598,-122.158497
129,Focus-online,ecommerce,"designer-sunglasses, designer-frames, optical-...",[],,,,,
209,MEDDIA,public_relations,"knowledge-managent, web-design, consulting, di...","[{'zip_code': '08013', 'city': 'Barcelona', 'c...",8013.0,Barcelona,ESP,41.401809,2.181565
