In [113]:
from pymongo import MongoClient
import numpy as np
import pandas as pd
import time
import geopandas as gdp
from cartoframes.viz import Map, Layer, popup_element
from dotenv import load_dotenv
from getpass import getpass
import re


In [6]:
token = getpass()

In [140]:
client = MongoClient("localhost:27017")

db = client["ironhack"]


In [141]:
db.list_collection_names()

['nyrest', 'Crunchbase', 'restaurants', 'nyneigh']

In [142]:
c = db.get_collection("Crunchbase")

In [143]:
company_category = set()

In [149]:
def tech_df_function():

    tech_name_regex = {"name":{"$regex":"^(tech|frontend|backend|web develop|ux develop|ui develop|gaming|gamer)", "$options":"i"}}
    tech_category_regex ={"category_code":{"$regex":"^(tech|frontend|backend|web develop|ux develop|ui develop|gaming|gamer)", "$options":"i"}}
    tech_description_regex = {"description":{"$regex":"^(tech|frontend|backend|web develop|ux develop|ui develop|gaming|gamer)", "$options":"i"}}
    tech_tags_regex = {"tag_list":{"$regex":"^(tech|frontend|backend|web develop|ux develop|ui develop|gaming|gamer)", "$options":"i"}}

    tech_cat_or_descr_or_tags = {"$or": [tech_name_regex, tech_category_regex, tech_description_regex, tech_tags_regex]}

    projection_name_category_tags = {"_id" : 0, 
                                    "name" : 1, 
                                    "category_code": 1, 
                                    "tag_list": 1,
                                    "total_money_raised": 1,
                                    "offices.country_code" : 1, 
                                    "offices.city" : 1, 
                                    "offices.zip_code" : 1, 
                                    "offices.latitude" : 1, 
                                    "offices.longitude" : 1}

    tech_companies_list = list(c.find(tech_cat_or_descr_or_tags, projection_name_category_tags).sort("offices.city", 1))
    df_tech_general = pd.DataFrame(tech_companies_list)

    return df_tech_general

In [151]:
tech_df_function()

Unnamed: 0,name,category_code,tag_list,total_money_raised,offices
0,Sparter,games_video,"gaming, game, wow, worldofwarcraft, virtualgoods",$0,"[{'zip_code': None, 'city': None, 'country_cod..."
1,Devunity,web,"techcrunch50, tc50",$100k,[]
2,TechJuicer,,,$0,[]
3,Lockergnome,,"tech-news, it-news, blog",$0,[]
4,Apperceptive,web,design,$0,[]
...,...,...,...,...,...
280,Pacecode Technologies,consulting,"website-development-in-india, website-design-i...",$0,"[{'zip_code': '600020', 'city': 'chennai', 'co..."
281,MakeMyLink,advertising,"technology, web-design, seo, software-blog, guide",$0,"[{'zip_code': '110075', 'city': 'new delhi', '..."
282,Tech Support 4 NYC,consulting,"computer-consulting, tech-support, new-york-city",$0,"[{'zip_code': '10018', 'city': 'new york', 'co..."
283,GoPlanit,web,"techcrunch50, tc50, travel, trip-planner, mobi...",$500k,"[{'zip_code': '94111', 'city': 'san francisco'..."


In [147]:
def clean_monetary_values(dataframe, column):
    
    for index, row in dataframe.iterrows():
        if pd.isnull(row[column]):
            continue
        
        match = re.search('(\d+\.\d+|\d+)([MmKk])', row[column])
        if not match:
            dataframe.at[index, column] = pd.np.nan
            continue
        
        value, letter = match.groups()
        value = float(value)
        if value <= 0:
            dataframe.at[index, column] = pd.np.nan
            continue
        
        if letter.upper() == 'M':
            value *= 1000000
        else:
            value *= 1000
        
        dataframe.at[index, column] = value
    
    return dataframe


In [148]:
clean_monetary_values(df_tech_general, "total_money_raised")

  dataframe.at[index, column] = pd.np.nan


Unnamed: 0,name,category_code,tag_list,total_money_raised,offices
0,Sparter,games_video,"gaming, game, wow, worldofwarcraft, virtualgoods",,"[{'zip_code': None, 'city': None, 'country_cod..."
1,Devunity,web,"techcrunch50, tc50",100000.0,[]
2,TechJuicer,,,,[]
3,Lockergnome,,"tech-news, it-news, blog",,[]
4,Apperceptive,web,design,,[]
...,...,...,...,...,...
280,Pacecode Technologies,consulting,"website-development-in-india, website-design-i...",,"[{'zip_code': '600020', 'city': 'chennai', 'co..."
281,MakeMyLink,advertising,"technology, web-design, seo, software-blog, guide",,"[{'zip_code': '110075', 'city': 'new delhi', '..."
282,Tech Support 4 NYC,consulting,"computer-consulting, tech-support, new-york-city",,"[{'zip_code': '10018', 'city': 'new york', 'co..."
283,GoPlanit,web,"techcrunch50, tc50, travel, trip-planner, mobi...",500000.0,"[{'zip_code': '94111', 'city': 'san francisco'..."


In [134]:
def design_df_function():
   
    design_name_regex = {"name":{"$regex":"design", "$options":"i"}}
    design_category_regex ={"category_code":{"$regex":"design", "$options":"i"}}
    design_description_regex = {"description":{"$regex":"design", "$options":"i"}}
    design_tags_regex = {"tag_list":{"$regex":"design", "$options":"i"}}

    design_cat_or_descr_or_tags = {"$or": [design_name_regex, design_category_regex, design_description_regex, design_tags_regex]}

    design_projection_name_category_tags = {"_id" : 0, 
                                            "name" : 1, 
                                            "category_code": 1, 
                                            "tag_list": 1,
                                            "offices.country_code" : 1, 
                                            "offices.city" : 1, 
                                            "offices.zip_code" : 1, 
                                            "offices.latitude" : 1, 
                                            "offices.longitude" : 1}

    design_companies_list = list(c.find(design_cat_or_descr_or_tags, design_projection_name_category_tags).sort("offices.city", 1))
    df_design = pd.DataFrame(design_companies_list)
    return df_design

In [135]:
design_df_function()

Unnamed: 0,name,category_code,tag_list,offices
0,Jasper Design,,,[]
1,Trunkt,ecommerce,"art, design, directory, marketplace, wholesale",[]
2,Apperceptive,web,design,[]
3,Medium Design Group,,,[]
4,BootB,advertising,"online-marketplace, creativity, marketing-serv...",[]
...,...,...,...,...
774,ecreative,other,"design, freelance-","[{'zip_code': '80300', 'city': 'istanbul', 'co..."
775,magento xperts,ecommerce,"magento-company, magento-themes, magento-templ...","[{'zip_code': '700064', 'city': 'kolkata', 'co..."
776,MakeMyLink,advertising,"technology, web-design, seo, software-blog, guide","[{'zip_code': '110075', 'city': 'new delhi', '..."
777,25 Pixels Media,other,"web-design, costa-rica, blogs, network, spanis...","[{'zip_code': '', 'city': 'san jose', 'country..."


In [88]:
def split_dic_df_columns(dataframe, dataframe_column):
    
    """This funcion takes 2 parameters:
    1. name of given dataframe
    2. name of column that contains dictionaries as values

    It will iterate over the rows, then the specified column, checking for missing values.
    If the values is not missing, then iterate over each 
    dictionary item, and chek if the column to be created already exists.
    Then initialize the column values to zero, and then update values
    with the ones corresponding in the dictionary.
    return the original dataframe updated with a new column per each 
    key in the dictionaries, and its values.
    """
    for i, row in dataframe.iterrows():
        for j in row[dataframe_column]:
            if not pd.isnull(j):
                for x, y in j.items():
                    if x not in dataframe.columns:
                        dataframe[str(x)] = np.nan
                    dataframe.at[i, x] = y
    return dataframe



In [None]:
split_dic_df_columns(df_design, "offices")

In [95]:
df_design.sample(3)

Unnamed: 0,name,category_code,tag_list,offices,zip_code,city,country_code,latitude,longitude
605,Mobclix,mobile,"techcrunch50, tc50, iphone, analytics, free-de...","[{'zip_code': '94301', 'city': 'Palo Alto', 'c...",94301.0,Palo Alto,USA,37.448598,-122.158497
129,Focus-online,ecommerce,"designer-sunglasses, designer-frames, optical-...",[],,,,,
209,MEDDIA,public_relations,"knowledge-managent, web-design, consulting, di...","[{'zip_code': '08013', 'city': 'Barcelona', 'c...",8013.0,Barcelona,ESP,41.401809,2.181565


In [None]:
split_dic_df_columns(df_tech_general, "offices")