In [1]:
import requests
import json
import os
import pandas as pd
from functools import reduce
import operator
import pyjsonviewer
import geopandas as gpd
import time
from bs4 import BeautifulSoup
from shapely.geometry import Point
from pymongo import MongoClient
from dotenv import load_dotenv
import numpy as np
from pymongo import GEOSPHERE
import shapely.geometry

load_dotenv()

True

## Funciones útiles

In [2]:
def getFromDict(diccionario,mapa):
    return reduce(operator.getitem,mapa,diccionario)

In [3]:
def type_point(lista):
    return {"type":"Point", "coordinates": lista}

In [4]:
def geocode(direccion):
    """
    Esta función saca las coordenadas de la dirección que le pases
    """
    data = requests.get(f"https://geocode.xyz/{direccion}?json=1").json()
    try:
        return {"type": "Point", "coordinates": [data["latt"], data["longt"]]}
    except:
        return data

In [5]:
client = MongoClient("localhost:27017")
db = client.get_database("ironhack")
geospatial = db.get_collection("geospatial")
geoaus = db.get_collection("geoaus")
geonew = db.get_collection("geonew")
geoham = db.get_collection("geoham")

## Lista de categorías e id de Foursquare
- scrapping

In [6]:
def get_categories():
    url = "https://developer.foursquare.com/docs/build-with-foursquare/categories/"
    html = requests.get(url)
    soup = BeautifulSoup(html.content,"html.parser")
    venue_cat = soup.find_all("ul", {"class": "VenueCategories__Wrapper-sc-15dn453-0 dmcDKQ"})
    cat = {}
    li = venue_cat[0].find_all("li")
    for l in li:
        cat[l.find("h3").getText()] = l.find("p").getText()
        
    return pd.DataFrame.from_dict(cat,orient="Index").reset_index().rename(columns={"index": "category", 0: "categoryId"})

categories = get_categories()

In [7]:
categories.sample(3)

Unnamed: 0,category,categoryId
595,Waterfall,56aa371be4b08b9a8d573560
99,Medical School,4bf58dd8d48988d1b3941735
154,Szechuan Restaurant,52af3b773cf9994f4e043c03


In [8]:
def categories_dic(cat_list):    
    dicc = {}
    for c in cat_list:
        dicc[c] = categories.categoryId[categories["category"] == c].values[0]
    return dicc

In [9]:
venues_list = ["Design Studio","Nursery School","Preschool","Elementary School","Tech Startup","Coffee Shop",
               "Airport Terminal","Train Station",
              "Nightlife Spot","Vegetarian / Vegan Restaurant","Basketball Stadium",
               "Veterinarian","Pet Service","Pet Store"]  

In [10]:
cat_dic = categories_dic(venues_list)
cat_dic

{'Design Studio': '4bf58dd8d48988d1f4941735',
 'Nursery School': '4f4533814b9074f6e4fb0107',
 'Preschool': '52e81612bcbc57f1066b7a45',
 'Elementary School': '4f4533804b9074f6e4fb0105',
 'Tech Startup': '4bf58dd8d48988d125941735',
 'Coffee Shop': '4bf58dd8d48988d1e0931735',
 'Airport Terminal': '4bf58dd8d48988d1eb931735',
 'Train Station': '4bf58dd8d48988d129951735',
 'Nightlife Spot': '4d4b7105d754a06376d81259',
 'Vegetarian / Vegan Restaurant': '4bf58dd8d48988d1d3941735',
 'Basketball Stadium': '4bf58dd8d48988d18b941735',
 'Veterinarian': '4d954af4a243a5684765b473',
 'Pet Service': '5032897c91d4c4b30a586d69',
 'Pet Store': '4bf58dd8d48988d100951735'}

In [11]:
def localizations_coor(dicc):
    local = {}
    for city,loc in localizations_dicc.items():
        local[city] = geocode(loc)["coordinates"]
        time.sleep(15)
    return local
    

In [12]:
# localizations_dicc={"Austin":"Texas Capitol, Austin, Texas","Hamburg":"Hamburg city center,Hamburg,Germany","Newcastle":"Newcastle upon Tyne, United Kingdom"}
# localizations = localizations_coor(localizations_dicc)
# localizations

In [13]:
 localizations = {'Austin': ['30.27125853189173', '-97.75551842126049'],
 'Hamburg': ['53.55687309076416', '10.002900693677232'],
 'Newcastle': ['54.96923892979405', '-1.6162802662146134']}

In [14]:
def place_store_dicc(localizations,cat_dic):
    url_query = 'https://api.foursquare.com/v2/venues/search'
    client_id = os.getenv("four")
    client_secret = os.getenv("foursec")
    
    venues_dic = {}
    aus_dic ={}
    ham_dic = {}
    ncas_dic ={}
    for loc,coor in localizations.items():
        for cat,cat_id in cat_dic.items():
            time.sleep(1)
            parametros={
                "client_id": client_id,
                "client_secret": client_secret,
                "v": "20180323",
                "ll": f"{coor[0]}, {coor[1]}",
                "categoryId":cat_id,
                "radius":1000
            }
            if loc == "Austin":
                aus_dic[cat] = requests.get(url_query, params = parametros).json()["response"]["venues"] 
                if cat == "Pet Store":
                    print(f"{loc} done")
            elif loc == "Hamburg":                
                ham_dic[cat] = requests.get(url_query, params = parametros).json()["response"]["venues"]
                if cat == "Pet Store":
                    print(f"{loc} done")
            elif loc == "Newcastle":                
                ncas_dic[cat] = requests.get(url_query, params = parametros).json()["response"]["venues"] 
                if cat == "Pet Store":
                    print(f"{loc} done")
            
    venues_dic = {"Austin":aus_dic,
                 "Hamburg":ham_dic,
                  "Newcastle": ncas_dic
                 }

    return venues_dic


In [15]:
total_venues = place_store_dicc(localizations,cat_dic)

Austin done
Hamburg done
Newcastle done


In [16]:
with open('../data/total_venues.json', 'w') as f:
    json.dump(total_venues, f)

In [17]:
def extraetodo(json):
    total = []
    for city,categories in json.items():
        for cat,value in categories.items():
            try:
                todo = {"nombre": ["name"],"sub_category":["categories",0,"name"],
                        "latitud": ["location", "lat"], "longitud": ["location", "lng"],
                        "distance":["location","distance"]}

                for elemento in value:
                    store = {key: getFromDict(elemento,value) for key,value in todo.items()}
                    store["location"] = type_point([store["latitud"], store["longitud"]])
                    store["city"] = city
                    store["category"] = cat
                    total.append(store)

                #print(f"{elemento} done")
            except:

                print(f"{value.index(elemento)} wrong")

    return total

In [18]:
venues = extraetodo(total_venues)

In [19]:
venues_df = pd.DataFrame(venues)

In [20]:
venues_df.sample(3)

Unnamed: 0,nombre,sub_category,latitud,longitud,distance,location,city,category
307,Pink Lane Coffee,Coffee Shop,54.969221,-1.617804,97,"{'type': 'Point', 'coordinates': [54.969220780...",Newcastle,Coffee Shop
320,Costa Coffee,Coffee Shop,54.973743,-1.62146,600,"{'type': 'Point', 'coordinates': [54.973743, -...",Newcastle,Coffee Shop
206,Elbgold,Coffee Shop,53.549764,9.992869,1032,"{'type': 'Point', 'coordinates': [53.549763971...",Hamburg,Coffee Shop


In [21]:
venues_df.category.unique()

array(['Design Studio', 'Preschool', 'Elementary School', 'Tech Startup',
       'Coffee Shop', 'Train Station', 'Nightlife Spot',
       'Vegetarian / Vegan Restaurant', 'Veterinarian', 'Pet Service',
       'Pet Store'], dtype=object)

In [22]:
venues_df.category = venues_df.category.str.replace(r"(.*[Ss](chool|CHOOL).*)","School",regex=True)
venues_df.category = venues_df.category.str.replace(r"(.*[Pp](et|ET)\s.*|.*[Vv](eterinarian|ETERINARIAN).*)","Dobby",regex=True)#

In [23]:
ponder = {"Basketball Stadium":13,"Coffee Shop":7,"Design Studio":6,"Dobby":20,"Nightlife Spot":9,"School":15,
         "Tech Startup":4,"Train Station":8,"Vegetarian / Vegan Restaurant":18}

In [24]:
venues_df['ponder']= venues_df['category'].map(ponder)

In [34]:
#borrar todos los cafés que no sean starbucks
no_starbucks = list(venues_df.index[(venues_df["category"] == "Coffee Shop") & (venues_df["nombre"] != "Starbucks")])
venues_df.drop(no_starbucks,axis=0, inplace = True)

In [35]:
venues_df.reset_index(drop = True,inplace = True)

In [36]:
venues_df.to_csv("../data/venues_df.csv",index = False)

In [37]:
geo_venues = gpd.GeoDataFrame(venues_df, geometry=gpd.points_from_xy(venues_df.longitud,venues_df.latitud))
geo_venues.drop("location", axis=1, inplace = True)
geo_venues.head(2)

Unnamed: 0,nombre,sub_category,latitud,longitud,distance,city,category,ponder,geometry
0,Urbanspace Interiors,Design Studio,30.269045,-97.75153,455,Austin,Design Studio,6,POINT (-97.75153 30.26905)
1,Jonathan Adler,Design Studio,30.270651,-97.755635,68,Austin,Design Studio,6,POINT (-97.75563 30.27065)


In [38]:
geo_venues.to_csv("../data/geo_venus.csv",index=False)

In [39]:
geo_venues['geometry']=geo_venues['geometry'].apply(lambda x:shapely.geometry.mapping(x))
geo_venus_dic = geo_venues.to_dict(orient="records")
geospatial.create_index([("geometry", GEOSPHERE)])
geospatial.insert_many(geo_venus_dic)



<pymongo.results.InsertManyResult at 0x7fa23585dd40>

In [40]:
aus_coor = [-97.75551842126049,30.27125853189173] #longitud / latitud
Ham_coor =[10.002900693677232, 53.55687309076416]
Ncas_coor =  [-1.6162802662146134 ,54.96923892979405]

In [41]:
austin_near = [{"$geoNear": {
            "near": aus_coor,
            "distanceField": "lejos",
            "maxDistance": 1000,
             "distanceMultiplier" :6371,
            "spherical": True}}]
geoloc_austin = geospatial.aggregate(austin_near)
geoaus.create_index([("geometry", GEOSPHERE)])
geoaus.insert_many(geoloc_austin)

<pymongo.results.InsertManyResult at 0x7fa23553f4c0>

In [42]:
hamburg_near = [{"$geoNear": {
            "near": Ham_coor,
            "distanceField": "lejos",
            "maxDistance": 2000,
             "distanceMultiplier" :6371,
            "spherical": True}}]
geoloc_hamburg = geospatial.aggregate(hamburg_near)
geoham.create_index([("geometry", GEOSPHERE)])
geoham.insert_many(geoloc_hamburg)

<pymongo.results.InsertManyResult at 0x7fa235931740>

In [43]:
Newcastle_near = [{"$geoNear": {
            "near": Ncas_coor,
            "distanceField": "lejos",
            "maxDistance": 2000,
            "distanceMultiplier" :6371,
            "spherical": True}}]
geoloc_ncas = geospatial.aggregate(Newcastle_near)
geonew.create_index([("geometry", GEOSPHERE)])
geonew.insert_many(geoloc_ncas)

<pymongo.results.InsertManyResult at 0x7fa235497500>

In [62]:
proy = {"nombre":1,"latitud":1,"longitud":1,"category":1,"lejos":1,"_id":0,"ponder":1}

In [63]:
cond_aus = {"city":"Austin"}
austin_venues = list(geoaus.find(cond_aus,proy))

In [64]:
cond_ham = {"city":"Hamburg"}
hamburg_venues = list(geoham.find(cond_ham,proy))

In [65]:
cond_new = {"city":"Newcastle"}
Newcastle_venues = list(geonew.find(cond_new,proy))

In [66]:
aus_dis = pd.DataFrame(austin_venues)
ham_dis = pd.DataFrame(hamburg_venues)
new_dis = pd.DataFrame(Newcastle_venues)

In [67]:
aus_dis["metros"] = round(aus_dis["lejos"]*1000,2)
ham_dis["metros"] = round(ham_dis["lejos"]*1000,2)
new_dis["metros"] = round(new_dis["lejos"]*1000,2)


In [68]:
# #geo_venues.to_csv("../data/geo_venus.csv",index=False)
# aus_dis.to_csv = ("aus_dis.csv")
# ham_dis.to_csv = ("../data/ham_dis.csv")
# new_dis.to_csv = ("../data/new_dis.csv")

In [73]:
aus_media = aus_dis.groupby(["category","ponder"]).agg({"metros":"mean"}).reset_index()
ham_media = ham_dis.groupby(["category","ponder"]).agg({"metros":"mean"}).reset_index()
new_media = new_dis.groupby(["category","ponder"]).agg({"metros":"mean"}).reset_index()

In [74]:
aus_media

Unnamed: 0,category,ponder,metros
0,Coffee Shop,7,810.22
1,Design Studio,6,442.829231
2,Dobby,20,782.315556
3,Nightlife Spot,9,671.856
4,School,15,799.733333
5,Tech Startup,4,691.821333
6,Train Station,8,228.71
7,Vegetarian / Vegan Restaurant,18,561.215


In [75]:
aus_media.metros = round(aus_media.metros,2)
aus_media['punt'] = round(((aus_media["metros"]*aus_media["ponder"])/100),2)
Austin_punt = round(sum(aus_media.punt),2)
Austin_punt

567.17

In [76]:
ham_media.metros = round(ham_media.metros,2)
ham_media['ponder']= ham_media['category'].map(ponder)
ham_media['punt'] = round(((ham_media["metros"]*ham_media["ponder"])/100),2)
Hamburg_punt = round(sum(ham_media.punt),2)
Hamburg_punt

535.79

In [77]:
new_media.metros = round(new_media.metros,2)
new_media['ponder']= new_media['category'].map(ponder)
new_media['punt'] = round(((new_media["metros"]*new_media["ponder"])/100),2)
Newcastle_punt = round(sum(new_media.punt),2)
Newcastle_punt

316.62