In [1]:
import requests
import json
import os
import pandas as pd
from functools import reduce
import operator
import geopandas as gpd
import time
from bs4 import BeautifulSoup
import shapely.geometry
from shapely.geometry import Point
from pymongo import MongoClient
import numpy as np
from pymongo import GEOSPHERE
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
#conectamos Mongo
client = MongoClient("localhost:27017")
db = client.get_database("ironhack")
geospatial = db.get_collection("geospatial")

## Funciones útiles

In [3]:
def getFromDict(diccionario,mapa):
    return reduce(operator.getitem,mapa,diccionario)

In [4]:
def type_point(lista):
    return {"type":"Point", "coordinates": lista}

### Función para requests Foursquare


In [5]:
def place_store_dicc(localizations,cat_dic):
    """
    recibe un diccionario con localizaciones y un diccionario con categorías de Foursquare
    itera por cada localización y cada categoría
    Devuelve diccionarios anidados con las "venues" de cada request que se hace por ciudad/categoría
    """
    url_query = 'https://api.foursquare.com/v2/venues/search'
    client_id = os.getenv("four")
    client_secret = os.getenv("foursec")
    
    venues_dic = {}
    
    aus_dic ={}
    ham_dic = {}
    ncas_dic ={}
    
    for loc,coor in localizations.items():

        for cat,cat_id in cat_dic.items():
            #time.sleep(1)
            parametros={
                "client_id": client_id,
                "client_secret": client_secret,
                "v": "20180323",
                "ll": f"{coor[0]}, {coor[1]}",
                "categoryId":cat_id,
                "radius":1000
            } 
            
            #no he conseguido automatizar esta parte para que no sea personalizada a esta consulta en particular

            if loc == "Austin":
                aus_dic[cat] = requests.get(url_query, params = parametros).json()["response"]["venues"] 
                if cat == "Pet Store":
                    print(f"{loc} done")
            elif loc == "Hamburg":                
                ham_dic[cat] = requests.get(url_query, params = parametros).json()["response"]["venues"]
                if cat == "Pet Store":
                    print(f"{loc} done")
            elif loc == "Newcastle":                
                ncas_dic[cat] = requests.get(url_query, params = parametros).json()["response"]["venues"] 
                if cat == "Pet Store":
                    print(f"{loc} done")
            
    venues_dic = {"Austin":aus_dic,
                 "Hamburg":ham_dic,
                  "Newcastle": ncas_dic
                 }

    return venues_dic


## Lista de categorías e id de Foursquare
- scrapping

In [6]:
def get_categories():
    """
    scrapea las categorías de foursquare y devuelve un dataframe con el nombre e id de estas.
    """
    url = "https://developer.foursquare.com/docs/build-with-foursquare/categories/"
    html = requests.get(url)
    soup = BeautifulSoup(html.content,"html.parser")
    venue_cat = soup.find_all("ul", {"class": "VenueCategories__Wrapper-sc-15dn453-0 dmcDKQ"})
    cat = {}
    li = venue_cat[0].find_all("li")
    for l in li:
        cat[l.find("h3").getText()] = l.find("p").getText()
        
    return pd.DataFrame.from_dict(cat,orient="Index").reset_index().rename(columns={"index": "category", 0: "categoryId"})

categories = get_categories()

In [7]:
categories.sample(3)

Unnamed: 0,category,categoryId
876,Tanning Salon,4d1cf8421a97d635ce361c31
76,College Bookstore,4bf58dd8d48988d1b1941735
221,Café,4bf58dd8d48988d16d941735


In [8]:
def categories_dic(cat_list):
    """
    recibe un lista de categorías y devuleve un diccionario con el nombre de estas categorías y us id de foursquare
    """
    dicc = {}
    for c in cat_list:
        dicc[c] = categories.categoryId[categories["category"] == c].values[0]
    return dicc

In [9]:
venues_list = ["Design Studio","Nursery School","Preschool","Elementary School","Tech Startup","Coffee Shop",
               "Airport Terminal","Train Station",
              "Nightlife Spot","Vegetarian / Vegan Restaurant","Basketball Stadium",
               "Veterinarian","Pet Service","Pet Store"]  

In [10]:
cat_dic = categories_dic(venues_list)
cat_dic

{'Design Studio': '4bf58dd8d48988d1f4941735',
 'Nursery School': '4f4533814b9074f6e4fb0107',
 'Preschool': '52e81612bcbc57f1066b7a45',
 'Elementary School': '4f4533804b9074f6e4fb0105',
 'Tech Startup': '4bf58dd8d48988d125941735',
 'Coffee Shop': '4bf58dd8d48988d1e0931735',
 'Airport Terminal': '4bf58dd8d48988d1eb931735',
 'Train Station': '4bf58dd8d48988d129951735',
 'Nightlife Spot': '4d4b7105d754a06376d81259',
 'Vegetarian / Vegan Restaurant': '4bf58dd8d48988d1d3941735',
 'Basketball Stadium': '4bf58dd8d48988d18b941735',
 'Veterinarian': '4d954af4a243a5684765b473',
 'Pet Service': '5032897c91d4c4b30a586d69',
 'Pet Store': '4bf58dd8d48988d100951735'}

In [11]:
#como geocode no funcionaba muy bien, hemos copiado coordenadas de las ciudades elegidas en el estudio de "companies"
localizations = {'Austin': ['30.27125853189173', '-97.75551842126049'],
                 'Hamburg': ['53.55687309076416', '10.002900693677232'],
                 'Newcastle': ['54.96923892979405', '-1.6162802662146134']}

In [12]:
total_venues = place_store_dicc(localizations,cat_dic) #llamamos a la función

Austin done
Hamburg done
Newcastle done


In [13]:
def extraetodo(json):
    """
    recibe un diccionario con los datos extraídos de foursquare
    devuelve una lista de diccionarios con todos las "venues"
    """
    
    total = []
    for city,categories in json.items():
        for cat,value in categories.items():
            try:
                todo = {"nombre": ["name"],"sub_category":["categories",0,"name"],
                        "latitud": ["location", "lat"], "longitud": ["location", "lng"],
                        "distance":["location","distance"]}

                for elemento in value:
                    store = {key: getFromDict(elemento,value) for key,value in todo.items()}
                    store["location"] = type_point([store["latitud"], store["longitud"]])
                    store["city"] = city
                    store["category"] = cat
                    total.append(store)

                #print(f"{elemento} done")
            except:

                print(f"{value.index(elemento)} wrong")

    return total

In [14]:
venues = extraetodo(total_venues)
venues_df = pd.DataFrame(venues)
venues_df.sample(3)

Unnamed: 0,nombre,sub_category,latitud,longitud,distance,location,city,category
100,Whiskey Tango Foxtrot Icehouse,Bar,30.269775,-97.749836,570,"{'type': 'Point', 'coordinates': [30.269775, -...",Austin,Nightlife Spot
267,Electrik Sheep,Design Studio,54.969819,-1.619147,194,"{'type': 'Point', 'coordinates': [54.969819, -...",Newcastle,Design Studio
368,The Delta Lounge,Beer Bar,54.968422,-1.613041,226,"{'type': 'Point', 'coordinates': [54.968422, -...",Newcastle,Nightlife Spot


#### limpieza del dataframe de foursquare

In [15]:
venues_df.category.unique()

array(['Design Studio', 'Preschool', 'Elementary School', 'Tech Startup',
       'Coffee Shop', 'Train Station', 'Nightlife Spot',
       'Vegetarian / Vegan Restaurant', 'Veterinarian', 'Pet Service',
       'Pet Store'], dtype=object)

In [16]:
venues_df.category = venues_df.category.str.replace(r"(.*[Ss](chool|CHOOL).*)","School",regex=True)
venues_df.category = venues_df.category.str.replace(r"(.*[Pp](et|ET)\s.*|.*[Vv](eterinarian|ETERINARIAN).*)","Dobby",regex=True)#

In [17]:
#ponderamos las categorías según hemos elegido  
ponder = {"Basketball Stadium":0.13,"Coffee Shop":0.07,"Design Studio":0.06,
          "Dobby":0.20,"Nightlife Spot":0.9,"School":0.15,
         "Tech Startup":0.04,"Train Station":0.08,"Vegetarian / Vegan Restaurant":0.18}

In [18]:
venues_df['ponder']= venues_df['category'].map(ponder)

In [19]:
#borrar todos los cafés que no sean starbucks
no_starbucks = list(venues_df.index[(venues_df["category"] == "Coffee Shop") & (venues_df["nombre"] != "Starbucks")])
venues_df.drop(no_starbucks,axis=0, inplace = True)

In [20]:
venues_df.reset_index(drop = True,inplace = True)

In [21]:
venues_df.to_csv("../data/venues_df.csv",index = False) #me guardo el dataframe

## Geoquery 

In [22]:
#geodataframe
geo_venues = gpd.GeoDataFrame(venues_df, geometry=gpd.points_from_xy(venues_df.longitud,venues_df.latitud))
geo_venues.drop("location", axis=1, inplace = True)
geo_venues.head(2)

Unnamed: 0,nombre,sub_category,latitud,longitud,distance,city,category,ponder,geometry
0,Urbanspace Interiors,Design Studio,30.269045,-97.75153,455,Austin,Design Studio,0.06,POINT (-97.75153 30.26905)
1,Jonathan Adler,Design Studio,30.270651,-97.755635,68,Austin,Design Studio,0.06,POINT (-97.75563 30.27065)


In [23]:
geo_venues.to_csv("../data/geo_venus.csv",index=False) #me lo guardo por si acaso 

In [24]:
#insertamos en mongo
geo_venues['geometry']=geo_venues['geometry'].apply(lambda x:shapely.geometry.mapping(x))
geo_venus_dic = geo_venues.to_dict(orient="records")
geospatial.create_index([("geometry", GEOSPHERE)])
geospatial.insert_many(geo_venus_dic)



<pymongo.results.InsertManyResult at 0x7f94a604ce00>

In [25]:
#coordenadas por ciudad
aus_coor = [-97.75551842126049,30.27125853189173] #longitud / latitud
Ham_coor = [10.002900693677232, 53.55687309076416]
Ncas_coor =  [-1.6162802662146134 ,54.96923892979405]

In [26]:
def geonear(coor,dbcoll):
    """
    recobe unas coordenadas y el nombre de una colección de mongodb
    devuelve una geoquery con la distancia a las coordenadas de cada "venue" de la colección
    """
    near = [{"$geoNear": {
            "near": [coor[0],coor[1]],
            "distanceField": "far",
            "$maxDistance": 1000, 
            "distanceMultiplier" :6371, 
            "spherical": True}}]
    return dbcoll.aggregate(near)

In [27]:
austin_near= geonear(aus_coor,geospatial)
hamburg_near= geonear(Ham_coor,geospatial)
newcastle_near= geonear(Ncas_coor,geospatial)

In [28]:
#creamos dataframes con el resultado de las geoquerys y agrupamos en uno solo para comparar ciudades
aus_list = list(austin_near)
aus_df = pd.DataFrame(aus_list)
aus_df = aus_df[aus_df["city"] == "Austin"]
ham_list = list(hamburg_near)
ham_df = pd.DataFrame(ham_list)
ham_df = ham_df[ham_df["city"] == "Hamburg"]
new_list = list(newcastle_near)
new_df = pd.DataFrame(new_list)
new_df = new_df[new_df["city"] == "Newcastle"]

In [29]:
far = pd.concat([aus_df, ham_df,new_df])
distance = far[["nombre","latitud","longitud","city","category","ponder","far"]]

In [30]:
distance["metros"] = round(distance["far"]*1000,2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  distance["metros"] = round(distance["far"]*1000,2)


In [31]:
distance.sample(2)

Unnamed: 0,nombre,latitud,longitud,city,category,ponder,far,metros
44,Blooie HQ,54.970029,-1.617983,Newcastle,Tech Startup,0.04,0.139748,139.75
64,Healthy Pet,30.26734,-97.752343,Austin,Dobby,0.2,0.531835,531.84


In [32]:
media_cat = distance.groupby(["city","category","ponder"]).agg({"metros":"mean"}).reset_index()
media_cat.metros = round(media_cat.metros,2)

In [33]:
def normalizacion(df,col):
    """
    recibe un dataframe y una columa del mismo
    normaliza los datos de esa columna
    """
    norm = []
    for i,row in df.iterrows():
        mini = df[col].min()
        maxi = df[col].max()
        norm.append((row[col]- mini)/(maxi-mini))
    return norm


In [34]:
media_cat["norm"] = normalizacion(media_cat,"metros")

In [35]:
media_cat['punt'] = round((media_cat.metros * media_cat.norm )/100,2)
media_cat.sample(3)

Unnamed: 0,city,category,ponder,metros,norm,punt
15,Newcastle,Coffee Shop,0.07,603.28,0.636656,3.84
20,Newcastle,Train Station,0.08,372.73,0.24479,0.91
17,Newcastle,Dobby,0.2,484.13,0.434137,2.1


In [36]:
cities_punt = media_cat.groupby("city").agg({"punt":"sum"}).reset_index()

## la ciudad con menos puntos es la que mejor cumple las necesiades de nuestra empresa

In [37]:
cities_punt

Unnamed: 0,city,punt
0,Austin,38.42
1,Hamburg,44.47
2,Newcastle,12.54


In [38]:
best_city = cities_punt.sort_values("punt").head(1)
best_city

Unnamed: 0,city,punt
2,Newcastle,12.54
