In [1]:
import re
import numpy as np
import pandas as pd
import openpyxl

import geopandas as gpd
from shapely.geometry import Point
import geographiclib
from geographiclib.geodesic import Geodesic
import utm
from shapely.ops import nearest_points

import sys
import os
import requests
from bs4 import BeautifulSoup
from lxml import html
import warnings

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import urbanpy as up
import contextily as ctx

import io
import json

pd.set_option('display.max_rows',999)
pd.set_option('display.float_format', lambda x: '%.3f' % x)


warnings.filterwarnings("ignore")
#import unicodedata
#import gzip

from shapely import wkt

import unidecode

In [2]:
import plotly.graph_objects as go
import plotly.express as px
import chart_studio.plotly as py
import plotly

In [3]:
pd.options.display.float_format = '{:,.2f}'.format

## files

In [4]:
dbox = r'c:/Users/Franco/Dropbox/files/clean'
files = os.listdir(dbox)

In [5]:
files

['accesom.xlsx',
 'accesom_2.xlsx',
 'cwpops.xlsx',
 'dbiz.xlsx',
 'dbizi.xlsx',
 'dbiz_extras.xlsx',
 'distances.xlsx',
 'es.xlsx',
 'es_sin_brasil.xlsx',
 'geom_air.xlsx',
 'laboral.xlsx',
 'laboral_2(calculo manual para ciudades sin edu secundaria).xlsx',
 'latlongs.xlsx',
 'metro_areas.xlsx',
 'mlab.xlsx',
 'mlabi.xlsx',
 'prope',
 'sectores.xlsx']

In [6]:
cwpops = pd.read_excel(dbox+'/cwpops.xlsx',index_col=[0,1])

In [7]:
sectores = pd.read_excel(dbox+'/sectores.xlsx',index_col=[0,1])

In [8]:
laboral = pd.read_excel(dbox+'/laboral.xlsx',index_col=[0,1])

In [10]:
distances = pd.read_excel(dbox+'/distances.xlsx',index_col=[0,1])

#### Mergeos

In [844]:
laboral = laboral.reset_index()
laboral['Ciudad'] = laboral['City'].str.strip(' ')

In [845]:
laboral[laboral['Country']=='Uruguay']['Ciudad'] = laboral.loc[laboral['Country']=='Uruguay','Ciudad'].str.title()

In [846]:
laboral = laboral.drop(columns='City').rename(columns={'Ciudad':'City'}).set_index(['Country','City'])

In [847]:
lab = laboral.loc[:,laboral.columns[0]].to_frame().merge(geom.loc[:,['lat','long']],left_index=True,right_index=True,how='inner',indicator=True)

In [848]:
lab['index_lab'] = lab.index

In [849]:
lab = lab.dropna()

In [850]:
cw = cwpops.loc[:,['pops','nearest_loc','index1']].join(geom.loc[:,['lat','long']])

In [851]:
cw = cw.dropna()

In [852]:
cw = gpd.GeoDataFrame(cw, geometry=gpd.points_from_xy(cw.long, cw.lat),crs={'init':'epsg:4326'}) #
lab = gpd.GeoDataFrame(lab, geometry=gpd.points_from_xy(lab.long, lab.lat),crs={'init':'epsg:4326'}) #

In [853]:
cw = cw.to_crs(epsg=3310)

In [854]:
lab = lab.to_crs(epsg=3310)

In [855]:
pts = lab['geometry'].unary_union

In [856]:
cw['lab_cercana'] = cw['geometry'].apply(lambda x: nearest_points(x, pts)[1])

In [857]:
lab['geometry_str'] = lab['geometry'].apply(lambda x: wkt.dumps(x))

In [858]:
cw['lab_cercana_str'] = cw['lab_cercana'].apply(lambda x: wkt.dumps(x))

In [863]:
cwm = cw.reset_index().merge(lab.loc[:,['Salario Horario (PPP 2011)','index_lab','geometry_str']],how='left',left_on='lab_cercana_str',right_on='geometry_str',validate='m:1',indicator=True).set_index(['Country','City'])

In [867]:
cwm = cwm.drop(columns=['lab_cercana_str','geometry_str'])

In [869]:
cwm = cwm.rename(columns={'nearest_loc':'near_loc_old'})

In [905]:
a = cwm.groupby(['Country','City']).count()['pops'].to_frame()

In [906]:
a[a['pops']>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,pops
Country,City,Unnamed: 2_level_1
Argentina,Partidos del GBA,2
Mexico,Pachuca,2


In [915]:
cwm = cwm.reset_index().drop_duplicates(subset=['Country','City']).set_index(['Country','City'])

### Tema Población Metroarea

In [893]:
msa = pd.read_excel('files/aglomerados/msa.xlsx',index_col=[0,1]).reset_index().replace({'Brazil':'Brasil'})

In [894]:
msa['City'] = msa['City'].apply(lambda x: unidecode.unidecode(x))

msa = msa.replace({'City':{'Puebla':'Puebla-Tlaxcala'}})

mydict={'Córdoba':'Gran Cordoba','Mendoza':'Gran Mendoza','Tucumán':'Gran Tucuman-Tafi Viejo','Rosario':'Gran Rosario','Buenos Aires':'Partidos del GBA'}
msa = msa.replace({'City':mydict})

mydict={'Santa Cruz de la Sierra':'Andrés Ibañez/Santa Cruz de la Sierra','Cochabamba':'Cercado/Cochabamba','La Paz':'Pedro Domingo Murillo/La Paz y el Alto'}
msa = msa.replace({'City':mydict})

notlist = ['United States','Canada']

msa_latam = msa.loc[~msa['Country'].isin(notlist)]

msa_latam = msa_latam.sort_values(['Country','City']).set_index(['Country','City'])

msa_latam = msa_latam.rename(columns={'pops':'Población estimada'})

In [895]:
msa_latam.to_excel('files\\clean\\metro_areas.xlsx')

In [887]:
msa_latam.columns

Index(['rank_continent', 'Población estimada', 'year', 'Continent', 'rank'], dtype='object')

### Merge

In [917]:
cwm = cwm.drop(columns=['_merge'])

In [919]:
cwm = cwm.merge(msa_latam.loc[:,'Población estimada'],how='left',left_index=True,right_index=True,indicator=True,validate='1:1')

In [924]:
cwm['Población'] = cwm[['pops','Población estimada']].max(axis=1)

#### CityWage

In [929]:
cwm['City_Wage'] = cwm['Población'] * cwm['Salario Horario (PPP 2011)']

## recalculamos el tema de acceso a mercados

* La idea es obtener la cantidad de horas de automovil que existe entre 1 ciudad y el 25% / 50% / 75% /100% de la masa salarial del pais que se encuentra con un salario por encima de la ciudad objetivo

In [989]:
distances = distances.reset_index()

In [990]:
distances = distances.rename(columns={'City':'City A'})

In [960]:
cwm = cwm.rename(columns={'index1':'index_old'})

In [961]:
cwm['index1'] = cwm.index

Ajustes por ciudades faltantes

In [1000]:
#Mexico
distances = distances.replace({'City A':{'Valle de Mexico':'Mexico'},'City B':{'Valle de Mexico':'Mexico'}})

In [992]:
#Panama
distances = distances[(distances['City A']!='Panama Arraijan')&(distances['City B']!='Panama Arraijan')]

In [997]:
#Paraguay
distances = distances[(distances['City A']!='Caaguazu')&(distances['City B']!='Caaguazu')]

distances = distances[(distances['City A']!='Alto Parana')&(distances['City B']!='Alto Parana')]

distances = distances[(distances['City A']!='Itapua')&(distances['City B']!='Itapua')]

In [1001]:
#Peru
distances = distances[(distances['City A']!='Canete')&(distances['City B']!='Canete')]

In [1002]:
def get_wages_from_cutoff(city_index, cutoff):
    city_wage = cwm.drop_duplicates('index1').loc[city_index]['City_Wage']
    distan = distances[(distances['Country'] == city_index[0])
                            &(distances['City A'] == city_index[1])
                            &(distances['duration_traffic'] <= cutoff)].drop_duplicates('City B')

    nearby_cities = [(x,y) for x,y in distan[['Country','City B']].values.tolist()]
    if len(nearby_cities) == 0:
        return city_wage
    else:
        nearby_wages = cwm.loc[nearby_cities]['City_Wage'].sum()
        return nearby_wages + city_wage

In [1003]:
def get_countryshare_from_cutoff(city_index, cutoff, share_out = True):
    nearwages = get_wages_from_cutoff(city_index, cutoff = cutoff)
    country_wages = cwm.groupby('Country')['City_Wage'].sum().loc[city_index[0]]
    share = nearwages/country_wages
    if share_out == True:
        return share
    else:
        return nearwages

In [1004]:
shares_cutoff = pd.DataFrame(index = cwm.index)

shares_cutoff['index1'] = shares_cutoff.index
for i in range(10):
    shares_cutoff['cutoff_' + str(i)] = shares_cutoff['index1'].apply(
                lambda x: get_countryshare_from_cutoff(x, cutoff = i))

In [1008]:
shares_cutoff.to_excel(direc+'\\accesom_2.xlsx')

In [1007]:
direc

'c:\\Users\\Franco\\Python\\CAF\\files\\clean'

## Ahora hay que cambiar las funciones

In [1002]:
def get_wages_from_cutoff(city_index, cutoff):
    city_wage = cwm.drop_duplicates('index1').loc[city_index]['City_Wage']
    distan = distances[(distances['Country'] == city_index[0])
                            &(distances['City A'] == city_index[1])
                            &(distances['duration_traffic'] <= cutoff)].drop_duplicates('City B')

    nearby_cities = [(x,y) for x,y in distan[['Country','City B']].values.tolist()]
    if len(nearby_cities) == 0:
        return city_wage
    else:
        nearby_wages = cwm.loc[nearby_cities]['City_Wage'].sum()
        return nearby_wages + city_wage

In [1003]:
def get_countryshare_from_cutoff(city_index, cutoff, share_out = True):
    nearwages = get_wages_from_cutoff(city_index, cutoff = cutoff)
    country_wages = cwm.groupby('Country')['City_Wage'].sum().loc[city_index[0]]
    share = nearwages/country_wages
    if share_out == True:
        return share
    else:
        return nearwages

In [1004]:
shares_cutoff = pd.DataFrame(index = cwm.index)

shares_cutoff['index1'] = shares_cutoff.index
for i in range(10):
    shares_cutoff['cutoff_' + str(i)] = shares_cutoff['index1'].apply(
                lambda x: get_countryshare_from_cutoff(x, cutoff = i))

In [1009]:
shares_cutoff.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,index1,cutoff_0,cutoff_1,cutoff_2,cutoff_3,cutoff_4,cutoff_5,cutoff_6,cutoff_7,cutoff_8,cutoff_9
Country,City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Argentina,Bahia Blanca-Cerri,"(Argentina, Bahia Blanca-Cerri)",0.01,0.01,0.01,0.02,0.02,0.02,0.05,0.06,0.69,0.69
Argentina,Ciudad de Buenos Aires,"(Argentina, Ciudad de Buenos Aires)",0.15,0.62,0.62,0.63,0.68,0.72,0.73,0.73,0.79,0.79
Argentina,Comodoro Rivadavia-Rada Tilly,"(Argentina, Comodoro Rivadavia-Rada Tilly)",0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.02
Argentina,Concordia,"(Argentina, Concordia)",0.0,0.0,0.0,0.0,0.02,0.69,0.72,0.73,0.77,0.8
Argentina,Corrientes,"(Argentina, Corrientes)",0.01,0.02,0.02,0.02,0.03,0.03,0.03,0.04,0.07,0.12
