In [1]:
import re
import numpy as np
import pandas as pd
import openpyxl

import geopandas as gpd
from shapely.geometry import Point
import geographiclib
from geographiclib.geodesic import Geodesic
import utm
from shapely.ops import nearest_points

import sys
import os
import requests
from bs4 import BeautifulSoup
from lxml import html
import warnings

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import urbanpy as up
import contextily as ctx

import io
import json

# Allow up to 999 rows to be rendered when a DataFrame is displayed.
pd.set_option('display.max_rows',999)
# Render floats with three decimals.
# NOTE(review): display.float_format is set again in a later cell, which
# replaces this formatter for every output produced after that point.
pd.set_option('display.float_format', lambda x: '%.3f' % x)


# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/future warnings about the
# indexing and string-method calls used further down; consider narrowing it.
warnings.filterwarnings("ignore")
#import unicodedata
#import gzip

import unidecode 

In [2]:
import plotly.graph_objects as go
import plotly.express as px
import chart_studio.plotly as py
import plotly

In [3]:
# Display floats with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

## files

In [4]:
# Folder holding the cleaned input workbooks. It defaults to the original
# author's Dropbox location; set the DBOX_DIR environment variable to run the
# notebook on another machine without editing this cell.
dbox = os.environ.get('DBOX_DIR', r'c:/Users/Franco/Dropbox/files/clean')
# Names of the entries found in that folder (raises if the folder is missing).
files = os.listdir(dbox)

In [5]:
# Workbook providing the 'pops' column used later on; its first two
# spreadsheet columns become the row MultiIndex.
cw = pd.read_excel(
    io=dbox + '/cwpops.xlsx',
    index_col=[0, 1],
)

In [6]:
# Sector workbook; its first two spreadsheet columns become the row MultiIndex.
sectores = pd.read_excel(
    io=dbox + '/sectores.xlsx',
    index_col=[0, 1],
)

In [7]:
# Labour workbook (source of 'Salario Horario (PPP 2011)'); its first two
# spreadsheet columns become the row MultiIndex.
laboral = pd.read_excel(
    io=dbox + '/laboral_final(ajustes).xlsx',
    index_col=[0, 1],
)

In [8]:
# City-to-city distance workbook; its first two spreadsheet columns become
# the row MultiIndex.
distances = pd.read_excel(
    io=dbox + '/distances.xlsx',
    index_col=[0, 1],
)

### Base Distances

In [9]:
# Collect the hourly wage of every city available in `laboral`.

# Rows whose City label contains 'total' are removed before keeping the
# single wage column.
laboral_flat = laboral.reset_index()
is_total_row = laboral_flat['City'].str.contains('total')
drop = laboral_flat.loc[is_total_row].set_index(['Country', 'City']).index
lab = laboral.drop(index=drop)[['Salario Horario (PPP 2011)']]

In [10]:
# Move the index levels back into ordinary columns so the 'City' labels can
# be edited as a string column.
lab = lab.reset_index(drop=False)

In [11]:
# Remove every parenthesised fragment from the city label and trim the
# surrounding whitespace. `regex=True` is required: since pandas 2.0 the
# default for Series.str.replace is a literal match, which would leave the
# pattern (and therefore the parentheses) untouched.
lab['City2'] = lab['City'].str.replace(r'\([^)]*\)', '', regex=True).str.strip()

In [12]:
# Replace the original city labels with the cleaned ones and index the wages
# by (Country, City) again.
lab = (
    lab.drop(columns='City')
    .rename(columns={'City2': 'City'})
    .set_index(['Country', 'City'])
)

In [13]:
# Build the index of cities for which a wage (own or nearest city's) is wanted.

# One row per (Country, City) taken from the complete rows of `distances`.
city_keys = (
    distances.reset_index()
    .dropna()
    .drop_duplicates(['Country', 'City'])
    .loc[:, ['Country', 'City']]
    .set_index(['Country', 'City'])
)
# Keep only cities that also appear in `cw`, and add placeholder columns that
# a later loop fills in.
indices = (
    city_keys.join(cw[['pops']], how='inner')
    .assign(**{'Salario Horario (PPP 2011)': np.nan, 'Nearest City': ""})
    .rename(columns={'pops': 'Población (millones)'})
)

In [14]:
# Build the distance table enriched with the wage of each destination (City B).

# NOTE: this overwrites the loaded `distances` with its complete rows only.
distances = distances.dropna()

# Wages keyed by (Country, City B) so they can be joined on the destination.
wages_by_destination = (
    lab.reset_index()
    .rename(columns={'City': 'City B'})
    .set_index(['Country', 'City B'])
)
distances_i = (
    distances.reset_index()
    .set_index(['Country', 'City B'])
    .join(wages_by_destination, how='inner')
    .reset_index()
    .sort_values(['Country', 'City', 'City B'])
    .set_index(['Country', 'City'])
)
# For every origin keep only the destination with the smallest
# duration_traffic among those that have a wage.
distances_i = (
    distances_i.reset_index()
    .sort_values(['Country', 'City', 'duration_traffic'])
    .drop_duplicates(subset=['Country', 'City'])
)

In [15]:
# Author's note: distances_i should end up covering the cities without an own
# wage, so that together with `lab` all 422 cities have an own or nearest
# wage. This cell only drops unused columns, renames the destination column
# and re-indexes by (Country, City); the choice between own and nearest wage
# is made in the loop that follows.

unused_columns = ['Point A', 'Point B', 'Distance', 'Duration',
                  'Duration in traffic', 'parsed_duration', 'dit_elementos']
distances_i = (
    distances_i.drop(columns=unused_columns)
    .rename(columns={'City B': 'nearest_loc'})
    .set_index(['Country', 'City'])
)

In [16]:
# Fill the wage and the reference city for every row of `indices`: use the
# city's own wage when `lab` has it, otherwise the wage of the closest city
# recorded in `distances_i`.
wage_col = 'Salario Horario (PPP 2011)'
for city_key in indices.index:
    if city_key not in lab.index:
        wage = distances_i.loc[city_key, wage_col]
        reference_city = distances_i.loc[city_key, 'nearest_loc']
    else:
        wage = lab.loc[city_key, wage_col]
        reference_city = city_key[1]  # second element of the (Country, City) key
    indices.at[city_key, wage_col] = wage
    indices.at[city_key, 'Nearest City'] = reference_city

In [17]:
# Wage bill = population x hourly wage.
indices['Masa Salarial'] = indices['Población (millones)'].mul(
    indices['Salario Horario (PPP 2011)']
)

#### Ahora debemos crear la primera función, que calcule las horas necesarias para alcanzar el 10%, 20%, …, 100% de la masa salarial

In [18]:
# Keep only the last two columns of `distances` and expose the index levels
# as columns.
# NOTE(review): later cells expect these two columns to be 'City B' and
# 'duration_traffic' - confirm against the workbook layout.
dist = distances.iloc[:, -2:].reset_index()

In [19]:
# Remove repeated (Country, City, City B) pairs, keeping the first occurrence,
# then index by origin city.
dist = (
    dist.drop_duplicates(subset=['Country', 'City', 'City B'])
    .set_index(['Country', 'City'])
)

In [20]:
# Express each city's wage bill as a share of its country's total wage bill.
country_totals = (
    indices['Masa Salarial']
    .groupby(level='Country')
    .sum()
    .rename('Masa Salarial (Total País)')
    .to_frame()
)
indices = indices.join(country_totals)
indices['Masa Salarial (% Total País)'] = indices['Masa Salarial'].div(
    indices['Masa Salarial (Total País)']
)

In [21]:
# Flatten both tables so they can be merged on ordinary columns.
indices, dist = indices.reset_index(), dist.reset_index()

In [22]:
# Build `shares`: one row per origin/destination pair carrying the travel
# duration plus the destination's wage-bill share and wage.
destination_info = indices[
    ['Country', 'City', 'Masa Salarial (% Total País)', 'Salario Horario (PPP 2011)']
]
merged_pairs = dist.merge(
    destination_info,
    how='left',
    left_on=['Country', 'City B'],
    right_on=['Country', 'City'],
    indicator=True,
    validate='m:1',
)
# Rows with any missing value (including destinations without a match in
# `indices`) are discarded.
merged_pairs = merged_pairs.dropna()
shares = (
    merged_pairs.drop(columns=['_merge', 'City_y'])
    .rename(columns={'City_x': 'City'})
    .set_index(['Country', 'City'])
)
# Kept from the original cell; 'pops' was already renamed when `indices` was
# created, so this has no effect on the columns at this point.
indices = indices.rename(columns={'pops': 'Población (millones)'})

In [23]:
# Index by (Country, City) again so rows can be addressed by city key.
indices = indices.set_index(keys=['Country', 'City'])

In [24]:
def horas(index,tresh):
    """Travel duration needed to reach a cumulative wage-bill share of ``tresh``.

    The destinations listed for ``index`` in the global ``shares`` table are
    visited in order of increasing ``duration_traffic`` while their
    'Masa Salarial (% Total País)' is accumulated. The origin's own share
    (read from the global ``indices`` table) counts towards the target, so the
    destinations only need to add up to ``tresh`` minus that share.

    Parameters
    ----------
    index : tuple
        (Country, City) key that must exist in both ``indices`` and ``shares``.
    tresh : float
        Target cumulative share, e.g. ``0.5``.

    Returns
    -------
    scalar
        ``duration_traffic`` of the first destination at which the target is
        reached. If the target is never reached, the largest
        ``duration_traffic`` among the destinations.

    Raises
    ------
    KeyError
        If ``index`` is missing from ``indices`` or ``shares``.

    Notes
    -----
    A later cell of the notebook defines ``horas`` again, replacing this
    version in the namespace.
    """
    share_col = 'Masa Salarial (% Total País)'
    own_share = indices.loc[index, share_col]

    # Wrapping the key in a list always yields a DataFrame, even when a single
    # row matches. The stable sort keeps tied durations in their stored order.
    reachable = shares.loc[[index]].sort_values('duration_traffic', kind='mergesort')

    # Kept as a separate Series so no column is written into a slice of `shares`.
    cumulative_share = reachable[share_col].cumsum()
    reached = reachable.loc[
        (cumulative_share >= (tresh - own_share)).to_numpy(), 'duration_traffic'
    ]

    if reached.empty:
        # Target never met: fall back to the farthest destination.
        return reachable['duration_traffic'].max()
    # Explicit positional access; `reached[0]` depended on the deprecated
    # positional fallback of Series.__getitem__.
    return reached.iloc[0]

In [25]:
# One column per wage-bill target: the value `horas` returns for each city.
# The "100%" column is evaluated at 0.99.
ms_targets = {10: 0.1, 20: 0.2, 30: 0.3, 40: 0.4, 50: 0.5,
              60: 0.6, 70: 0.7, 80: 0.8, 90: 0.9, 100: 0.99}

# Compute every column first, then attach them in ascending order.
ms_results = {
    f'Horas al {pct}% de MS': indices.apply(lambda x, t=target: horas(x.name, t), axis=1)
    for pct, target in ms_targets.items()
}
for column_name, values in ms_results.items():
    indices[column_name] = values

In [26]:
def horas(index,tresh):
    """Travel duration to reach a wage-bill share of ``tresh`` via higher-wage cities.

    Same accumulation as the earlier ``horas`` definition in this notebook
    (which this one replaces in the namespace), but only destinations whose
    'Salario Horario (PPP 2011)' is at least the origin's wage are considered.
    The origin's own 'Masa Salarial (% Total País)' counts towards the target.

    Parameters
    ----------
    index : tuple
        (Country, City) key that must exist in both ``indices`` and ``shares``.
    tresh : float
        Target cumulative share, e.g. ``0.5``.

    Returns
    -------
    scalar
        ``duration_traffic`` of the first qualifying destination at which the
        target is reached. If the target is never reached, the largest
        ``duration_traffic`` among the qualifying destinations; this is NaN
        when no destination has a wage at least as high as the origin's.

    Raises
    ------
    KeyError
        If ``index`` is missing from ``indices`` or ``shares``.
    """
    share_col = 'Masa Salarial (% Total País)'
    wage_col = 'Salario Horario (PPP 2011)'
    own_share = indices.loc[index, share_col]
    wage = indices.loc[index, wage_col]

    # Wrapping the key in a list always yields a DataFrame, even when a single
    # row matches.
    candidates = shares.loc[[index]]
    candidates = candidates.loc[(candidates[wage_col] >= wage).to_numpy()]
    # The stable sort keeps tied durations in their stored order.
    candidates = candidates.sort_values('duration_traffic', kind='mergesort')

    # Kept as a separate Series so no column is written into a slice of `shares`.
    cumulative_share = candidates[share_col].cumsum()
    reached = candidates.loc[
        (cumulative_share >= (tresh - own_share)).to_numpy(), 'duration_traffic'
    ]

    if reached.empty:
        # Target never met (or no qualifying destination at all).
        return candidates['duration_traffic'].max()
    # Explicit positional access; `reached[0]` depended on the deprecated
    # positional fallback of Series.__getitem__.
    return reached.iloc[0]

In [27]:
# One column per wage-bill target, using the `horas` definition currently in
# scope (the higher-wage variant). The "100%" column is evaluated at 0.99.
ms_targets_higher = {10: 0.1, 20: 0.2, 30: 0.3, 40: 0.4, 50: 0.5,
                     60: 0.6, 70: 0.7, 80: 0.8, 90: 0.9, 100: 0.99}

# Compute every column first, then attach them in ascending order.
ms_results_higher = {
    f'Horas al {pct}% de MS (Salarios mayores)': indices.apply(
        lambda x, t=target: horas(x.name, t), axis=1
    )
    for pct, target in ms_targets_higher.items()
}
for column_name, values in ms_results_higher.items():
    indices[column_name] = values

In [28]:
# Persist the full table (all columns computed above) next to the inputs.
indices.to_excel(excel_writer=dbox + '/horas_msII.xlsx')

In [30]:
# Preview: first ten cities, last ten columns.
indices.head(10).iloc[:, -10:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Horas al 10% de MS (Salarios mayores),Horas al 20% de MS (Salarios mayores),Horas al 30% de MS (Salarios mayores),Horas al 40% de MS (Salarios mayores),Horas al 50% de MS (Salarios mayores),Horas al 60% de MS (Salarios mayores),Horas al 70% de MS (Salarios mayores),Horas al 80% de MS (Salarios mayores),Horas al 90% de MS (Salarios mayores),Horas al 100% de MS (Salarios mayores)
Country,City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Argentina,Bahia Blanca-Cerri,7.17,28.67,28.67,28.67,28.67,28.67,28.67,28.67,28.67,28.67
Argentina,Partidos del GBA,0.43,0.43,0.43,0.43,0.43,0.43,5.0,13.5,13.5,13.5
Argentina,Gran Catamarca,7.67,10.17,12.33,12.5,12.5,12.5,12.5,12.5,16.67,41.67
Argentina,Ciudad de Buenos Aires,,,,,,,,,,
Argentina,Comodoro Rivadavia-Rada Tilly,20.67,20.67,20.67,20.67,20.67,20.67,20.67,20.67,20.67,20.67
Argentina,Concordia,4.5,4.67,4.67,4.67,4.67,4.67,5.67,12.0,17.67,40.17
Argentina,Gran Cordoba,4.33,7.0,7.17,7.17,7.17,7.17,7.17,11.17,36.5,36.5
Argentina,Corrientes,9.83,10.0,10.0,10.0,10.0,10.17,10.17,14.67,30.0,45.67
Argentina,Formosa,11.67,12.67,12.67,12.67,12.67,12.67,12.83,13.17,20.17,47.82
Argentina,Jujuy-Palpala,10.83,15.0,18.0,18.0,18.0,18.0,18.0,18.0,21.17,47.33
