In [1]:
import re
from collections import Counter

import altair as alt
import geopandas as gpd
import GOSTnets as gn
import networkx as nx
import osmnx as ox
import pandas as pd

In [2]:
# Notebook-wide display configuration: show every column, never wrap wide
# frames onto several lines, and never truncate long cell contents.
# Full option names are used throughout; pandas resolves shorthand such as
# 'max_colwidth' by pattern matching, which can break if a similarly named
# option is added in a later release.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

# Lift Altair's row limit so large frames can be passed to a chart.
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [3]:
def clean_texts(raw_texts):
    """Lowercase the input, replace acute-accented vowels and join the pieces.

    ``raw_texts`` is iterated item by item (a plain string is therefore walked
    character by character). Each item that is exactly of type ``bytes`` is
    decoded as UTF-8 first. Every item is lowercased and has á/é/í/ó/ú replaced
    by the unaccented vowel; other characters (including ñ) are left alone.

    Items that cannot be processed are reported with ``print`` and skipped:
    an ``AttributeError`` (e.g. a non-string item) or a ``UnicodeDecodeError``
    (bytes that are not valid UTF-8).

    Returns the concatenation of all cleaned items, with every newline
    followed by a space collapsed to a bare newline.
    """
    accent_pairs = (("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u"))

    pieces = []
    for text in raw_texts:
        try:
            if type(text) is bytes:
                text = text.decode("utf-8")
            text = text.lower()
            stripped = text
            for accented, plain in accent_pairs:
                stripped = stripped.replace(accented, plain)
        except AttributeError:
            print("ERROR CLEANING")
            print(text)
            continue
        except UnicodeDecodeError:
            print("Unicode Error, Skip")
            continue
        else:
            pieces.append(stripped)

    joined = ''.join(pieces)
    return joined.replace('\n ', '\n')


def set_nombres(csv_file, max_rows=2000):
    """Build a set of lowercase first names from a names CSV.

    Parameters
    ----------
    csv_file : path or file-like
        CSV readable with latin-1 encoding that has a ``nombre`` column.
    max_rows : int or None, default 2000
        Only the first ``max_rows`` rows of the file are considered
        (``None`` uses every row).

    Returns
    -------
    set of str
        The first whitespace-separated word of each ``nombre``, lowercased.

    Rows whose ``nombre`` is missing or blank are ignored instead of raising,
    and leading whitespace no longer produces an empty first name.
    """
    nombres = pd.read_csv(csv_file, encoding='latin-1')

    primeros = (
        nombres['nombre']
        .iloc[:max_rows]      # cap first, so valid files keep the same rows as before
        .dropna()             # missing names have no first word
        .astype(str)
        .str.split()          # whitespace split tolerates leading/double spaces
        .str[0]
        .dropna()             # blank names split to an empty list -> NaN
        .str.lower()
    )
    return set(primeros)

def pertenece_nombres(xx, set_hombres, set_mujeres):
    """Classify a collection of tokens by the name sets it overlaps.

    Returns ``'m'`` when any token is in ``set_hombres``, otherwise ``'f'``
    when any token is in ``set_mujeres``, otherwise ``'x'``. A male match
    takes precedence when both sets are hit.
    """
    tokens = set(xx)
    hits_male = not tokens.isdisjoint(set_hombres)
    hits_female = not tokens.isdisjoint(set_mujeres)

    if hits_male:
        return 'm'
    return 'f' if hits_female else 'x'


In [4]:
# Extra tokens treated as male, added to the CSV-derived male first names.
# The set holds surnames, titles/ranks and spelling variants as written in the
# street data. Repeated entries are harmless because this is a set.
# 'thomas' and 'elias' are separate entries: a missing comma used to fuse
# them into the single string 'thomaselias'.
h_add = {'blasco', 'capitan', 'faucett', 'blasco', 'vasco', 'frans', 'portinari', 'tasso', 'derain', 'vasari', 
        'morisot', 'moreli', 'dominico', 'holbein', 'vittore', 'regoyos', 'ucello', 'tiziano', 'seurat', 'boccioni', 
        'tintoretto', 'cezzane', 'zurbaran', 'donatello', 'degas', 'matisse', 'barsato', 'cezzane', 'giotto', 'murillo',
        'rembrand', 'velasquez', 'rubens', 'conti', 'bellini', 'garcilazo', 'leonidas', 'comandante', 'tagle', 'oblitas',
        'lope', 'calderon', 'euripides', 'verne', 'cacique', 'catari', 'padre', 'dean', 'manrique', 'manrique',
        'huamampoma', 'pio', 'vallejo', 'villar', 'bolognesi', 'willian', 'william', 'kennedy', 'howard', 
        'bosco', 'poma', 'willy', 'tiziano', 'anton', 'charles', 'jhon', 'john', 'thomas', 'billinghurst',
        'gutarra', 'teniente', 'tte.', 'augusto', 'agusto', 'victorelli', 'garcilazo', 'zola', 'francois', 
        'justiniano', 'giacomo', 'fray', 'francois', 'george', 'judas', 'lizandro', 'thomas', 'elias', 'rembrandt', 'sergio',
        'miro', 'hermilio', 'miguiel', 'edouard', 'roosevelt', 'gino', 'virrey', 'mark', 'thouars', 'alessandro',
         'modigliani', 'kandisnki', 'ghandi', 'houssay', 'mondrian', 'bach', 'ritter', 'gluck', 'bunsen', 'figueredo',
        'peña', 'augustin', 'leonidas', 'alipio', 'guadulfo'}

# Extra tokens treated as female, added to the CSV-derived female first names.
# Listed alphabetically, one spelling per entry.
f_add = {
    'arnaldo', 'barrenechea', 'bastidas', 'beatita', 'clorinda', 'edecia',
    'eleonor', 'elvira', 'gallagher', 'ines', 'isolina', 'jhon', 'justa',
    'leonor', 'marian', 'meier', 'melchorita', 'perricholi', 'salaverry',
    'sarita', 'sor', 'thomas',
}

# Second batch of extra tokens treated as male (ranks, Inca names, surnames of
# painters, composers, writers, etc., spelled as they occur in the street data).
# Repeated entries are harmless because this is a set.
# 'louis' and 'benton' are separate entries: a missing comma used to fuse
# them into the bogus token 'louisbenton'.
hombres_adicional = {'tupac', 'manco', 'saenz', 'pancho', 'mariscal', 'general', 'comandante',
                    'coronel', 'bartolome', 'almirante', 'lizardo', 'augustin', 'inca', 'marques',
                    'vicealmirante', 'leonidas', 'capitan', 'garcillazo', 'abascal', 'joze', 'pachacutec',
                    'capac', 'unanue', 'mariategui', 'vallejo', 'bondi', 'giuseppe', 'pachacutec',
                    'yupanqui', 'almirante', 'leonidas', 'harold', 'bilinghurst', 'grau', 'cmdte.', 'cmdte',
                    'leguia', 'bolognesi', 'arenales', 'salaverry', 'lizardo', 'vivaldi', 'francesco', 'james', 
                    'copernico', 'joseph', 'philipp', 'hans', 'pietro', 'rodin', 'pirandello', 'gaddi',
                    'duccio', 'crepi', 'pissano', 'reni', 'renoir', 'augosto', 'toulousse', 'louis', 'benton', 
                    'zola', 'milet', 'velde', 'louis', 'benton','zola','milet','croix','sorolla','monet',
                     'verrochio','cavallini', 'dyck', 'hadraza','bronzino', 'gogh', 'dore', 'crane', 'corot', 
                    'chardin', 'dominguez', 'astete', 'rousseau', 'greco', 'utrillo', 'correggio', 'veronesse',
                    'tiepolo', 'botiger', 'rosetti', 'preising', 'bellman', 'schubert', 'bethoven', 'liszt',
                    'schipper',  'brahams', 'verdi', 'paganini', 'hassinger', 'schrader', 'bosovich', 
                    'mascagni', 'barbieri', 'otto', 'mendelson', 'faure', 'puccini', 'strauss', 'delibes',
                    'stravinsky',  'clodomiro', 'bernstein', 'toscanini', 'rubinstein', 'torroba', 'toselli', 
                    'scarlatti', 'tchaicovski', 'angelico', 'reynolds', 'donzetti', 'ravel', 'puccini', 'simmoni',
                    'matier', 'neuhaus', 'becqier', 'torrigiano', 'frederic', 'pinerolo', 'sipan', 'armardo',
                    'wolgfang', 'paganini', 'beethoven', 'claude', 'kandisky', 'claude', 'giuseppe', 'donatello',
                    'otto', 'gasset', 'carlyle', 'calvino', 'paderewski', 'green', 'russel', 'hillman', 'adam', 
                     'monstequieu', 'spencer', 'hegel', 'diderot', 'engels', 'pascal', 'confusio', 'roentgen',
                     'fisher', 'borges', 'paicioli', 'carpenter', 'fibacci', 'gorky', 'wide', 'cardich', 'goethe',
                    'shakespeare', 'daudet', 'galdos', 'caceres', 'vasco', 'picasso', 'edison', 'goya', 'sinchi',
                     'yahuar', 'ramsey', 'holbein', 'courbet', 'vinci', 'ghiberti', 'sanzio', 'barton', 'angelico',
                     'prado', 'dulio'}

# Second batch of extra tokens treated as female, listed alphabetically.
f_adicional = {
    'amarilis', 'chabuca', 'delmira', 'emily', 'keller',
    'marie', 'ocllo', 'pissarro', 'stowe', 'tita',
}

In [5]:
# Street geometries for Lima, read from a clipped shapefile.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists at exactly this location.
calles_shp = "/Users/ccsuehara/Documents/Lima osm /lima_calles_clipped/lima_calles_clipped.shp"
gdb = gpd.read_file(calles_shp)

In [6]:
# Distinct values of the street `name` column (a missing value, if any, is kept here).
names = list(gdb['name'].unique())

In [7]:
# One row per distinct street name; missing names are dropped so the text
# cleaning below only ever sees real values.
todos_los_nombres = pd.DataFrame({'names': names})
db_names = todos_los_nombres[todos_los_nombres['names'].notna()].copy()

In [8]:
# Tokens removed from the female list before the manual additions are merged in.
remove_mujeres = {'milagros', 'angeles', 'paz',  'abril', 'yuri', 'flor', 'alba'}

# Male reference set: first names from the CSV plus both manual male sets.
n_hombres = (
    set_nombres('/Users/ccsuehara/GH folders/data_vizzes/genero_gabinete/data/hombres.csv')
    | h_add
    | hombres_adicional
)

# Female reference set: CSV first names minus the removed tokens, plus both
# manual female sets.
n_mujeres = (
    (set_nombres('/Users/ccsuehara/GH folders/data_vizzes/genero_gabinete/data/mujeres.csv')
     - remove_mujeres)
    | f_adicional
    | f_add
)

In [9]:
def _tokenizar(nombre):
    """Clean one street name and split it into whitespace-separated tokens."""
    return clean_texts(nombre).split()


# Token list (lowercase, acute accents replaced) for every street name.
db_names['clean_name'] = db_names['names'].apply(_tokenizar)

In [10]:
# Flatten the per-street token lists into one flat list of tokens.
# A comprehension is linear in the number of tokens; summing a Series of lists
# concatenates them pairwise (quadratic) and fails on an empty frame, where
# the sum is the integer 0.
a = [token for tokens in db_names['clean_name'] for token in tokens]

In [11]:
# Frequency of every token across all street names (used to spot common words
# worth adding to the stopword list). `Counter` comes from the notebook's top
# import cell so that all dependencies are declared in one place.
counts = Counter(a)


In [12]:
#sorted(counts.items(), key=lambda x: x[1], reverse=True)

In [13]:
# Street-type words, articles and connectors dropped from the token lists
# before gender matching. Each word appears once ('la' and 'san' used to be
# listed twice); kept as a list so existing uses of `remove` behave the same.
remove = ['calle', 'jiron', 'pasaje', 'de', 'avenida', 'los', 'las', 'la', 'san', 'el', 'prolongacion',
          'y', 'del']

In [14]:
# Token lists with the stopwords in `remove` filtered out (order preserved).
db_names['clean_name2'] = db_names['clean_name'].map(
    lambda tokens: [tok for tok in tokens if tok not in remove]
)

In [15]:
# Gender label per street name: 'm', 'f' or 'x' (no match), with male matches
# taking precedence as implemented in pertenece_nombres.
db_names['genero'] = db_names['clean_name2'].apply(
    pertenece_nombres, args=(n_hombres, n_mujeres)
)

In [16]:
#print(*list(db_names.loc[db_names['genero'] == 'x']['clean_name2']), sep='\n')


In [17]:
#print(*list(db_names.loc[db_names['genero'] == 'f'].names), sep = "\n")


In [18]:
#print(*list(db_names.loc[db_names['genero'] == 'm'].names), sep = "\n")


In [19]:
# Number of distinct street names per gender label ('x' = no name matched).
db_names['genero'].value_counts()

x    10093
m     5142
f      730
Name: genero, dtype: int64

In [20]:
# Attach the per-name gender columns to every street segment via a left join
# on the street name. `indicator=True` adds a `_merge` column; db_names has no
# missing names, so segments without a name come out as 'left_only'.
# NOTE(review): this rebinds `gdb` and adds `_merge`, so re-running the cell on
# the already-merged frame presumably fails or duplicates columns; re-run from
# the cell that loads `gdb` instead. Confirm.
gdb = gdb.merge(db_names, left_on = 'name', right_on = 'names', how = 'left', indicator = True)

In [21]:
# Segments that found no row in db_names get the 'no match' label.
sin_coincidencia = gdb['_merge'] == 'left_only'
gdb.loc[sin_coincidencia, 'genero'] = 'x'

In [22]:
# Number of street segments (rows of gdb) per gender label.
gdb.genero.value_counts()

x    133488
m     14216
f      1970
Name: genero, dtype: int64

In [23]:
# Same per-label segment count as the previous cell (repeated check).
gdb.genero.value_counts()

x    133488
m     14216
f      1970
Name: genero, dtype: int64

In [25]:
# Split the segments into one frame per gender label.
etiqueta = gdb['genero']

gdb_male = gdb[etiqueta == 'm']
#gdb_male.to_csv('./data/gdb_males22.csv')

gdb_female = gdb[etiqueta == 'f']
#gdb_female.to_csv('./data/gdb_females22.csv')

gdb_not = gdb[etiqueta == 'x']
#gdb_not.to_csv('./data/gdb_not22.csv')

In [26]:
# Remove the merge-indicator column from the male subset.
gdb_male = gdb_male.drop(columns=['_merge'])

In [29]:
# Disabled attempt to export the male subset as a shapefile.
# NOTE(review): the first line assigns to `set_crs` instead of calling it,
# which presumably was not the intent; confirm before re-enabling.
#gdb_male.set_crs = 'epsg:4326'
#gdb_male.to_file("./data/gdb_males22.shp")
# does not work because the frame has list-valued columns

In [38]:
# Map of every street segment, coloured by gender label.
# NOTE(review): the chart is built from the full `gdb`, including its
# list-valued columns; confirm Altair can serialise them.
lima_map = (
    alt.Chart(gdb, height=800, width=1000)
    .mark_geoshape()
    .encode(color=alt.Color('genero'))
    .properties(
        title={
            "text": "Lima",
            "subtitle": [
                "Calles según género del nombre",
                "Fuente: Open Street Maps",
            ],
            "color": "Black",
            "subtitleColor": "Black",
        }
    )
)

In [34]:
# Unbind the three per-gender subsets; `gdb` itself is kept.
# NOTE(review): this cell's execution count (34) is lower than the chart cell
# above it (38), so the notebook was run out of order; after this cell the
# subset names no longer exist.
del gdb_female, gdb_not, gdb_male

In [7]:
# Clipping boundary for Lima, read from a GeoJSON file.
# NOTE(review): absolute, machine-specific path.
path = "/Users/ccsuehara/Downloads/lima_boundaries_shp/clipping_boundary.geojson"


clip = gpd.read_file(path)
# Geometry of the first feature in the file; only referenced by the
# commented-out OSMnx download further down.
bound = clip.geometry.iloc[0]


In [9]:
# gDrive = ox.graph_from_polygon(bound, network_type= 'all')
# gDrive = ox.project_graph(gDrive, to_crs='epsg:4326')

In [10]:
# edges = gn.edge_gdf_from_graph(gDrive)

  return _prepare_from_string(" ".join(pjargs))


In [16]:
# lima_roads = edges[['name', 'geometry', 'length', 'maxspeed']]

In [42]:
# gdb.shape

(117798, 20)

In [27]:
# Scratch arithmetic: floor division of 220005 by 10000 (evaluates to 22).
# NOTE(review): nothing else in the notebook uses this value; purpose unclear.
220005 // 10000

22