In [30]:
import pandas as pd
import time
import polars as pl
from geopy.geocoders import Nominatim

## Abriendo con polars

In [2]:
cur_time = time.time()
pl_df = pl.read_parquet("parquet/sitios1.parquet")
print(f"Tiempo en leer usando polars = {round((time.time()-cur_time),2)} segundos")
pl_df.shape

Tiempo en leer usando polars = 4.68 segundos


(3025011, 15)

## Elimino duplicados en name

In [3]:
pl_df = pl_df.filter(pl_df.select(pl.col('name')).is_duplicated())
pl_df.shape

(870913, 15)

str = 'Porter Pharmacy, 129 N Second St, Cochran, GA 31014'
str1 = str[str.index(',')+2:]
str1[:str1.index(',')]

## Aqui empiezo la limpieza

## Establecimientos Gastronomicos de la Florida

In [4]:
def busca_zip(x):
    if x.strip()[len(x)-15:].upper() == ',UNITED STATES':
        x = x.strip()[:len(x)-15]
    return x.strip()[len(x)-5:]  

def busca_state(x):
    if x.strip()[len(x)-15:].upper() == ',UNITED STATES':
        x = x.strip()[:len(x)-15]
    x = x.strip()[:len(x)-6]
    return x.strip()[len(x)-2:] 

def busca_street(x):
    try:
        x1 = x[x.index(',')+2:]
        return x1[:x1.index(',')]+' FL USA'
    except:
        return x

In [5]:
cur_time = time.time()
pl_df1 = (
    pl_df.lazy()
    # Solo restaurant, bakery, desserts, pastry 
    .filter(pl.col('category').apply(lambda x: True if ('RESTAURANT' in ''.join(x).upper() or
                                                        'BAKERY' in ''.join(x).upper() or
                                                        'DESSERTS' in ''.join(x).upper() or
                                                        'PASTRY' in ''.join(x).upper() 
                                                       ) else False))
    # Creo columnas nuevas para ML
    .with_columns(pl.col('category').apply(lambda x: ''.join(x)))
    .with_columns(pl.col('category').apply(lambda x: 1 if 'RESTAURANT' in ''.join(x).upper() else 0).alias('restaurant'))
    .with_columns(pl.col('category').apply(lambda x: 1 if 'BAKERY' in ''.join(x).upper() else 0).alias('bakery'))
    .with_columns(pl.col('category').apply(lambda x: 1 if 'DESSERTS' in ''.join(x).upper() else 0).alias('dessert'))
    .with_columns(pl.col('category').apply(lambda x: 1 if 'PASTRY' in ''.join(x).upper() else 0).alias('pastry')) 
    .with_columns(pl.col('state').apply(lambda x: 1 if 'PERMANENTLY CLOSED' in x.upper() else 0).alias('closed')) 
    .with_columns(pl.col('address').apply(busca_street).alias('street'))
    # Solo los del estado de la Florida
    .with_columns(pl.col('address').apply(busca_zip).alias('zip'))
    .with_columns(pl.col('address').apply(busca_state).alias('st'))
    # Creo una columna que sea una copia de state para trabajarla y sacarle los horarios
    .with_columns(pl.col('state').alias('horario'))
    .filter(pl.col('st')=='FL')
    # Eliminacion de columnas:        
    #.select(pl.exclude(['gmap_id','description','price','hours','MISC','relative_results','state']))  
    .collect()
)    
print(f"{pl_df1.shape[0]} records que quedan, tiempo usado = {round((time.time()-cur_time),2)} segundos")

4127 records que quedan, tiempo usado = 9.96 segundos


### Verifique varios casos de valores NaN en la columna closed y todos estaban 'Permanently closed'
### asi que todos los valores Nan de la columna closed los pongo en 1

In [6]:
pl_df2 = pl_df1.with_columns(pl.col('closed')).fill_null(1)

In [7]:
pl_df2 = pl_df2.with_columns(pl.col("zip").cast(pl.Int64))

### Creo las columnas open y close que van a tener los horarios de apertura y cierre, 
### la informacion sale de la columna state

In [8]:
def busca_close(x):
    if x == 'Permanently closed': return x
    if x[:13] == 'Open ⋅ Closes': return x[14:]
    if x[:8] == 'Closed ⋅': return None
    if x == 'Open 24 hours': return x
    if x[:12] == 'Opens soon ⋅': return None
    if x[:14] == 'Closes soon ⋅ ': return x[14:][:7]
    if x == 'Open now': return None
    if x == 'Temporarily closed': return x
    if x == 'Closed': return x
    return 'nada'

In [9]:
def busca_open(x):
    if x == 'Permanently closed': return x

    if x[:13] == 'Closes soon ⋅':
        try:
            return texto[14:][6:][texto[14:][6:].index('Opens')+6:]
        except:
            try:
                return texto[14:][6:][texto[14:][6:].index('Reopens')+8:]
            except:
                return None
    if x[:12] == 'Opens soon ⋅': return x[13:]
    if x[:8] == 'Open now': return None
    if x[:6] == 'Open ⋅': return None
    if x[:14] == 'Closed ⋅ Opens': return x[15:] 
    if x == 'Open 24 hours': return x 
    if x == 'Temporarily closed': return x
    if x == 'Closed': return x 
    return 'nada'

In [10]:
def rectifico_close(x):
    if x[:5] == '1AM ⋅': return '1AM'
    if x[:5] == '2AM ⋅': return '2AM'
    if x[:5] == '2PM ⋅': return '2PM'
    if x[:5] == '3PM ⋅': return '3PM'
    if x[:5] == '4PM ⋅': return '4PM'
    if x[:5] == '5PM ⋅': return '5PM'
    if x[:5] == '6PM ⋅': return '6PM'
    if x[:5] == '7PM ⋅': return '7PM'
    if x[:5] == '8PM ⋅': return '8PM'
    if x[:5] == '8AM ⋅': return '8AM'
    if x[:5] == '9PM ⋅': return '9PM'
    if x[:6] == '10PM ⋅': return '10PM'
    if x[:6] == '11AM ⋅': return '11AM'
    if x[:6] == '11PM ⋅': return '11PM'
    if x[:6] == '12AM ⋅': return '12AM'
    return x

In [11]:
pl_df3 = pl_df2.with_columns([
    pl.col('state').apply(busca_open).alias('open'),
    pl.col('state').apply(busca_close).alias('close')
])
pl_df3 = pl_df3.with_columns([
   pl.col('close').apply(rectifico_close)
])
pd_df3 = pl_df3.to_pandas()

# Pruebas
texto = pl_df3[1739,'close']
#texto[14:].index('⋅')
texto[:5]

In [12]:
pl_df3 = pl_df3.select(pl.exclude(['state','st']))

## Agrego columnas de county y city que pueden servir al grupo de Analitics y el de Machine Learning

In [13]:
pl_zip = pl.read_csv('csv/zip_florida.csv')
pl_zip.head()

zip,Zipcode name,City,State,County Name
i64,str,str,str,str
32003,"""""FLEMING ISLAN...","""FLEMING ISLAND...","""FL""","""CLAY"""
32004,"""""PONTE VEDRA B...","""PONTE VEDRA BE...","""FL""","""SAINT JOHNS"""
32006,"""""FLEMING ISLAN...","""FLEMING ISLAND...","""FL""","""CLAY"""
32007,"""""BOSTWICK. FL""...","""BOSTWICK""","""FL""","""PUTNAM"""
32008,"""""BRANFORD. FL""...","""BRANFORD""","""FL""","""SUWANNEE"""


In [14]:
pl_df3 = pl_df3.join(pl_zip.select(['zip','City','County Name']), on='zip')

## Arreglar latitud y longitud
### Este es el formato que tengo    'Cape Seafood Shack, 603 Del Prado Blvd S, Cape Coral, FL 33990'
### Este es el formato que necesito   '603 Del Prado Blvd S 33990 FL USA'

 ### Creo dos nuevas columnas latitude1 y longitude1

In [15]:
# '603 Del Prado Blvd S 33990 FL USA'
def busca_latitude(x):
    try:
        location = geolocator.geocode(x)
        return location.latitude
    except:
        return 0

def busca_longitude(x):
    try:
        location = geolocator.geocode(x)
        return location.longitude
    except:
        return 0

In [31]:
geolocator = Nominatim(user_agent='hola')
type(geolocator)

geopy.geocoders.nominatim.Nominatim

In [19]:
cur_time = time.time()
pl_df3 = (
    pl_df3.lazy()
    .with_columns([
        pl.col('street').apply(busca_latitude).alias('latitude1'),
        pl.col('street').apply(busca_longitude).alias('longitude1')    
        #pl.col('latitude').apply(lambda x: busca_latitude if x == 0 else x),
        #pl.col('longitude').apply(lambda x: busca_longitude if x == 0 else x)
    ])
    .collect()
)
print(f"tiempo usado = {round((time.time()-cur_time),2)/60} minutos")

tiempo usado = 107.56833333333334 minutos


In [20]:
pd_df3 = pl_df3.to_pandas()

location = geolocator.geocode(pl_df3[3,'street'])
print(location.latitude,location.longitude)

#'Cape Seafood Shack, 603 Del Prado Blvd S, Cape Coral, FL 33990'
#location = geolocator.geocode('603 Del Prado Blvd S 33990 FL USA')
#print((location.latitude, location.longitude))

## Salvo Datos Establecimientos Gastronomicos de la Florida 

In [23]:
pl_df3.write_parquet("parquet/sitios_FL.parquet")
pl_df3.write_csv("csv/sitios_FL.csv")
#pl_df3.write_parquet("PG_Google_Maps/Ornaldo/Machine Learning/sitios_FL.parquet")
pl_df3.write_csv("PG_Google_Maps/Ornaldo/sitios_FL.csv")

In [None]:
# Unir columnas
# pl.map(["iyear", "imonth", "iday"], lambda s: s[0] + "-" + s[1] + "-" + s[2]).alias("date2"),