## ETL para la WebApp en Streamlit

In [3]:
# Se importan las librerías a utilizar

import pandas as pd
import numpy as np

In [41]:
# Se ingesta la base de datos

df_lugares = pd.read_csv('sitios_Fl_COOR.csv')

In [42]:
# Se da una primera mirada a los registros

df_lugares.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,...,closed,street,zip,horario,open,close,City,County Name,latitude1,longitude1
0,Cape Seafood Shack,"Cape Seafood Shack, 603 Del Prado Blvd S, Cape...",0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,,26.640625,-81.9375,Restaurant,5,1,$$,...,1,603 Del Prado Blvd S FL USA,33990,,,,CAPE CORAL,LEE,26.640758,-81.940728
1,Fresh Point Country Buffet,"Fresh Point Country Buffet, 10525 US-19, Pinel...",0x88c2e4e34f1ed783:0x76c5da381c499d79,,27.875,-82.6875,Buffet restaurant,5,2,,...,1,10525 US-19 FL USA,33782,Permanently closed,Permanently closed,Permanently closed,PINELLAS PARK,PINELLAS,27.867496,-82.703016
2,Oneyda's Bakery,"Oneyda's Bakery, 600 Goodlette-Frank Rd #101, ...",0x88dae191ee505917:0x6ba3e25388d3fad4,,26.15625,-81.8125,"Bakery,Deli",4,19,$,...,1,600 Goodlette-Frank Rd #101 FL USA,34102,Permanently closed,Permanently closed,Permanently closed,NAPLES,COLLIER,26.270501,-81.789587
3,Annie's Bake Shoppe,"Annie's Bake Shoppe, 10331 SW 54th St, Miami, ...",0x88d9c754413f6c9d:0x1f93eff5e0ba9c16,,25.71875,-80.375,"Bakery,Gift basket store",4,3,,...,0,10331 SW 54th St FL USA,33165,Open ⋅ Closes 4PM,,4PM,MIAMI,MIAMI-DADE,25.717426,-80.361958
4,Hot Box,"Hot Box, 1-99 Kinkaid St, Pensacola, FL 32507",0x8890b9241e704667:0x3a1e565c17c00993,,30.390625,-87.25,Restaurant,4,5,,...,1,1-99 Kinkaid St FL USA,32507,Permanently closed,Permanently closed,Permanently closed,PENSACOLA,ESCAMBIA,0.0,0.0


In [43]:
# Se detallan más a fondo los tipos  de datos y valores nulos

df_lugares.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4127 entries, 0 to 4126
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              4127 non-null   object 
 1   address           4127 non-null   object 
 2   gmap_id           4127 non-null   object 
 3   description       2393 non-null   object 
 4   latitude          4127 non-null   float64
 5   longitude         4127 non-null   float64
 6   category          4127 non-null   object 
 7   avg_rating        4127 non-null   int64  
 8   num_of_reviews    4127 non-null   int64  
 9   price             2693 non-null   object 
 10  hours             4127 non-null   float64
 11  MISC              4127 non-null   float64
 12  relative_results  3299 non-null   object 
 13  url               4127 non-null   object 
 14  restaurant        4127 non-null   int64  
 15  bakery            4127 non-null   int64  
 16  dessert           4127 non-null   int64  


In [44]:
# En los registros de las columnas latitude1 y longitude1 (que se generaron a partir de las direcciones),
# donde hay cero ('0), rellenar con la columna que contiene el dato de forma más reducida

for i in range (0, df_lugares.shape[0]):
    if df_lugares['latitude1'][i] == 0:
        df_lugares['latitude1'][i] = df_lugares['latitude'][i]

for i in range (0, df_lugares.shape[0]):
    if df_lugares['longitude1'][i] == 0:
        df_lugares['longitude1'][i] = df_lugares['longitude'][i]

# Se borran las columnas iniciales

df_lugares.drop(columns=['latitude', 'longitude'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lugares['latitude1'][i] = df_lugares['latitude'][i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lugares['longitude1'][i] = df_lugares['longitude'][i]


In [45]:
# Se borran los registros duplicados

df_lugares.drop_duplicates(inplace=True)

In [46]:
# Se renombran algunas columnas para la presentación en la WebApp

df_lugares.rename (columns={'name':'Nombre', 'num_of_reviews':'Reviews', 'zip':'Zip', 'avg_rating':'Rating', 'latitude1':'latitude', 'longitude1':'longitude', 'closed':'Condición_Establecimiento'}, inplace=True)

In [47]:
# Se rellenan los valores nulos con 'Sin Dato'

df_lugares['description'].replace(np.nan, 'Sin Dato', inplace=True)
df_lugares['horario'].replace(np.nan, 'Sin Dato', inplace=True)
df_lugares['price'].replace(np.nan, 'Sin Dato', inplace=True)
df_lugares['open'].replace(np.nan, 'Sin Dato', inplace=True)
df_lugares['close'].replace(np.nan, 'Sin Dato', inplace=True)

In [48]:
# En la columna 'Condición_Establecimiento', se cambian los valores: 1 = Permanentemente Cerrado, 2 = Abierto

df_lugares['Condición_Establecimiento'].replace(1, 'Inactivo', inplace=True)
df_lugares['Condición_Establecimiento'].replace(0, 'Activo', inplace=True)

In [49]:
# Se exporta como un nuevo archivo .csv para ser utilizado en el Scrip de la WebApp 'iTakeYou'

df_lugares.to_csv('sitios_FL_New.csv')

In [34]:
#avg_breast_cancer_df = breast_cancer_df.groupby("target").mean()



In [35]:
df_condicion_sitios

Condición_Establecimiento
Abierto                    2595
Permanentemente Cerrado    1466
Name: City, dtype: int64

# Pruebas

In [2]:
import pandas as pd

### City Population

In [50]:
df_poblacion = pd.read_csv ('city_population.csv')

In [51]:
df_poblacion

Unnamed: 0,County Name,City,poblacion
0,ALACHUA,HIGH SPRINGS,6215.0
1,ALACHUA,ARCHER,1140.0
2,ALACHUA,HAWTHORNE,1478.0
3,ALACHUA,GAINESVILLE,141085.0
4,ALACHUA,WALDO,846.0
...,...,...,...
208,WAKULLA,SOPCHOPPY,426.0
209,WALTON,DEFUNIAK SPRINGS,5919.0
210,WALTON,FREEPORT,5861.0
211,WASHINGTON,CHIPLEY,3660.0


In [57]:
df_poblacion['City'].value_counts()

KISSIMMEE       2
SARASOTA        2
WEWAHITCHKA     2
JACKSONVILLE    2
ORLANDO         2
               ..
VERO BEACH      1
MARIANNA        1
GRACEVILLE      1
MONTICELLO      1
VERNON          1
Name: City, Length: 206, dtype: int64

In [55]:
df_lugares['City'].value_counts()

MIAMI              367
ORLANDO            261
TAMPA              203
FORT LAUDERDALE    180
JACKSONVILLE       169
                  ... 
LAKE MONROE          1
FORT WHITE           1
HORSESHOE BEACH      1
JAY                  1
FLORAHOME            1
Name: City, Length: 303, dtype: int64

In [57]:
ciudad = 'MIAMI'
poblacion = df_poblacion[df_poblacion['City'] == ciudad]
if poblacion.shape[0] != 0:
    poblacion = poblacion.iloc[0][2]

In [58]:
df_ciudad = df_lugares [df_lugares ['City'] == ciudad]

categoria = 'Restaurant'
df_categoria = df_ciudad [df_ciudad ['category'] == categoria]

In [59]:
# Cantidad negocios abiertos y cerrados por categoría

df_categoria['closed'].value_counts()

1    19
0    15
Name: closed, dtype: int64

In [None]:
df_condicion_sitios = df_lugares.groupby('Condición_Establecimiento').City.count()

In [1]:
df_sitios_ciudad_categoria = df_ciudad.groupby('category').category.count()

NameError: name 'df_ciudad' is not defined

------------------------------------------------------------------------------------------------

### Review Fl

In [60]:
df_review_Fl = pd.read_parquet ('review_FL.parquet')

### Sitios Fl 18-02

In [63]:
df_sitios_Fl_18_02 = pd.read_csv ('sitios_FL_18_02.csv')