In [13]:
import pandas as pd

In [14]:
# Mapping function used to segment places by line of business
def map_category(categories):
    """Flag whether a place belongs to the food-business segment under study.

    Parameters
    ----------
    categories : iterable of str, str or None
        Category labels of a single place. A bare string is treated as one
        label; ``None`` is treated as "no categories".

    Returns
    -------
    int
        0 when at least one label contains a segment keyword
        (case-insensitive), meaning the row must be kept; 1 otherwise.
    """
    # Keywords must be lowercase because they are matched against the
    # lowercased label (a capitalised 'Pizza' could never match).
    keywords = ('bakery', 'restaurant', 'dessert', 'pastry', 'deliver', 'pizza', 'grill')
    if categories is None:
        return 1
    if isinstance(categories, str):
        # Iterating a bare string would test single characters, never a label.
        categories = [categories]
    for cat in categories:
        label = cat.lower()
        if any(keyword in label for keyword in keywords):
            return 0  # 0 means the place belongs to the segment and is kept
    return 1

In [15]:
def pipeline_ETL(df):
    """Clean one raw places-metadata frame and keep only the segment under study.

    Steps, in order:
      1. Drop duplicated rows (compared on the scalar columns only).
      2. Drop places whose ``state`` is 'Permanently closed'.
      3. Drop rows without ``name``.
      4. For rows without ``category``, impute a one-element category list
         from keywords found in ``name`` (case-insensitive).
      5. Drop rows still lacking ``category``.
      6. Add ``flag`` = ``map_category(category)`` (0 = belongs to the segment).
      7. Keep only rows with ``flag == 0``.
      8. Drop the ``price`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain the columns listed in ``dedup_cols`` below
        plus ``category``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New, filtered frame with the extra ``flag`` column and without ``price``.
    """
    # Rules are evaluated in order and the first match wins, which mirrors the
    # precedence of the original sequential assignments.
    name_rules = (
        (('restaurant', 'pizza', 'grill'), 'restaurant'),
        (('bakery',), 'bakery'),
        (('dessert',), 'desserts'),
        (('pastry',), 'pastry'),
    )

    def _category_from_name(name):
        """Return a one-element category list inferred from ``name``, or None."""
        lowered = str(name).lower()
        for keywords, label in name_rules:
            if any(keyword in lowered for keyword in keywords):
                return [label]
        return None

    dedup_cols = ['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude',
                  'avg_rating', 'num_of_reviews', 'price', 'state', 'url']

    # 1. Drop duplicated rows
    df = df.drop_duplicates(subset=dedup_cols)
    # 2. Drop permanently closed places
    df = df[df['state'] != 'Permanently closed']
    # 3. Drop rows with a null 'name'; the explicit copy makes the column
    #    assignment below operate on an independent frame (no SettingWithCopy).
    df = df.dropna(subset=['name']).copy()

    # 4. Impute the category from the name where it is missing. The column is
    #    rebuilt element by element so each imputed cell holds a real list
    #    (assigning ['restaurant'] through .loc does not store a list per row).
    missing = df['category'].isna()
    df['category'] = pd.Series(
        [
            _category_from_name(name) if is_missing else category
            for name, category, is_missing in zip(df['name'], df['category'], missing)
        ],
        index=df.index,
        dtype=object,
    )

    # 5. Drop rows whose category is still null
    df = df.dropna(subset=['category'])
    # 6. Mark with 0 the rows that belong to the segment under study
    df = df.assign(flag=df['category'].apply(map_category))
    # 7. Drop the rows that do not belong to the segment
    df = df[df['flag'] == 0]
    # 8. Drop the 'price' column
    return df.drop(columns='price')

In [16]:
# Sequential load: run each raw JSON-lines file (1.json .. 11.json) through
# the ETL pipeline, stack the cleaned frames and export them as one parquet file.
dfs = []
for i in range(1, 12):
    filename = f'Datasets/{i}.json'
    # Read and clean in a single step; `df` ends up holding the cleaned frame.
    df = pipeline_ETL(df=pd.read_json(filename, lines=True))
    dfs.append(df)
result = pd.concat(dfs)
result.to_parquet('Datasets/metadata-sitios.parquet')