# ETL inicial para normalizar archivos y convertir a formatos óptimos

### 1.0 Instalar e importar librerías

In [None]:
!pip install pandas
!pip install ijson
!pip install pickle5



Collecting ijson
  Downloading ijson-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.3.0
Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pickle5
  Building wheel for pickle5 (setup.py) ... [?25l[?25hdone
  Created wheel for pickle5: filename=pickle5-0.0.11-cp310-cp310-linux_x86_64.whl size=255319 sha256=0d612c6b7d469b2e65054bd1f45a0ff2d749771d013f2706d9234ed38d415fb8
  Stored in directory: /root/.cache/pip/wheels/7d/14/ef/4aab19d27fa8e58772be5c71c16add0426acf9e1f64353235c
Successfully built pickle5
Installing collected packages: pick

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import os
import glob
import json
import pandas as pd
import pickle

### 1.1 Cargar, combinar y exportar a parquet los datos de reviews-estados/california

In [None]:


# Root folder (on the mounted Drive) that holds the California review JSON files
main_folder = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/reviews-estados/review-California'

# Recursive pattern: every *.json file in main_folder or in any of its subfolders
json_pattern = os.path.join(main_folder, '**', '*.json')
all_json_files = glob.glob(json_pattern, recursive=True)

# Accumulator for the per-file DataFrames
dataframes = list()

# Helper that reads one JSON file and returns its records
def read_json_file(file_path):
    """Read a JSON or JSON Lines file and return its records as a list.

    The whole file is first parsed as a single JSON document; a document that
    is not a list is wrapped in a one-element list. If that parse fails, the
    content is parsed line by line (JSON Lines): blank lines are ignored and
    malformed lines are skipped, with the number of skipped lines reported.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with the parsed records. It is empty when the file cannot be
        read or contains no valid JSON.
    """
    data = []
    try:
        # Explicit UTF-8 so decoding does not depend on the platform's locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        try:
            # First attempt: the file is one JSON document (array or object).
            parsed = json.loads(content)
        except json.JSONDecodeError:
            # Fallback: one JSON document per line.
            skipped = 0
            for line in content.splitlines():
                if not line.strip():
                    continue  # blank lines are not errors
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError:
                    skipped += 1
            if skipped:
                # Make dropped input visible instead of discarding it silently.
                print(f"Skipped {skipped} malformed line(s) in {file_path}")
        else:
            data = parsed if isinstance(parsed, list) else [parsed]
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the whole
        # load, so the error is reported and whatever was parsed is returned.
        print(f"Error reading file {file_path}, error: {e}")
    return data

# Load every JSON file into its own pandas DataFrame, skipping files that
# produced no records (unreadable or empty).
for json_file in all_json_files:
    data = read_json_file(json_file)
    if data:
        df = pd.DataFrame(data)
        dataframes.append(df)

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list, so fail early with a message that points at the input folder instead.
if not dataframes:
    raise FileNotFoundError(f"No JSON records could be loaded from {main_folder}")

# Stack all per-file DataFrames into a single one (this assumes the files share
# the same structure) and renumber the rows.
combined_df = pd.concat(dataframes, ignore_index=True)

# Print the combined DataFrame
print(combined_df)



In [None]:
# Preview the first rows of the combined California reviews DataFrame.
combined_df.head()

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,108991152262655788985,Song Ro,1609909927056,5,Love there korean rice cake.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
1,111290322219796215751,Rafa Robles,1612849648663,5,Good very good,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
2,112640357449611959087,David Han,1583643882296,4,They make Korean traditional food very properly.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
3,117440349723823658676,Anthony Kim,1551938216355,5,Short ribs are very delicious.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
4,100580770836123539210,Mario Marzouk,1494910901933,5,Great food and prices the portions are large,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49


In [None]:
# Export the combined reviews DataFrame (`combined_df`, which must already
# exist and hold data) to a Parquet file.

# Destination path and file name
output_file = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/combined_reviews_california.parquet'

# Write with the pyarrow engine, without storing the DataFrame index
combined_df.to_parquet(output_file, index=False, engine='pyarrow')

print(f"DataFrame exportado a {output_file}")


DataFrame exportado a /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/combined_reviews_estados.parquet


### 1.2 Cargar, combinar y exportar a parquet los datos de "metadata-sitios"

In [None]:
import os
import glob
import dask.dataframe as dd

# Root folder (on the mounted Drive) that holds the "metadata-sitios" JSON files
main_folder = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/metadata-sitios'

# Collect every *.json file in main_folder or in any of its subfolders
json_pattern = os.path.join(main_folder, '**', '*.json')
all_json_files = glob.glob(json_pattern, recursive=True)

# Build a Dask DataFrame over all the files, read in blocks of 64MB
# (adjust the blocksize as needed)
ddf = dd.read_json(all_json_files, blocksize='64MB')

# Materialise it as a pandas DataFrame for further analysis.
# NOTE(review): the index of the result may repeat across blocks - confirm before relying on it.
meta_sitios = ddf.compute()

# Print the combined DataFrame
print(meta_sitios)


                                 name  \
0                     Porter Pharmacy   
1                        City Textile   
2                        San Soo Dang   
3                        Nova Fabrics   
4                    Nobel Textile Co   
...                               ...   
68175                        Steak 48   
68176   Jack Mcnerney Chevrolet, Inc.   
68177  Central Ny Spay Neuter Assista   
68178                   Ok Feed Store   
68179              Crestview Crossing   

                                                 address  \
0      Porter Pharmacy, 129 N Second St, Cochran, GA ...   
1      City Textile, 3001 E Pico Blvd, Los Angeles, C...   
2      San Soo Dang, 761 S Vermont Ave, Los Angeles, ...   
3      Nova Fabrics, 2200 E 11th St, Los Angeles, CA ...   
4      Nobel Textile Co, 719 E 9th St, Los Angeles, C...   
...                                                  ...   
68175   Steak 48, 260 S Broad St, Philadelphia, PA 19102   
68176  Jack Mcnerney Chevro

In [None]:
# Preview the first rows of the combined "metadata-sitios" DataFrame.
meta_sitios.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,Porter Pharmacy,"Porter Pharmacy, 129 N Second St, Cochran, GA ...",0x88f16e41928ff687:0x883dad4fd048e8f8,,32.3883,-83.3571,['Pharmacy'],4.9,16,,"[['Friday', '8AM–6PM'], ['Saturday', '8AM–12PM...","{'Service options': ['In-store shopping', 'Sam...",Open ⋅ Closes 6PM,"['0x88f16e41929435cf:0x5b2532a2885e9ef6', '0x8...",https://www.google.com/maps/place//data=!4m2!3...
1,City Textile,"City Textile, 3001 E Pico Blvd, Los Angeles, C...",0x80c2c98c0e3c16fd:0x29ec8a728764fdf9,,34.018891,-118.21529,['Textile exporter'],4.5,6,,,,Open now,"['0x80c2c624136ea88b:0xb0315367ed448771', '0x8...",https://www.google.com/maps/place//data=!4m2!3...
2,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,,34.058092,-118.29213,['Korean restaurant'],4.4,18,,"[['Thursday', '6:30AM–6PM'], ['Friday', '6:30A...","{'Service options': ['Takeout', 'Dine-in', 'De...",Open ⋅ Closes 6PM,"['0x80c2c78249aba68f:0x35bf16ce61be751d', '0x8...",https://www.google.com/maps/place//data=!4m2!3...
3,Nova Fabrics,"Nova Fabrics, 2200 E 11th St, Los Angeles, CA ...",0x80c2c89923b27a41:0x32041559418d447,,34.023669,-118.23293,['Fabric store'],3.3,6,,"[['Thursday', '9AM–5PM'], ['Friday', '9AM–5PM'...","{'Service options': ['In-store shopping'], 'Pa...",Open ⋅ Closes 5PM,"['0x80c2c8811477253f:0x23a8a492df1918f7', '0x8...",https://www.google.com/maps/place//data=!4m2!3...
4,Nobel Textile Co,"Nobel Textile Co, 719 E 9th St, Los Angeles, C...",0x80c2c632f933b073:0xc31785961fe826a6,,34.036694,-118.249421,['Fabric store'],4.3,7,,"[['Thursday', '9AM–5PM'], ['Friday', '9AM–5PM'...",{'Service options': ['In-store pickup']},Open ⋅ Closes 5PM,"['0x80c2c62c496083d1:0xdefa11317fe870a1', '0x8...",https://www.google.com/maps/place//data=!4m2!3...


In [None]:
# Export the DataFrame built from the "metadata-sitios" dataset, which was read
# earlier with dask, to a Parquet file.

# Destination path and file name
output_file = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/combined_metadata_sitios.parquet'

# Write with the pyarrow engine, without storing the DataFrame index
meta_sitios.to_parquet(output_file, index=False, engine='pyarrow')

print(f"DataFrame exportado a {output_file}")

DataFrame exportado a /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/combined_metadata_sitios.parquet


### 1.3 Cargar y exportar a parquet el archivo business.pkl

In [None]:
def cargar_dataset_pickle(file_path):
    """Load a pickled object with pandas, returning None if loading fails.

    Any exception raised while reading is printed rather than propagated.

    NOTE: unpickling can execute arbitrary code, so this should only be used
    on files from a trusted source.

    Args:
        file_path: Path of the pickle file.

    Returns:
        The object returned by `pd.read_pickle`, or None on error.
    """
    try:
        dataset = pd.read_pickle(file_path)
    except Exception as err:
        print(f'Error loading pickle file: {err}')
        return None
    return dataset

# Drive location of the pickled Yelp business dataset
url = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/Yelp/business.pkl'
dfbusinessYelp = cargar_dataset_pickle(url)

# The loader returns None on failure, so handle that case before previewing
if dfbusinessYelp is None:
    print("Failed to load the dataset.")
else:
    print(dfbusinessYelp.head())


              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara   NaN       93101   
1  87 Grasso Plaza Shopping Center         Affton   NaN       63123   
2             5255 E Broadway Blvd         Tucson   NaN       85711   
3                      935 Race St   Philadelphia    CA       19107   
4                    101 Walnut St     Green Lane    MO       18054   

    latitude   longitude stars review_count  ... state postal_code latitude  \
0  34.426679 -119.711197   5.0            7  ...   NaN         NaN      NaN   
1  38.551126  -90.335695   3.0           15  ...   NaN         NaN      NaN   
2  32

In [None]:
# Preview the first rows of the Yelp business DataFrame as loaded from the pickle.
dfbusinessYelp.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,state.1,postal_code.1,latitude.1,longitude.1,stars.1,review_count.1,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,...,,,,,,,,,,
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,...,,,,,,,,,,
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,...,,,,,,,,,,
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,...,,,,,,,,,,
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,...,,,,,,,,,,


In [None]:
# Drop the repeated columns of the DataFrame above so that it can be exported
# to Parquet: only the first occurrence of each column label is kept.
is_repeated = dfbusinessYelp.columns.duplicated()
dfbusinessYelp = dfbusinessYelp.loc[:, ~is_repeated]

# Destination path and file name
output_file = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_business.parquet'

# Write with the pyarrow engine, without storing the DataFrame index
dfbusinessYelp.to_parquet(output_file, index=False, engine='pyarrow')

print(f"DataFrame exportado a {output_file}")



DataFrame exportado a /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_business.parquet


In [None]:
# Preview the Yelp business DataFrame again, now without the repeated columns.
dfbusinessYelp.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


### 1.4 Cargar el archivo checkin.json y exportarlo a parquet

In [None]:
# Location of the Yelp check-in JSON file on the mounted Drive
file_path = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/Yelp/checkin.json'

# Parse the file one line at a time, one JSON document per line
data = []
with open(file_path, 'r') as file:
    for line in file:
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            # Report the line that could not be decoded and move on to the next one
            print(f"Error al decodificar JSON: {e}")
        else:
            data.append(record)

# Build a DataFrame from the list of parsed records
df_checkin = pd.DataFrame(data)

# Print the first rows of the DataFrame
print(df_checkin.head())



              business_id                                               date
0  ---kPU91CF4Lq2-WlRu9Lw  2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020...
1  --0iUa4sNDFiZFrAdIWhZQ  2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011...
2  --30_8IhuyMHbSOcNWd6DQ           2013-06-14 23:29:17, 2014-08-13 23:20:22
3  --7PUidqRWpRSpXebiyxTg  2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012...
4  --7jw19RH9JKXgFohspgQw  2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014...


In [None]:
# Preview the first rows of the check-in DataFrame.
df_checkin.head()

Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22"
3,--7PUidqRWpRSpXebiyxTg,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012..."
4,--7jw19RH9JKXgFohspgQw,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014..."


In [None]:
# Convert df_checkin to Parquet.

# Destination path and file name of the Parquet output
output_file = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/checkin.parquet'

# Write with the pyarrow engine, without storing the DataFrame index
df_checkin.to_parquet(output_file, index=False, engine='pyarrow')

print(f"DataFrame exportado a {output_file}")

DataFrame exportado a /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/checkin.parquet


### 1.5 Leer el archivo review.json, cargarlo como dataframe y convertirlo a parquet

In [None]:
import ijson #usamos la librería ijson
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

# Stream the Yelp review file (one JSON document per line) into a single Parquet
# file, converting `batch_size` records at a time so the whole dataset is never
# held in memory at once and no intermediate batch files are left behind.

# Source JSON file and destination Parquet file
file_path = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/Yelp/review.json'
output_file = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_review.parquet'

batch_size = 100000  # Records per written batch; adjust as needed
batch_data = []
batch_count = 0  # Number of batches written so far


def _append_batch(records, writer):
    """Append `records` to the output Parquet file and return the open writer.

    The writer is created from the first batch, whose schema then applies to
    the whole file; later batches are converted using that same schema.
    """
    frame = pd.DataFrame(records)
    if writer is None:
        table = pa.Table.from_pandas(frame, preserve_index=False)
        writer = pq.ParquetWriter(output_file, table.schema)
    else:
        table = pa.Table.from_pandas(frame, schema=writer.schema, preserve_index=False)
    writer.write_table(table)
    return writer


writer = None
try:
    # Explicit UTF-8 so decoding does not depend on the platform's locale
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                batch_data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error al decodificar JSON: {e}")
                continue

            if len(batch_data) >= batch_size:
                writer = _append_batch(batch_data, writer)
                batch_data = []
                batch_count += 1

    # Write whatever is left after the last full batch. Nothing is written when
    # the record count is an exact multiple of batch_size (or zero), so no
    # missing or stale batch is ever referenced.
    if batch_data:
        writer = _append_batch(batch_data, writer)
        batch_data = []
        batch_count += 1
finally:
    # Closing the writer finalises the Parquet file, even if an error interrupted the loop
    if writer is not None:
        writer.close()

if batch_count:
    print(f"DataFrame exportado a {output_file}")
else:
    print(f"No se encontraron registros JSON en {file_path}")


DataFrame exportado a review_combined.parquet


### 1.6 Leer el archivo tip.json, cargarlo como dataframe y convertirlo a parquet

In [None]:
# Source (one JSON document per line) and destination (Parquet) of the Yelp tips
ruta_archivo = "/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/Yelp/tip.json"
ruta_salida = "/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_tip.parquet"

# Read the file into a DataFrame and write it straight to Parquet
pd.read_json(
    ruta_archivo,
    lines=True,
).to_parquet(ruta_salida)

In [None]:
# Read the yelp_tip Parquet file back into a DataFrame.
yelp_tip = pd.read_parquet("/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_tip.parquet")

In [None]:
yelp_tip.head() # preview the resulting DataFrame

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0


### 1.7 Leer el archivo user.parquet para ver cómo está estructurado y determinar posibles transformaciones

In [None]:
# Load the Yelp user dataset, which is already stored as Parquet, with the pyarrow engine.
yelp_user = pd.read_parquet('/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/Yelp/user.parquet', engine='pyarrow')

In [None]:
# Summarise the columns and dtypes of the user DataFrame to decide on possible transformations.
yelp_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105597 entries, 0 to 2105596
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   name                object 
 2   review_count        int64  
 3   yelping_since       object 
 4   useful              int64  
 5   funny               int64  
 6   cool                int64  
 7   elite               object 
 8   friends             object 
 9   fans                int64  
 10  average_stars       float64
 11  compliment_hot      int64  
 12  compliment_more     int64  
 13  compliment_profile  int64  
 14  compliment_cute     int64  
 15  compliment_list     int64  
 16  compliment_note     int64  
 17  compliment_plain    int64  
 18  compliment_cool     int64  
 19  compliment_funny    int64  
 20  compliment_writer   int64  
 21  compliment_photos   int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 353.4+ MB


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the cells above already read from and write to /content/drive,
# so on a fresh runtime this mount presumably has to run before them - confirm
# the intended cell order.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install --upgrade google-cloud-storage
!pip install --upgrade google-cloud-bigquery
!pip install pyarrow
!pip install pandas-gbq

from google.colab import auth
auth.authenticate_user()

from google.cloud import storage
from google.cloud import bigquery






In [None]:
# Set the project that the gcloud CLI will use. The Python value of
# `project_id` is interpolated into the shell command below.
project_id = 'pf-henry-426700'
!gcloud config set project {project_id}


Updated property [core/project].


In [None]:
# Local Parquet files to upload
files = [
    '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/checkin.parquet',
    '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/combined_metadata_sitios.parquet',
    '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/combined_reviews_california.parquet',
    '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_business.parquet',
    '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_review.parquet',
    '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_tip.parquet',
    '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/Yelp/user.parquet'
]

# Upload each file to GCS under the "ggl_yelp_datasets/" prefix, keeping only
# the last component of its path as the object name.
# NOTE(review): `bucket` is not defined in any cell visible in this notebook;
# presumably it comes from a storage client created in a cell that was removed - confirm.
for file_path in files:
    file_name = file_path.rpartition("/")[2]
    blob_name = f'ggl_yelp_datasets/{file_name}'
    bucket.blob(blob_name).upload_from_filename(file_path)
    print(f'File {file_path} uploaded to {blob_name}.')

print('Files uploaded to Google Cloud Storage.')


File /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/checkin.parquet uploaded to ggl_yelp_datasets/checkin.parquet.
File /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/combined_metadata_sitios.parquet uploaded to ggl_yelp_datasets/combined_metadata_sitios.parquet.
File /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/combined_reviews_california.parquet uploaded to ggl_yelp_datasets/combined_reviews_california.parquet.
File /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_business.parquet uploaded to ggl_yelp_datasets/yelp_business.parquet.
File /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_review.parquet uploaded to ggl_yelp_datasets/yelp_review.parquet.
File /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/yelp_tip.parquet uploaded to ggl_yelp_datasets/yelp_tip.parquet.
File /content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/Yelp/user.parquet uploaded to ggl_yelp_datasets/user.parquet.
Files uploaded to Google Cloud Storage.


In [None]:
import hashlib


In [None]:
def calcular_md5(ruta_archivo, tamano_bloque=1024 * 1024):
    """Return the MD5 digest of a file as a hexadecimal string.

    The file is read in blocks, so large files are hashed without being loaded
    into memory in full.

    Args:
        ruta_archivo: Path of the file to hash.
        tamano_bloque: Number of bytes read per iteration. It does not affect
            the resulting digest, only how many reads are needed.

    Returns:
        The hexadecimal MD5 digest of the file content.

    Raises:
        ValueError: If `tamano_bloque` is not a positive number.
        OSError: If the file cannot be opened or read.
    """
    if tamano_bloque < 1:
        # read(0) would return b"" right away and yield the digest of an empty file.
        raise ValueError("tamano_bloque must be a positive number of bytes")

    hash_md5 = hashlib.md5()
    with open(ruta_archivo, "rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (end of file)
        for bloque in iter(lambda: f.read(tamano_bloque), b""):
            hash_md5.update(bloque)
    return hash_md5.hexdigest()


In [None]:
# Compute and print the MD5 digest of the local file.
# NOTE(review): the upload cell lists this file as ".../DATASETS PF_HENRY/Yelp/user.parquet",
# while the path below has no "Yelp/" folder - confirm both refer to the same file.
ruta_archivo = '/content/drive/MyDrive/AULATEC/DATASETS PF_HENRY/user.parquet'  # Replace with the real path of your file
hash_md5_calculado = calcular_md5(ruta_archivo)
print("Hash MD5 calculado:", hash_md5_calculado)


Hash MD5 calculado: 865ca29e915ee5a02044b5c1a572eda9


In [None]:
import hashlib
import base64

def base64_to_hex(base64_hash):
    """Decode a base64-encoded value and return its bytes as a hexadecimal string."""
    return base64.b64decode(base64_hash).hex()

# MD5 digest computed in this notebook, in hexadecimal
hex_md5_calculado = "865ca29e915ee5a02044b5c1a572eda9"

# MD5 digest taken from the Google Cloud console, in base64
base64_md5_consola = "hlyinpFe5aAgRLXBpXLtqQ=="

# Convert the console digest to hexadecimal so both values use the same representation
hex_md5_consola = base64_to_hex(base64_md5_consola)

# Compare both hexadecimal digests and report the outcome
hashes_coinciden = hex_md5_calculado == hex_md5_consola
print(
    "Los hashes MD5 coinciden. El archivo está íntegro."
    if hashes_coinciden
    else "Los hashes MD5 no coinciden. Puede haber cambios en el archivo."
)


Los hashes MD5 coinciden. El archivo está íntegro.
