<a href="https://colab.research.google.com/github/eleanarey/ProgramingPractices/blob/main/tfm_eleana_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Instalación de bibliotecas necesarias
!pip install PyPDF2 kaggle

!rm -rf ./dataset  # Eliminar carpeta local si existe

import os
import requests
import shutil
import hashlib
from google.colab import drive
from PIL import Image
import PyPDF2

# Configuración inicial
localDataset = False
mountPoint = '/content/drive'
remotePath = 'MyDrive/Colab Notebooks'
localPath = './dataset'
output_folder = '/content/drive/MyDrive/Colab Notebooks/dataset'  # Ruta de salida en Google Drive
categories = ['imagenes_validas', 'imagenes_invalidas', 'pdfs_validos', 'pdfs_invalidos', 'duplicados']

# Configuración para Kaggle
kaggle_dataset_images = "jrobischon/wikipedia-movie-plots"  # Dataset de ejemplo
kaggle_dataset_pdfs = "lalitharajesh/documents-dataset"  # Cambiar a un dataset de PDFs
kaggle_folder = "./kaggle_data"

# Descarga desde Open Images (ejemplo limitado para pruebas)
open_images_urls = [
    "https://storage.googleapis.com/openimages/2018_04/test/test-annotations-human-imagelabels-boxable.csv",
    "https://storage.googleapis.com/openimages/2018_04/test/test-images.csv"
]

# Función para calcular hash de un archivo (para detectar duplicados)
def get_file_hash(file_path):
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        buf = f.read()
        hasher.update(buf)
    return hasher.hexdigest()

# Función para detectar duplicados
def detect_duplicates(file_path, hash_dict):
    file_hash = get_file_hash(file_path)
    if file_hash in hash_dict:
        return True, hash_dict[file_hash]
    hash_dict[file_hash] = file_path
    return False, None

# Función para validar imágenes
def validate_image(image_path):
    try:
        with Image.open(image_path) as img:
            img.verify()
        return True
    except Exception as e:
        print(f"Archivo no válido (imagen): {image_path}, Error: {e}")
        return False

# Función para validar PDFs
def validate_pdf(pdf_path):
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            if len(reader.pages) > 0:
                return True
    except Exception as e:
        print(f"Archivo no válido (PDF): {pdf_path}, Error: {e}")
        return False

# Función para descargar y validar Open Images Dataset
def download_open_images(output_folder):
    os.makedirs(output_folder, exist_ok=True)
    for url in open_images_urls:
        file_name = url.split("/")[-1]
        save_path = os.path.join(output_folder, file_name)
        if not os.path.exists(save_path):  # Validar si el archivo ya existe
            response = requests.get(url, stream=True)
            if response.status_code == 200:
                with open(save_path, "wb") as file:
                    shutil.copyfileobj(response.raw, file)
                print(f"Descargado: {save_path}")
            else:
                print(f"Error al descargar {url}")
        else:
            print(f"Archivo ya existente: {save_path}")

# Función para descargar y procesar Kaggle Datasets
def download_kaggle_dataset(kaggle_dataset, output_folder):
    os.makedirs(kaggle_folder, exist_ok=True)
    zip_path = os.path.join(kaggle_folder, f"{kaggle_dataset.split('/')[-1]}.zip")
    if not os.path.exists(zip_path):  # Validar si el dataset ya fue descargado
        !kaggle datasets download -d {kaggle_dataset} -p {kaggle_folder}
    else:
        print(f"Dataset ya descargado: {zip_path}")
    !unzip -o {zip_path} -d {output_folder}

# Copiar datos desde Google Drive
if not localDataset:
    print('[ COPYING DATA FROM GOOGLE DRIVE TO LOCAL COLAB SPACE ]')
    if os.path.isdir(mountPoint):
        print(f'  * GOOGLE DRIVE ALREADY MOUNTED AT {mountPoint}')
    else:
        print(f'  * MOUNTING GOOGLE DRIVE AT {mountPoint}')
        drive.mount(mountPoint)

    remoteDir = os.path.join(mountPoint, remotePath)
    if not os.path.exists(remoteDir):
        print(f'  * Remote directory {remoteDir} does not exist. Creating it now.')
        os.makedirs(remoteDir, exist_ok=True)

    if os.path.isdir(localPath):
        print(f'  * LOCAL PATH {localPath} ALREADY EXISTS.')
    else:
        print(f'  * COPYING FILES FROM {remoteDir} TO {localPath}')
        shutil.copytree(remoteDir, localPath)

# Descargar y procesar Open Images Dataset
print('[ DOWNLOADING OPEN IMAGES DATASET ]')
download_open_images(output_folder)

# Descargar y procesar Kaggle Dataset de Imágenes
print('[ DOWNLOADING KAGGLE DATASET (IMAGES) ]')
download_kaggle_dataset(kaggle_dataset_images, output_folder)

# Descargar y procesar Kaggle Dataset de PDFs
print('[ DOWNLOADING KAGGLE DATASET (PDFS) ]')
download_kaggle_dataset(kaggle_dataset_pdfs, output_folder)

# Procesar archivos locales y duplicados
print('[ PROCESSING FILES ]')
hash_dict = {}

for file_name in os.listdir(localPath):
    file_path = os.path.join(localPath, file_name)
    is_duplicate, original_path = detect_duplicates(file_path, hash_dict)

    if is_duplicate:
        print(f"Duplicado detectado: {file_path} (original: {original_path})")
        organize_file(file_path, 'duplicados', output_folder)
    elif file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
        if validate_image(file_path):
            organize_file(file_path, 'imagenes_validas', output_folder)
        else:
            organize_file(file_path, 'imagenes_invalidas', output_folder)
    elif file_name.lower().endswith('.pdf'):
        if validate_pdf(file_path):
            organize_file(file_path, 'pdfs_validos', output_folder)
        else:
            organize_file(file_path, 'pdfs_invalidos', output_folder)

print('[ DONE ]')


[ COPYING DATA FROM GOOGLE DRIVE TO LOCAL COLAB SPACE ]
  * GOOGLE DRIVE ALREADY MOUNTED AT /content/drive
  * COPYING FILES FROM /content/drive/MyDrive/Colab Notebooks TO ./dataset
[ DOWNLOADING OPEN IMAGES DATASET ]
Archivo ya existente: /content/drive/MyDrive/Colab Notebooks/dataset/test-annotations-human-imagelabels-boxable.csv
Archivo ya existente: /content/drive/MyDrive/Colab Notebooks/dataset/test-images.csv
[ DOWNLOADING KAGGLE DATASET (IMAGES) ]
Dataset ya descargado: ./kaggle_data/wikipedia-movie-plots.zip
Archive:  ./kaggle_data/wikipedia-movie-plots.zip
caution: filename not matched:  Notebooks/dataset
[ DOWNLOADING KAGGLE DATASET (PDFS) ]
403 - Forbidden - Permission 'datasets.get' was denied
unzip:  cannot find or open ./kaggle_data/documents-dataset.zip, ./kaggle_data/documents-dataset.zip.zip or ./kaggle_data/documents-dataset.zip.ZIP.
[ PROCESSING FILES ]


IsADirectoryError: [Errno 21] Is a directory: './dataset/dataset'

In [11]:
# Configuración para Kaggle
kaggle_dataset = "ajaypalsinghlo/world-bank-invoices"  # Dataset con PDFs
kaggle_folder = "./kaggle_data"

# Función para descargar y procesar Kaggle Datasets
def download_kaggle_dataset(kaggle_dataset, output_folder):
    os.makedirs(kaggle_folder, exist_ok=True)
    !kaggle datasets download -d {kaggle_dataset} -p {kaggle_folder}
    !unzip -o {kaggle_folder}/*.zip -d {output_folder}
