In [None]:
!pip install deep-translator
!pip install pandas langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=082fbb3f68d8a060c04d33c29c4e5af2e83d3d82ba39d0d86fc4b89b65aec738
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import csv
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from langdetect import detect

# Scheme and host of the repository site; START_URL is built from it.
BASE_URL  = "https://repositorio.upch.edu.pe"
# First listing page ("recent submissions") where the scrape begins.
START_URL = f"{BASE_URL}/handle/20.500.12866/1318/recent-submissions"
# Upper bound on the number of scraped articles. Swap in the finite value below
# for a quick trial run; float('inf') never stops the pagination check early.
#MAX_ARTICULOS = 20  # Change this value if you want more articles
MAX_ARTICULOS = float('inf')
# Accumulator: scrape_listing_page() appends one dict per article to this list.
datos = []

def limpia_texto(t):
    """Collapse each run of whitespace in ``t`` to one space and trim both ends."""
    colapsado = re.sub(r'\s+', ' ', t)
    return colapsado.strip()

def traducir(texto):
    """Translate ``texto`` into English, returning "NA" when that is not possible.

    The translation is delegated to ``GoogleTranslator`` with the source
    language auto-detected and English as the target.

    Args:
        texto: Text to translate. ``None``, non-string values, blank strings
            and the "NA" placeholder are treated as "nothing to translate".

    Returns:
        The translated string, or the placeholder "NA" when the input is
        empty/missing, the translator raises, or it returns nothing.
    """
    # Guard first: calling .strip() on None (or any non-string) would raise.
    if not isinstance(texto, str) or texto.strip() in ("", "NA"):
        return "NA"
    try:
        traduccion = GoogleTranslator(source='auto', target='en').translate(texto)
    except Exception as e:
        # Best effort by design: one failed request must not abort the whole scrape.
        print(f"Error al traducir: {e}")
        return "NA"
    # Keep the output column uniform: never hand back None or an empty string.
    return traduccion if traduccion else "NA"

def traducir_si_espanol(texto):
    """Translate ``texto`` to English only when it is detected as Spanish.

    Args:
        texto: Text to inspect. ``None``, non-string values, blank strings
            and the "NA" placeholder yield "NA".

    Returns:
        - "NA" for missing/blank input;
        - the result of ``traducir(texto)`` when the detected language is 'es';
        - ``texto`` unchanged for any other detected language, or when
          detection/translation raises (the error is printed).
    """
    # Guard first: calling .strip() on None (or any non-string) would raise.
    if not isinstance(texto, str) or texto.strip() in ("", "NA"):
        return "NA"

    # langdetect samples randomly unless a seed is set, so short or ambiguous
    # strings can be classified differently from run to run. Pinning the seed
    # makes the scrape reproducible. Imported here so this function is
    # self-contained; the assignment is idempotent and cheap.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        if detect(texto) == 'es':
            return traducir(texto)
        return texto  # Already English (or some other language): keep as is
    except Exception as e:
        print(f"Error detectando idioma: {e}")
        return texto  # Fall back to the original text if detection fails

def _texto_resumen(fragmento_html):
    """Return the visible text of an HTML fragment, minus a leading "Resumen" label."""
    texto = BeautifulSoup(fragmento_html, "html.parser").get_text(" ", strip=True)
    return limpia_texto(re.sub(r'^Resumen[:：]?\s*', '', texto, flags=re.IGNORECASE))


def extraer_resumen_completo(url, timeout=30):
    """Download an item page and return the text of its abstract.

    The abstract is read from ``div.simple-item-view-description``. Its HTML
    is split on ``<div class="spacer">`` markers:

    - fewer than two markers: the text of the whole block is returned;
    - otherwise: only the segment between the first and second marker is
      returned (per the original author's note, that is where the English
      version sits on this site).

    Args:
        url: Absolute URL of the item page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole scrape.

    Returns:
        The cleaned abstract text, or "NA" when the page cannot be fetched
        (network error, timeout, HTTP error status), has no description
        block, or the extracted text is empty.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Do not parse 4xx/5xx error pages as if they were item pages.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        resumen_div = soup.select_one("div.simple-item-view-description")

        if not resumen_div:
            return "NA"

        # Work on the serialized HTML so it can be cut at the spacer markers.
        raw_html = str(resumen_div)
        partes = raw_html.split('<div class="spacer">')

        # Fewer than two separators: use the whole block; otherwise partes[1]
        # is the segment between the first and second spacer.
        fragmento = raw_html if len(partes) < 3 else partes[1]

        # Both branches fall back to "NA" so callers never receive "".
        return _texto_resumen(fragmento) or "NA"

    except Exception as e:
        # Best effort by design: a single bad page must not abort the scrape.
        print(f"Error al extraer resumen en {url}: {e}")
        return "NA"

def _texto_o_na(tag):
    """Return the whitespace-normalized text of ``tag``, or "NA" if the tag is missing."""
    return limpia_texto(tag.text) if tag else "NA"


def scrape_listing_page(url):
    """Scrape a listing page and all following pages, appending to ``datos``.

    For every ``li.ds-artifact-item`` on a page this reads title, authors,
    date and publisher, fetches the abstract from the item's own page, and
    appends one dict to the module-level ``datos`` list. Title, abstract and
    publisher go through ``traducir_si_espanol``; authors, date and link are
    stored as scraped.

    Pages are walked in a loop (following ``a.next-page-link``) rather than
    by recursion, so a long collection cannot hit Python's recursion limit.
    Scraping stops when there is no next page, when ``MAX_ARTICULOS`` entries
    have been collected (checked per item, not only per page), or when a
    listing page cannot be downloaded. In that last case the error is printed
    and the articles gathered so far stay in ``datos``.

    Args:
        url: Absolute URL of the first listing page to scrape.
    """
    while url and len(datos) < MAX_ARTICULOS:
        print(f"Scraping: {url}")
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
        except requests.RequestException as e:
            # Stop paginating but keep what was already collected.
            print(f"Error al descargar {url}: {e}")
            return
        soup = BeautifulSoup(r.text, "html.parser")

        for item in soup.select("li.ds-artifact-item"):
            if len(datos) >= MAX_ARTICULOS:
                return

            titulo_tag = item.select_one("div.artifact-title a")
            # .get() avoids a KeyError when the anchor has no href attribute.
            href = titulo_tag.get("href") if titulo_tag else None

            titulo    = _texto_o_na(titulo_tag)
            # urljoin resolves absolute paths, relative paths and full URLs alike.
            link      = urljoin(url, href) if href else "NA"
            autores   = _texto_o_na(item.select_one("span.author"))
            fecha     = _texto_o_na(item.select_one("span.date"))
            editorial = _texto_o_na(item.select_one("span.publisher"))
            resumen   = extraer_resumen_completo(link) if link != "NA" else "NA"

            print(f"Traduciendo artículo {len(datos)+1}: {titulo[:50]}...")
            datos.append({
                "title": traducir_si_espanol(titulo),
                "abstract": traducir_si_espanol(resumen),
                "publisher": traducir_si_espanol(editorial),
                "authors": autores,
                "date": fecha,
                "link": link  # Never translated
            })

        # Move on to the next page if the listing offers one.
        next_btn = soup.select_one("a.next-page-link")
        next_href = next_btn.get("href") if next_btn else None
        url = urljoin(url, next_href) if next_href else None

# Run the scraping + translation; results accumulate in `datos`.
scrape_listing_page(START_URL)

# Save the translated results in a format Spanish-locale Excel opens directly:
# ';' as delimiter and every field quoted. "utf-8-sig" prepends a BOM, which
# Excel needs in order to decode accented characters (e.g. author names) as
# UTF-8 instead of the legacy ANSI code page.
ARCHIVO_SALIDA = "articulos_upch_traducidos_con_editorial.csv"
with open(ARCHIVO_SALIDA, "w", newline="", encoding="utf-8-sig") as f:
    fieldnames = ["title", "abstract", "publisher", "authors", "date", "link"]
    writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=';', quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(datos)

print(f"\n Se guardaron {len(datos)} artículos traducidos en '{ARCHIVO_SALIDA}'")

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
Traduciendo artículo 230: Prevalence of SARS-CoV-2 Variants and Disease Outc...
Traduciendo artículo 231: Development and internal validation of a multifact...
Traduciendo artículo 232: Prevalence of and risk factors for vertebral fract...
Traduciendo artículo 233: Country and policy factors influencing the impleme...
Traduciendo artículo 234: Association between maternity leave policies and p...
Traduciendo artículo 235: Efficacy and Safety of the Use of SGLT2 Inhibitors...
Traduciendo artículo 236: Relationship Between Fear of COVID-19, Conspiracy ...
Traduciendo artículo 237: Observational Study of Fetal Monitoring in Differe...
Traduciendo artículo 238: Neuro-Sweet Syndrome: A Diagnostic Conundrum...
Traduciendo artículo 239: Known unknowns - steps towards the true annual ris...
Traduciendo artículo 240: A qualitative analysis of adolescent motherhood wi...
Scraping: https://repositorio.upch.edu.pe/handle/20

In [None]:
# Google Colab only: download the CSV written by the scraping cell to the local machine.
from google.colab import files
files.download("articulos_upch_traducidos_con_editorial.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# PREPROCESAMIENTO DEL TEXTO
!pip install pandas spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m116.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# 3. Load the English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# 4. Load the CSV file (adjust the name if yours is different).
#    sep=';' and quoting=1 (the integer value of csv.QUOTE_ALL) mirror how the
#    scraping cell wrote the file.
#    NOTE(review): with pandas' default NA handling, the "NA" placeholders in
#    the file presumably load as NaN — confirm this is the intended behavior.
df = pd.read_csv("articulos_upch_traducidos_con_editorial.csv", sep=';', quoting=1)

# 5. Preprocessing function
def preprocesar_texto(texto):
    """Turn one text cell into a space-separated string of lemmas.

    Missing values (anything ``pd.isna`` flags) yield "". Otherwise the text
    is lowercased, every character that is not an ASCII letter or whitespace
    is replaced by a space, and the result is run through the module-level
    spaCy pipeline ``nlp``. The lemma of each alphabetic token is kept unless
    that lemma is in ``STOP_WORDS``.
    """
    if pd.isna(texto):
        return ""

    # Lowercase, then blank out everything except letters and whitespace.
    solo_letras = re.sub(r"[^a-zA-Z\s]", " ", texto.lower())

    # Tokenize, drop stop words and lemmatize in a single pass.
    lemas = []
    for tok in nlp(solo_letras):
        lema = tok.lemma_
        if tok.is_alpha and lema not in STOP_WORDS:
            lemas.append(lema)

    return " ".join(lemas)

# 6. Add a cleaned companion column for each raw text column.
for columna_origen, columna_limpia in (("title", "title_clean"), ("abstract", "abstract_clean")):
    df[columna_limpia] = df[columna_origen].apply(preprocesar_texto)

# 7. Preview raw vs. cleaned text (last expression, so the notebook renders it).
columnas_muestra = ["title", "title_clean", "abstract", "abstract_clean"]
df[columnas_muestra].head()



Unnamed: 0,title,title_clean,abstract,abstract_clean
0,Functional disability in patients with fibromy...,functional disability patient fibromyalgia hos...,Background: Fibromyalgia is a rheumatological ...,background fibromyalgia rheumatological disord...
1,Toothbrushing and Access to Dental Services in...,toothbrushing access dental service peruvian c...,Objective. The aim was to determine the associ...,objective aim determine association access den...
2,The Lancet Diabetes & Endocrinology Commission...,lancet diabete endocrinology commission type d...,"Kofi Annan, former UN Secretary General and No...",kofi annan un secretary general nobel prize wi...
3,Experiences of usual and family members about ...,experience usual family member care receive co...,OBJECTIVE.: To understand the experiences of n...,objective understand experience new continue u...
4,Development and validation of a suspension met...,development validation suspension method base ...,OBJECTIVE.: To develop and validate a cell sus...,objective develop validate cell suspension met...


In [None]:

# 8. Save the DataFrame, now including the *_clean columns, as a ';'-separated
#    CSV with every field quoted (quoting=1 is csv.QUOTE_ALL) and no index column.
df.to_csv("articulos_preprocesados_con_editorial.csv", sep=";", index=False, quoting=1)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 9. Download the resulting file (Google Colab only).
from google.colab import files
files.download("articulos_preprocesados_con_editorial.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>