In [None]:
!pip install pdfplumber

In [None]:
!pip install nltk

In [4]:
import nltk
nltk.download('punkt_tab')  

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [1]:
import pdfplumber
from pyspark.sql import SparkSession, Row
import numpy as np

# Iniciar Spark Session
spark = SparkSession.builder \
    .appName("PySpark PDF Indexing") \
    .config("spark.es.nodes", "elasticsearch") \
    .config("spark.es.port", "9200") \
    .config("spark.es.nodes.wan.only", "true") \
    .getOrCreate()

In [8]:
import nltk

# Función para extraer texto de un PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Asegurar que no se añada texto nulo
                new_content = page_text.replace("\n", "").strip() + " "
                if (len(new_content)>10):
                    text += new_content
    return text

# Función para dividir el texto en oraciones
def chunk_text_by_sentences(text):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return tokenizer.tokenize(text.strip())

# Ruta al PDF
pdf_path = "paper.pdf"

# Extraer y segmentar el texto del PDF en oraciones
full_text = extract_text_from_pdf(pdf_path)
sentences = chunk_text_by_sentences(full_text)

# Crear un DataFrame con las oraciones
sentences_data = [(i, sentence) for i, sentence in enumerate(sentences)]
df = spark.createDataFrame(sentences_data, ["id", "sentence_text"])
df.show()

+---+--------------------+
| id|       sentence_text|
+---+--------------------+
|  0|MuHeQA: Zero-shot...|
|  1|First, the approa...|
|  2|A novel algorithm...|
|  3|IntroductionKnowl...|
|  4|   knowledge graphs.|
|  5|KGQA systems do n...|
|  6|                  ].|
|  7|In this step,natu...|
|  8|Once the template...|
|  9|Thus,KGQA systems...|
| 10|This approach is ...|
| 11|Moreover,the depe...|
| 12|The combined use ...|
| 13|Thus, our second ...|
| 14|And finally, anot...|
| 15|no need to create...|
| 16|MuHeQAsupportssin...|
| 17|textual content a...|
| 18|Section 2 present...|
| 19|Our proposal is d...|
+---+--------------------+
only showing top 20 rows



In [10]:
# Especificar el índice de Elasticsearch donde se indexarán los datos
index_name = "pdf_sentences"

# Escribir el DataFrame a Elasticsearch
df.write.format("org.elasticsearch.spark.sql") \
    .option("es.resource", f"{index_name}/_doc") \
    .mode("append") \
    .save()

# Detener la sesión Spark
spark.stop()