In [1]:
import pandas as pd
import asyncio
from langchain_core.documents import Document
import openai
import nest_asyncio
from langchain.text_splitter import CharacterTextSplitter

In [2]:
from langchain_elasticsearch import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings
import elasticsearch
import getpass
from functools import partial
from datasets import load_dataset

In [3]:
nest_asyncio.apply()

In [4]:
ds = load_dataset("ELiRF/dacsa", "spanish", token=True)

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'summary', 'article'],
        num_rows: 1802919
    })
    validation: Dataset({
        features: ['id', 'summary', 'article'],
        num_rows: 104052
    })
    test.i: Dataset({
        features: ['id', 'summary', 'article'],
        num_rows: 104052
    })
    test.ni: Dataset({
        features: ['id', 'summary', 'article'],
        num_rows: 109626
    })
})

In [6]:
def parse_data(ds):
    df = ds["train"].to_pandas()
    return df.sample(n=25)

In [7]:
subset_df = parse_data(ds)
subset_df.head()

Unnamed: 0,id,summary,article
778201,1a6d7e2a3a46ccb0134763068b9cc694ed5efb72243f95...,Uno de los cuatro asaltantes al templo metodis...,Ocho personas han muerto y otras 44 han result...
1441437,990b015fee4b21d566abd9913877c2e59b157064ec1654...,Más de 100.000 personas cumplen los criterios ...,El borrador del plan nacional contra la hepati...
1330621,9d7400d837c79f953f61094c73ffa99cf2e81fcab6a6d2...,"En una serie de tuits, el creador de Tesla aug...","Elon Musk, el padre del Hyperloop, de los coch..."
14390,86d5787b89b15b08e4c9b32b24f204a8c5b4143ab5c18c...,El líder de Podemos afirma que se nota otra re...,"Pablo Iglesias, líder de Podemos, ha emplazado..."
334827,e9ac2ce03b1a81039aadae9c8d4f9b6fc68174746294e4...,"Tom MacMaster, estadounidense de 40 años, escr...","Tom MacMaster, el estadounidense de 40 años qu..."


In [8]:
async def translate_text(text):
    document = Document(page_content=text)
    loop = asyncio.get_event_loop()
    response = await loop.run_in_executor(
        None,
        partial(
            openai.chat.completions.create,
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a translator."},
                {
                    "role": "user",
                    "content": f"Translate the following text to English: {document.page_content}",
                },
            ],
        ),
    )
    translated_text = response.choices[0].message.content.strip()
    return translated_text

In [9]:
async def translate_dataframe(df, column_name1, column_name2):
    tasks_column1 = [translate_text(text) for text in df[column_name1]]
    tasks_column2 = [translate_text(text) for text in df[column_name2]]

    translated_texts1 = await asyncio.gather(*tasks_column1)
    translated_texts2 = await asyncio.gather(*tasks_column2)

    df["translated_" + column_name1] = translated_texts1
    df["translated_" + column_name2] = translated_texts2

    return df

In [10]:
loop = asyncio.get_event_loop()
loop.run_until_complete(translate_dataframe(subset_df, "summary", "article"))

Unnamed: 0,id,summary,article,translated_summary,translated_article
778201,1a6d7e2a3a46ccb0134763068b9cc694ed5efb72243f95...,Uno de los cuatro asaltantes al templo metodis...,Ocho personas han muerto y otras 44 han result...,One of the four assailants at the Methodist te...,Eight people died and another 44 were injured ...
1441437,990b015fee4b21d566abd9913877c2e59b157064ec1654...,Más de 100.000 personas cumplen los criterios ...,El borrador del plan nacional contra la hepati...,"More than 100,000 people meet the criteria for...",The draft of the national plan against hepatit...
1330621,9d7400d837c79f953f61094c73ffa99cf2e81fcab6a6d2...,"En una serie de tuits, el creador de Tesla aug...","Elon Musk, el padre del Hyperloop, de los coch...","In a series of tweets, the creator of Tesla pr...","Elon Musk, the father of Hyperloop, Tesla auto..."
14390,86d5787b89b15b08e4c9b32b24f204a8c5b4143ab5c18c...,El líder de Podemos afirma que se nota otra re...,"Pablo Iglesias, líder de Podemos, ha emplazado...",The leader of Podemos states that there is a n...,"Pablo Iglesias, leader of Podemos, has called ..."
334827,e9ac2ce03b1a81039aadae9c8d4f9b6fc68174746294e4...,"Tom MacMaster, estadounidense de 40 años, escr...","Tom MacMaster, el estadounidense de 40 años qu...","Tom MacMaster, a 40-year-old American, was wri...","Tom MacMaster, the 40-year-old American who ha..."
40366,a0caae74f42c4f0f2c2037ec38e2c2b74db46746ddc1ee...,El rodaje comenzará el 16 de agosto en pueblos...,La serie de televisión Águila Roja que emite T...,Filming will begin on August 16 in towns of Ma...,"The television series Águila Roja, which airs ..."
1628056,7be0bb718118c80312cddb07181cd5144c79354905adfc...,El plan con mejor acogida ha sido el de mejora...,Jóvenes mayores de 16 años votan qué inversion...,The most well-received plan has been the impro...,Young people over the age of 16 vote on their ...
389664,025449f042134e89631c21cb34d59399174ccce2f6721d...,Tanto los tanques como las esferas fueron cons...,La compañía Repsol Butano ha iniciado el desmo...,Both the tanks and the spheres were constructe...,The company Repsol Butano has begun the disman...
176264,3c3ce4dbdd18fd5e2836e4fc98ce20f74c46294a473947...,A estos hombres se les han intervenido cerca d...,"Ocho personas, cinco rumanos, dos españoles y ...",These men have had approximately 90 meters of ...,"Eight people, five Romanians, two Spaniards, a..."
401423,e988e55eaa6906835e2743c924dc4d03976b1aff22a805...,"Varios monumentos, como los castillos de Galve...",“Estamos en una de las provincias con más cast...,"Several monuments, such as the castles of Galv...","""We are in one of the provinces with the most ..."


In [11]:
subset_df.head()

Unnamed: 0,id,summary,article,translated_summary,translated_article
778201,1a6d7e2a3a46ccb0134763068b9cc694ed5efb72243f95...,Uno de los cuatro asaltantes al templo metodis...,Ocho personas han muerto y otras 44 han result...,One of the four assailants at the Methodist te...,Eight people died and another 44 were injured ...
1441437,990b015fee4b21d566abd9913877c2e59b157064ec1654...,Más de 100.000 personas cumplen los criterios ...,El borrador del plan nacional contra la hepati...,"More than 100,000 people meet the criteria for...",The draft of the national plan against hepatit...
1330621,9d7400d837c79f953f61094c73ffa99cf2e81fcab6a6d2...,"En una serie de tuits, el creador de Tesla aug...","Elon Musk, el padre del Hyperloop, de los coch...","In a series of tweets, the creator of Tesla pr...","Elon Musk, the father of Hyperloop, Tesla auto..."
14390,86d5787b89b15b08e4c9b32b24f204a8c5b4143ab5c18c...,El líder de Podemos afirma que se nota otra re...,"Pablo Iglesias, líder de Podemos, ha emplazado...",The leader of Podemos states that there is a n...,"Pablo Iglesias, leader of Podemos, has called ..."
334827,e9ac2ce03b1a81039aadae9c8d4f9b6fc68174746294e4...,"Tom MacMaster, estadounidense de 40 años, escr...","Tom MacMaster, el estadounidense de 40 años qu...","Tom MacMaster, a 40-year-old American, was wri...","Tom MacMaster, the 40-year-old American who ha..."


In [12]:
es_client = elasticsearch.Elasticsearch(
    getpass.getpass("Host: "),
    api_key=getpass.getpass("API Key: "),
)

Host:  ········
API Key:  ········


In [13]:
index_name = "vs-post-data"

# Check if the index exists using keyword argument
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)
    print(f"Deleted existing index: {index_name}")

# Create a new index using keyword argument
es_client.indices.create(index=index_name)
print(f"Created new index: {index_name}")

Deleted existing index: vs-post-data
Created new index: vs-post-data


In [14]:
embedding = OpenAIEmbeddings(model="text-embedding-3-large")

In [15]:
elastic_vector_search = ElasticsearchStore(
    index_name="vs-post-data",
    es_connection=es_client,
    embedding=embedding,
)

In [16]:
translated_texts = subset_df["translated_article"].tolist()
combined_text = "\n".join(translated_texts)
text_splitter = CharacterTextSplitter(chunk_size=5000, chunk_overlap=50)
docs = text_splitter.split_documents([Document(page_content=combined_text)])

Created a chunk of size 7007, which is longer than the specified 5000


In [17]:
elastic_vector_search.add_documents(docs)

['612f958b-a830-4f39-8d25-2b408c1778cc',
 'e227a8b6-7e7b-491c-b93f-aca4291d3eda',
 '5fa946f1-1dd8-491a-a142-55e2cab21507',
 '2101e7be-8889-48e3-a32d-eecd8ed21659',
 '4f7a5fa8-4c0d-4976-ae63-28dc03e8a60d',
 'f6b2b0a5-355a-409f-8d85-16b839df790d',
 '574ae78e-617d-49b6-884b-23ba8676965d',
 'f181ac6d-9703-4691-b53f-ca24b0dfd1dd',
 '1949df0d-c438-4ff3-9710-daa0563f4d23',
 'cc704289-a1dc-461e-a052-72cfe56edb33',
 '91a84211-6481-4c50-8c01-bdea90c5be32',
 '6a5a2dac-051f-44ef-81e7-858f9732a3d9',
 '8cde9f07-40e1-417d-a585-c80688ac1d6a',
 '525fbb94-bb42-44f5-9d41-2c1c5b95d825',
 '6b434e53-f863-4862-a250-383ebd799446',
 '53ca83bf-71e2-4f22-9698-5575e92ad914',
 '697d193d-e8f8-4d21-9a57-66d1ee0942d6',
 'bc5fe419-914c-4919-a7bc-485e73240225',
 '6d9c7b6f-8cdc-494b-aaea-76ed0edeee3d',
 '55710340-557c-4d73-9ce8-a6641fbeceac',
 'dd5dd83f-d655-4a8a-aa9a-cba454b4bdf6',
 'dd1f74cd-b2a3-4567-9ac5-cbb8da578c2b']

In [18]:
query = "What happened in Spain?"

In [19]:
results = elastic_vector_search.similarity_search(query)
results

[Document(page_content='Post-October 1st, the sniper worked during November and until December 21st in the security support teams in an annex space of the CTTI that was set up due to the parliamentary election call. The Department of Digital Policies has requested an informative report today from Enerpro S.L. to gather more information on this matter. The man, who sought logistical support in a WhatsApp chat to commit the assassination, remains in prison by order of a court in Terrassa (Barcelona), in a case open for the crimes of conspiracy to commit attacks against the head of government and possession, storage of illegal arms, ammunition, and explosives, as well as another hate crime. Murillo was arrested on September 19th by the Mossos d\'Esquadra, a few days after receiving a complaint from a local Vox leader in Barcelona, who had received his WhatsApp messages stating that he was willing to "sacrifice" himself for Spain and wanted to kill Sánchez as revenge for his intention to e

In [20]:
k_results = elastic_vector_search.similarity_search(query, k=10)
k_results

[Document(page_content='Post-October 1st, the sniper worked during November and until December 21st in the security support teams in an annex space of the CTTI that was set up due to the parliamentary election call. The Department of Digital Policies has requested an informative report today from Enerpro S.L. to gather more information on this matter. The man, who sought logistical support in a WhatsApp chat to commit the assassination, remains in prison by order of a court in Terrassa (Barcelona), in a case open for the crimes of conspiracy to commit attacks against the head of government and possession, storage of illegal arms, ammunition, and explosives, as well as another hate crime. Murillo was arrested on September 19th by the Mossos d\'Esquadra, a few days after receiving a complaint from a local Vox leader in Barcelona, who had received his WhatsApp messages stating that he was willing to "sacrifice" himself for Spain and wanted to kill Sánchez as revenge for his intention to e