## **Data preparation for election RAG app**

This notebook was used to create the Chroma database used during the workshop. It was not part of the workshop itself; we include it for anyone interested in building their own database from different data. The source data are scraped Wikipedia articles stored in `./data/scrape-january-2024`.

In [None]:
import os
import re

import pandas as pd
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings
from loguru import logger
from openai import AzureOpenAI

**Functions**

In [None]:
def remove_edit_button(input: str) -> str:
    """Return *input* with every ``[edit <...>]`` marker deleted.

    A marker is the literal ``[edit <`` followed by at least one character
    that is not ``]`` and closed by ``>]``. Text without such markers is
    returned unchanged.
    """
    edit_marker = re.compile(r"\[edit <[^]]+>\]")
    return edit_marker.sub("", input)


def remove_cite_nodes(input: str) -> str:
    """Return *input* with citation markers of the form ``^[<digits>]`` removed.

    Bracketed numbers that are not preceded by a caret are left untouched.
    """
    citation_marker = re.compile(r"\^\[\d+\]")
    return citation_marker.sub("", input)


def remove_references(input: str) -> str:
    """Cut *input* off at the first occurrence of the word ``References``.

    Everything from the whitespace directly preceding the first
    ``References`` up to the end of the string is discarded. If the word
    does not occur, *input* is returned unchanged.
    """
    references_tail = re.compile(r"\s*References\s*([\s\S]*)")
    return references_tail.sub("", input)


def remove_wiki_links(input: str) -> str:
    """Return *input* with every ``<...>`` span (angle brackets included) removed.

    A span runs from a ``<`` to the next ``>``; a ``<`` that is never closed
    is left in place.
    """
    angle_span = re.compile(r"<[^>]*>")
    return angle_span.sub("", input)


def slice_data(input: str) -> pd.DataFrame:
    """Cut an article into blank-line separated chunks and label each one.

    The article is split on ``"\\n\\n"``. The returned frame keeps the chunks
    in their original order, one per row, with the columns ``type``,
    ``text`` and ``len`` (character count of the chunk). ``type`` is:

    * ``article_name`` for the very first chunk,
    * ``subchapter`` for a chunk starting with a newline plus six spaces,
    * ``chapter`` for a chunk starting with a newline plus four spaces,
    * ``text`` for everything else.
    """

    def classify(position: int, chunk: str) -> str:
        if position == 0:
            return "article_name"
        # The deeper indentation has to be tested first: a subchapter heading
        # also starts with the four-space chapter prefix.
        if chunk.startswith("\n      "):
            return "subchapter"
        if chunk.startswith("\n    "):
            return "chapter"
        return "text"

    records = [
        {"type": classify(position, chunk), "text": chunk, "len": len(chunk)}
        for position, chunk in enumerate(input.split("\n\n"))
    ]
    return pd.DataFrame(records)


def prepare_sections(df: pd.DataFrame, max_section_length: int = 1500) -> pd.DataFrame:
    """Group the ordered chunks of one article into labelled sections.

    Args:
        df: Frame with the columns ``type`` and ``text`` as produced by
            ``slice_data``: the first row holds the article name, the
            remaining rows are ``chapter``, ``subchapter`` or ``text`` chunks
            in reading order.
        max_section_length: A section is closed as soon as its accumulated
            text grows beyond this many characters.

    Returns:
        A frame with the columns ``article``, ``chapter``, ``subchapter``,
        ``text`` and ``length`` (characters in ``text``). Sections without
        any text are dropped; the result is empty (but keeps its columns)
        when the article has no text at all.
    """
    columns = ["article", "chapter", "subchapter", "text", "length"]

    article = df["text"].iloc[0]
    chapter = ""
    subchapter = ""
    buffer = ""
    sections = []

    def flush() -> None:
        """Close the section collected so far under the current headings."""
        nonlocal buffer
        body = buffer.lstrip()
        sections.append(
            {
                "article": article,
                "chapter": chapter,
                "subchapter": subchapter,
                "text": body,
                "length": len(body),
            }
        )
        buffer = ""

    for row in df.iloc[1:].itertuples():
        if row.type == "text":
            # Text chunks are appended as-is; the section is closed once it
            # has grown past the limit.
            buffer += row.text
            if len(buffer) > max_section_length:
                flush()
        elif row.type == "chapter":
            # The pending text belongs to the previous headings.
            flush()
            chapter = row.text.lstrip()
            # A new chapter starts without a subchapter; keeping the old
            # label would attribute the chapter intro to the wrong subchapter.
            subchapter = ""
        elif row.type == "subchapter":
            flush()
            subchapter = row.text.lstrip()

    # Text after the last heading that never exceeded the limit is still
    # pending here and must not be lost.
    flush()

    result = pd.DataFrame(sections, columns=columns)
    # Headings that follow each other directly produce empty sections.
    return result[result["length"] > 0]


def _nearest_cut(text: str, pattern: str, middle: int, window: int):
    """Return the cut index just after the first character of the match of
    *pattern* that lies closest to *middle* (at most *window* characters
    away), or ``None`` when no match is that close."""
    best = None
    for match in re.finditer(pattern, text):
        cut = match.start() + 1
        distance = abs(cut - middle)
        if distance <= window and (best is None or distance < abs(best - middle)):
            best = cut
    return best


def split_text_at_sentence_end(text: str, min_length: int = 2500, overlap: int = 200) -> list:
    """Split *text* into parts of at most *min_length* characters.

    Text that already fits is returned as a single-element list. Longer text
    is cut near its middle and both halves are split again the same way, so
    every returned part respects the limit. The cut position is chosen in
    this order of preference:

    1. right after the sentence-ending period closest to the middle,
    2. at the whitespace closest to the middle,
    3. exactly at the middle (text without any usable break).

    Args:
        text: The text to split.
        min_length: Maximum number of characters per returned part (the name
            is kept for backward compatibility).
        overlap: How far from the middle, in characters, a sentence end or
            whitespace may lie to be accepted as cut position. Despite its
            name, no text is duplicated between the parts.

    Returns:
        The non-empty, whitespace-stripped parts in reading order.
    """
    # max(..., 1) guarantees termination even for a non-positive limit: a
    # single character cannot be split any further.
    if len(text) <= max(min_length, 1):
        return [text]

    middle = len(text) // 2
    cut = _nearest_cut(text, r"\.\s+", middle, overlap)
    if cut is None:
        cut = _nearest_cut(text, r"\s+", middle, overlap)
    if cut is None:
        cut = middle

    parts = []
    for piece in (text[:cut].strip(), text[cut:].strip()):
        # Each piece is strictly shorter than *text*, so the recursion ends.
        if piece:
            parts.extend(split_text_at_sentence_end(piece, min_length, overlap))
    return parts


def split_long_sections(df: pd.DataFrame, max_length: int = 2500) -> pd.DataFrame:
    """Replace every over-long section by the parts it is split into.

    Args:
        df: Section frame with at least the columns ``text`` and ``length``.
        max_length: Rows whose ``length`` exceeds this value are split with
            ``split_text_at_sentence_end``; the same limit is handed to the
            splitter so both always agree.

    Returns:
        A new frame with the same columns as *df*. Rows within the limit are
        copied unchanged; each split part becomes its own row that inherits
        all other columns and gets ``text``/``length`` of the part. An empty
        input yields an empty frame that still carries the columns.
    """
    new_rows = []
    for record in df.to_dict("records"):
        if record["length"] > max_length:
            parts = split_text_at_sentence_end(record["text"], min_length=max_length)
            new_rows.extend(
                {**record, "text": part, "length": len(part)} for part in parts
            )
        else:
            new_rows.append(record)

    # Passing the columns explicitly keeps the schema when there are no rows.
    return pd.DataFrame(new_rows, columns=df.columns)


def data_pipeline(file_path: str) -> pd.DataFrame:
    """Turn one scraped article file into a frame of ready-to-embed sections.

    Args:
        file_path: Path of the text file holding a single scraped article.

    Returns:
        A frame with the columns ``article``, ``chapter``, ``subchapter``,
        ``text`` and ``length``. ``text`` is prefixed with the article,
        chapter and subchapter names; ``length`` is the character count of
        the section body before that prefix was added.
    """
    # STEP 1 - load txt file into str
    # The encoding is pinned so the result does not depend on the platform
    # default. NOTE(review): assumes the scraped files are UTF-8 - confirm.
    with open(file_path, mode="r", encoding="utf-8") as file:
        data_str = file.read()

    # STEP 2 - cleaning the data from ballast
    data_str = remove_edit_button(data_str)
    data_str = remove_cite_nodes(data_str)
    data_str = remove_references(data_str)
    data_str = remove_wiki_links(data_str)

    # STEP 3 - Slicing the data into dataframe
    df = slice_data(data_str)
    sections = prepare_sections(df)

    # STEP 4 - Final cleanup and check
    sections = split_long_sections(sections)
    # Every section carries its position in the article inside the text
    # itself, so the context is part of what gets embedded.
    sections["text"] = (
        "Article: "
        + sections["article"]
        + ", Chapter: "
        + sections["chapter"]
        + ", Subchapter: "
        + sections["subchapter"]
        + ", Text: "
        + sections["text"]
    )

    # Report the number of final sections, not the number of raw slices.
    logger.debug(f"Created {len(sections)} sections for {os.path.basename(file_path)}")
    return sections

**Runtime**

In [None]:
def list_files_in_folder(folder_path):
    """Recursively collect the paths of all files below *folder_path*.

    Each returned path is the walked directory joined with the file name.
    Directories themselves are not listed.
    """
    return [
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(folder_path)
        for file_name in file_names
    ]


paths = list_files_in_folder("./data/scrape-january-2024")

In [None]:
# Run the pipeline on every scraped file, then stack the per-article section
# frames into a single frame with a fresh, continuous index.
articles = [data_pipeline(path) for path in paths]
df = pd.concat(articles, ignore_index=True)

In [None]:
# Information: summary statistics of the combined dataset.
# NOTE: `length` is the character count of the section body; it was computed
# before data_pipeline prepended the "Article: ..., Chapter: ..." header to
# `text`, so the stored texts are longer than the maximum reported here.
logger.debug(f"Longest section have {df['length'].max()} characters")
logger.debug(f"Final datatset has {len(df)} sections.")

**Embeddings**

In [None]:
def _build_embedding_client():
    """Create an ``AzureOpenAI`` client from the ``EMBEDDING_*`` environment
    variables (endpoint, API key and API version)."""
    return AzureOpenAI(
        azure_endpoint=os.environ.get("EMBEDDING_ENDPOINT"),
        api_key=os.environ.get("EMBEDDING_API_KEY"),
        api_version=os.environ.get("EMBEDDING_API_VERSION"),
    )


def compute_embedding(text: str, client=None) -> list:
    """Return the embedding vector of *text*.

    Args:
        text: The text to embed.
        client: ``AzureOpenAI`` client to send the request with. When
            omitted, a new client is built from the environment for this
            single call; pass a shared client when embedding many texts.

    Returns:
        The embedding of the first (and only) input, as returned by the
        embeddings endpoint. The model/deployment name is read from the
        ``EMBEDDING_NAME`` environment variable.
    """
    if client is None:
        client = _build_embedding_client()
    response = client.embeddings.create(
        input=text, model=os.environ.get("EMBEDDING_NAME")
    )
    return response.data[0].embedding


# One client is shared by all rows instead of constructing a new one per text.
_embedding_client = _build_embedding_client()
df["embedding"] = df["text"].apply(compute_embedding, client=_embedding_client)

**Save results**

In [None]:
# Persist the processed sections, including the `embedding` column computed
# above, as a ';'-separated CSV file.
df.to_csv("./data/processed_wiki.csv", sep=";")

**VectorStore**

In [None]:
# Embedding function Chroma uses to embed the documents (and later queries).
# NOTE(review): the model name is hard-coded here while compute_embedding
# reads it from EMBEDDING_NAME - confirm both refer to the same deployment.
embedding_model = AzureOpenAIEmbeddings(
    api_key=os.environ.get("EMBEDDING_API_KEY"),
    azure_endpoint=os.environ.get("EMBEDDING_ENDPOINT"),
    api_version=os.environ.get("EMBEDDING_API_VERSION"),
    model="embeddings-ada",
)

# DataFrameLoader turns every column except the page content into document
# metadata. The precomputed `embedding` column holds a list per row, which is
# not a valid Chroma metadata value (only str/int/float/bool are accepted),
# and Chroma embeds the page content itself through `embedding_model`, so the
# column is left out here. `errors="ignore"` keeps the cell usable when the
# embedding cell was skipped.
loader = DataFrameLoader(
    data_frame=df.drop(columns=["embedding"], errors="ignore"),
    page_content_column="text",
)
documents = loader.load()

# NOTE(review): no `persist_directory` is passed, so the store presumably
# lives in memory only - pass one to `from_documents` to write it to disk.
db = Chroma.from_documents(documents, embedding_model)