# 🌾 Data Ingestion and Cleaning

#### 📚 Libraries
Import libraries and configure the environment.

In [102]:
# Misc
from omegaconf import OmegaConf
from tqdm import tqdm

tqdm.pandas()

# NLP
import re
import spacy
from unidecode import unidecode

# Data
import pandas as pd
from datasets import Dataset
from datasets import load_dataset

In [50]:
! python -m spacy download es_core_news_lg

Collecting es-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.7.0/es_core_news_lg-3.7.0-py3-none-any.whl (568.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.0/568.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: es-core-news-lg
Successfully installed es-core-news-lg-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_lg')


In [90]:
nlp = spacy.load("es_core_news_lg", disable=["parser", "ner"])

#### 📂 Data
Load the data and take a look at the first few rows.

In [8]:
dataset = load_dataset("Santp98/Secop2_documents")
dataset

Downloading readme:   0%|          | 0.00/640 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 140M/140M [00:11<00:00, 11.7MB/s] 
Downloading data: 100%|██████████| 46.6M/46.6M [00:03<00:00, 12.5MB/s]
Downloading data: 100%|██████████| 46.3M/46.3M [00:03<00:00, 12.5MB/s]


Generating train split:   0%|          | 0/13460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4487 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4487 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id_doc', 'doc_text'],
        num_rows: 13460
    })
    validation: Dataset({
        features: ['id_doc', 'doc_text'],
        num_rows: 4487
    })
    test: Dataset({
        features: ['id_doc', 'doc_text'],
        num_rows: 4487
    })
})

In [20]:
train_df = dataset["train"].to_pandas()
val_df = dataset["validation"].to_pandas()
test_df = dataset["test"].to_pandas()

secop_df = pd.concat([train_df, val_df, test_df]).reset_index(drop=True)

In [21]:
print("Shape:", secop_df.shape)
secop_df.head()

Shape: (22434, 2)


Unnamed: 0,id_doc,doc_text
0,266671326,SOLICITUD CERTIFICACIÓN DE \nINSUFICIENCIA ...
1,267836089,Usuario Solicitante:\nUnidad ó Subunidad \nEje...
2,321522708,ADENDA Página 1 \n \n ADENDA No. 1 \n \nPe...
3,302712756,HOSPITAL SAN JUAN BAUTISTA \nSEDE HOSP. SAN JU...
4,291869951,\n \n \n \nSISTEMA ESTRATÉGICO DE TRANSPORTE...


#### 🧼 Data Cleaning
Clean the data by removing rows with low quality text.


In [85]:
def clean_text(text: str, normalize: bool = False) -> str:
    """General text cleaning and normalization function."""
    text = text.replace("\n", " ").replace("\r", " ")
    text = re.sub(r" +", " ", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = text.lower()
    if normalize:
        text = unidecode(text)
    return text

In [86]:
def is_high_quality(
    doc: str,
    min_len: int = 200,
    max_len: int = 50000,
    max_special_char_ratio: float = 0.05,
    max_oov_ratio: float = 0.10,
) -> str:
    """Check if a document is of high quality."""
    # Check length criteria
    if len(doc) < min_len:
        return f"Too short: {len(doc)} characters"
    if len(doc) > max_len:
        return f"Too long: {len(doc)} characters"

    # Check special characters
    special_chars = re.findall(r"[^\w\s]", doc)
    special_char_ratio = len(special_chars) / len(doc)
    if special_char_ratio > max_special_char_ratio:
        return f"Too many special characters: {special_char_ratio:.1%}"

    # Basic check for non-dictionary words using spaCy
    doc_nlp = nlp(clean_text(doc))
    oov_words = [token.text for token in doc_nlp if token.is_oov]
    all_words = [token.text for token in doc_nlp]
    oov_ratio = len(oov_words) / len(all_words)
    if oov_ratio > max_oov_ratio:
        return f"Too many out of vocabulary words: {oov_ratio:.1%}"

    return f"High quality: {oov_ratio:.1%}"

In [87]:
cleaning_params = dict(
    min_len=200, max_len=50000, max_special_char_ratio=0.05, max_oov_ratio=0.10
)
secop_df.head()["doc_text"].apply(is_high_quality, **cleaning_params)

0                         High quality: 3.0%
1    Too many out of vocabulary words: 11.4%
2                         High quality: 9.4%
3    Too many out of vocabulary words: 14.0%
4                         High quality: 4.1%
Name: doc_text, dtype: object

In [91]:
secop_df["quality"] = secop_df["doc_text"].progress_apply(
    is_high_quality, **cleaning_params
)

100%|██████████| 22434/22434 [19:45<00:00, 18.93it/s]


In [92]:
quality_mask = secop_df["quality"].str.startswith("High quality")
secop_clean_df = secop_df.loc[quality_mask].reset_index(drop=True)

In [94]:
secop_clean_df

Unnamed: 0,id_doc,doc_text,quality
0,266671326,SOLICITUD CERTIFICACIÓN DE \nINSUFICIENCIA ...,High quality: 3.0%
1,321522708,ADENDA Página 1 \n \n ADENDA No. 1 \n \nPe...,High quality: 9.4%
2,291869951,\n \n \n \nSISTEMA ESTRATÉGICO DE TRANSPORTE...,High quality: 4.1%
3,291901564,CERTIFICACION DE INSUFICIENCIA\nVIGENTE\nDESDE...,High quality: 5.7%
4,304566990,ANE XO Nro. 2 \nOBLIGACIONES DE LA POLICÍA NAC...,High quality: 2.8%
...,...,...,...
10929,298317779,CONTRATO DE PRESTACIÓN DE SERVICIOS DE APOYO A...,High quality: 3.3%
10930,336194098,"Turbaco Bolívar, Dieciocho (18) de Agosto de 2...",High quality: 2.7%
10931,320617007,UNIVERSIDAD MILITAR NUEVA GRANADA \n \nFORMATO...,High quality: 5.6%
10932,300752050,\n \n \n \nLA CONTRALORÍA DELEGADA PARA RESPO...,High quality: 6.3%


In [96]:
clean_dataset = Dataset.from_pandas(secop_clean_df[["id_doc", "doc_text"]])

In [101]:
config = OmegaConf.load("conf/local.yml")
clean_dataset.push_to_hub("dewithsan/secop_corpus_clean", token=config["hf_key"])

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dewithsan/secop_corpus_clean/commit/7ae8c3ae2d0d5abcabdb889ef3ce338e9ab68db7', commit_message='Upload dataset', commit_description='', oid='7ae8c3ae2d0d5abcabdb889ef3ce338e9ab68db7', pr_url=None, pr_revision=None, pr_num=None)

In [93]:
secop_clean_df.to_csv("data/secop_corpus.csv", index=False)