# RFC processing (validation and cleaning)
The goal is to read a CSV file with two columns ("RFC" and "RAZON"), validate the RFCs, clean the "RAZON" column using regular expressions, and add a "PERSONA" column indicating whether each RFC corresponds to a "FISICA" or "MORAL" person. The cleaned DataFrame is saved as "NuevosRFC_<input file name>.csv".

In [None]:
# Import libraries
import pandas as pd
import re
from typing import Optional
import re
import spacy # Recommended for Spanish
from rapidfuzz import fuzz

## Read non-processed csv file

In [54]:
# Input file and the labels assigned to its two columns.
csv_file = 'prueba.csv'
column_names = ['RFC', 'RAZON']

# header=None: the first line of the file is read as data, not as column names.
df = pd.read_csv(csv_file, header=None, names=column_names, sep=',')
df.head(5)

Unnamed: 0,RFC,RAZON
0,AAA08091161A,"APOYANDO A ANGELITOS CON AUTISMO, A. C."
1,AAA081021QD9,APOLINAR ASOCIADOS CONSULTORIA Y SERVICIOS SC
2,AAA090601GY4,DE ALBA & ASOCIADOS FIRMA LEGAL S.C.
3,AAA090924HJ4,"ARGUELLES, ALVAREZ & ASOCIADOS SA DE CV"
4,AAA1002249W6,ADAIR ALONSO ARQUITECTOS SA DE CV


In [55]:
# Number of rows read from the file, before any cleaning.
initial_rows = len(df)
print(f"El archivo tiene {initial_rows} filas.")

El archivo tiene 21 filas.


In [56]:
# Discard every row that has a missing value in any column
df = df.dropna(how='any')

# Store both input columns as strings
df[column_names] = df[column_names].astype(str)

## Validate RFC

In [57]:
# Regular expressions
# Tail shared by both RFC layouts: two digits, a month (01-12), a day (01-31)
# and three trailing alphanumeric characters.
_RFC_DATE_AND_SUFFIX = (
    r"\d{2}(0[1-9]|1[0-2])"
    r"(0[1-9]|[12]\d|3[01])"
    r"[A-Z0-9]{3}$"
)

# Four leading characters (letters, Ñ or &) followed by the shared tail.
RFC_FISICA_REGEX = re.compile(r"^[A-ZÑ&]{4}" + _RFC_DATE_AND_SUFFIX)

# Three leading characters (letters, Ñ or &) followed by the shared tail.
RFC_MORAL_REGEX = re.compile(r"^[A-ZÑ&]{3}" + _RFC_DATE_AND_SUFFIX)

# Any run of whitespace and/or dots at the very start or very end of the value.
_RFC_EDGE_NOISE = re.compile(r"^[\s.]+|[\s.]+\Z")


def normalize_rfc(rfc: str) -> str:
    """Trim surrounding whitespace and dots from an RFC and upper-case it.

    Whitespace and dots are removed from both ends in any combination
    (e.g. ``"abc. ."`` -> ``"ABC"``); characters inside the value are left
    untouched.

    Args:
        rfc: Raw RFC string as read from the input file.

    Returns:
        The trimmed, upper-cased RFC.
    """
    return _RFC_EDGE_NOISE.sub("", rfc).upper()

# Clean every RFC value in place
df['RFC'] = df['RFC'].map(normalize_rfc)

In [58]:
def is_persona_fisica(rfc: str) -> bool:
    """Return True when ``rfc`` matches ``RFC_FISICA_REGEX`` (persona física)."""
    found = RFC_FISICA_REGEX.match(rfc)
    return found is not None


def is_persona_moral(rfc: str) -> bool:
    """Return True when ``rfc`` matches ``RFC_MORAL_REGEX`` (persona moral)."""
    found = RFC_MORAL_REGEX.match(rfc)
    return found is not None


def get_rfc_type(rfc: str) -> Optional[str]:
    """Classify an RFC as "FISICA" or "MORAL".

    The persona física check runs first, then the persona moral check.

    Returns:
        "FISICA" or "MORAL" for the first check that passes, or None when
        neither does.
    """
    checks = (
        ("FISICA", is_persona_fisica),
        ("MORAL", is_persona_moral),
    )
    for label, matches in checks:
        if matches(rfc):
            return label
    return None

# Classify every RFC; rows that match neither layout get None
df['PERSONA'] = df['RFC'].apply(get_rfc_type)

# Keep only the rows with a recognised RFC. The explicit copy makes the
# result an independent DataFrame, so the columns added in later cells are
# not written to a slice of the unfiltered frame.
df = df[df['PERSONA'].isin(['FISICA', 'MORAL'])].copy()

# Preview up to 5 random rows; sample() raises if asked for more rows than exist
df.sample(min(5, len(df)))

Unnamed: 0,RFC,RAZON,PERSONA
20,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A C",MORAL
4,AAA1002249W6,ADAIR ALONSO ARQUITECTOS SA DE CV,MORAL
15,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C. DE",MORAL
0,AAA08091161A,"APOYANDO A ANGELITOS CON AUTISMO, A. C.",MORAL
18,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, A.C",MORAL


In [59]:
# Number of rows left after discarding invalid RFCs.
final_rows = len(df)
print(f"El archivo tiene {final_rows} RFCs válidos.")

El archivo tiene 21 RFCs válidos.


## Column "RAZON" preprocessing

In [60]:
def normalize_text(text: str, pattern: str, new_value: str) -> str:
    """Clean a company name and apply one regex substitution to it.

    The value is converted to ``str``, stripped of leading/trailing dots,
    commas, semicolons and spaces, has all remaining commas and semicolons
    removed, and is upper-cased. Every match of ``pattern`` is then replaced
    by ``new_value`` and runs of whitespace are collapsed to a single space.

    Args:
        text: Raw value (any object; it is passed through ``str``).
        pattern: Regular expression to search for in the cleaned text.
        new_value: Replacement for each match of ``pattern``.

    Returns:
        The cleaned, upper-cased text with the substitution applied.
    """
    text = str(text).strip(".,; ").replace(",", "").replace(";", "").upper()
    text = re.sub(pattern, new_value, text)
    return re.sub(r'\s+', ' ', text).strip()

# Patterns like S.A., C.V.
# Each abbreviation ends with `(?:\.|\b)`: the last letter is followed either
# by a dot (which is consumed) or by a word boundary. This removes the dot in
# mid-name forms such as "S.C. DE R.L." instead of leaving "SC. DE R.L".
# The rules are applied in insertion order, so the longer "S.A.P.I." form is
# handled before the shorter "S.A." rule.
norm_rules = {
    r'\bS\.?\s?A\.?P\.?\s?I(?:\.|\b)\s?': "SAPI ",
    r'\bS\.?\s?A(?:\.|\b)': "SA",
    r'\bC\.?\s?V(?:\.|\b)': "CV",
    r'\bS\.?\s?C(?:\.|\b)': "SC",
    r'\bA\.?\s?C(?:\.|\b)': "AC",
}

# Build "NOMBRE" from the "RAZON" column by applying every rule in turn
nombre = df[column_names[1]]

for rule, substitute in norm_rules.items():
    nombre = nombre.apply(normalize_text, pattern=rule, new_value=substitute)

df['NOMBRE'] = nombre

df.sample(5)

Unnamed: 0,RFC,RAZON,PERSONA,NOMBRE
14,AAA110819P81,"ASESORIA ADUANAL Y ACONDICIONADOS, S.C.",MORAL,ASESORIA ADUANAL Y ACONDICIONADOS SC
19,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, AC",MORAL,MC GE MULTISISTE DE CTROLS Y GEN ELEC AC
0,AAA08091161A,"APOYANDO A ANGELITOS CON AUTISMO, A. C.",MORAL,APOYANDO A ANGELITOS CON AUTISMO AC
10,AAA110222IY8,ACA ABOGADOS & CONTADORES ASOCIADOS S C,MORAL,ACA ABOGADOS & CONTADORES ASOCIADOS SC
11,AAA110331TX0,AGRO APICOLA EL ABUELO SPR DE RL,MORAL,AGRO APICOLA EL ABUELO SPR DE RL


In [61]:
# Loaded spaCy pipeline, created on first use and then reused by every call.
_SPACY_ES_MODEL = None


def _get_spanish_model():
    """Return the "es_core_news_sm" pipeline, loading it only once."""
    global _SPACY_ES_MODEL
    if _SPACY_ES_MODEL is None:
        _SPACY_ES_MODEL = spacy.load("es_core_news_sm")
    return _SPACY_ES_MODEL


def lemmatize_text(text: str) -> str:
    """Tokenize and lemmatize ``text`` with the Spanish spaCy model.

    The text is lower-cased before processing; the lemma of every token is
    joined with single spaces and the result is upper-cased.

    Args:
        text: Text to lemmatize.

    Returns:
        The upper-cased, space-separated lemmas.
    """
    # Reuse the cached pipeline: loading the model on every call would repeat
    # the load once per row when this function is applied to a column.
    nlp = _get_spanish_model()

    # Process text
    doc = nlp(text.lower())

    # Lemmatization
    return ' '.join(token.lemma_ for token in doc).upper()

In [62]:
# Lemmatization
# Lemmatize every cleaned name
df['LEMMA_SPA'] = df['NOMBRE'].map(lemmatize_text)

# Compare the cleaned name with its lemmatized form
preview_columns = ['NOMBRE', 'LEMMA_SPA']
df[preview_columns].head(5)

Unnamed: 0,NOMBRE,LEMMA_SPA
0,APOYANDO A ANGELITOS CON AUTISMO AC,APOYAR A ANGELITO CON AUTISMO AC
1,APOLINAR ASOCIADOS CONSULTORIA Y SERVICIOS SC,APOLINAR ASOCIADO CONSULTORIO Y SERVICIO SC
2,DE ALBA & ASOCIADOS FIRMA LEGAL SC,DE ALBA & ASOCIADOS FIRMA LEGAL SC
3,ARGUELLES ALVAREZ & ASOCIADOS SA DE CV,ARGUELL ALVAREZ & ASOCIADOS SA DE CV
4,ADAIR ALONSO ARQUITECTOS SA DE CV,ADAIR ALONSO ARQUITECTOS SA DE CV


## Sort values

In [63]:
# Sort by 'RFC'
# Order rows by "RFC" (ascending) and renumber the index from 0
df = df.sort_values("RFC").reset_index(drop=True)

df.head(10)

Unnamed: 0,RFC,RAZON,PERSONA,NOMBRE,LEMMA_SPA
0,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A C",MORAL,MC GE MULTISISTE DE CTROL Y GEN ELEC AC,MC GE MULTISISTE DE CTROL Y GEN ELEC AC
1,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, A.C",MORAL,MC GE MULTISISTE DE CTROLS Y GEN ELEC AC,MC GE MULTISISTE DE CTROLS Y GEN ELEC AC
2,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C. DE",MORAL,MC GE MULTISISTE DE CTROL Y GEN ELEC AC DE,MC GE MULTISISTE DE CTROL Y GEN ELEC AC DE
3,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, AC",MORAL,MC GE MULTISISTE DE CTROLS Y GEN ELEC AC,MC GE MULTISISTE DE CTROLS Y GEN ELEC AC
4,&PI0405044W6,A&P INTERNATIONAL SERVICE SAPI DE CV,MORAL,A&P INTERNATIONAL SERVICE SAPI DE CV,A&P INTERNATIONAL SERVICE SAPI DE CV
5,&PI0405044W6,A&P INTERNATIONAL SERVICES S.A.P.I. DE C.V.,MORAL,A&P INTERNATIONAL SERVICES SAPI DE CV,A&P INTERNATIONAL SERVIZ SAPI DE CV
6,AAA08091161A,"APOYANDO A ANGELITOS CON AUTISMO, A. C.",MORAL,APOYANDO A ANGELITOS CON AUTISMO AC,APOYAR A ANGELITO CON AUTISMO AC
7,AAA081021QD9,APOLINAR ASOCIADOS CONSULTORIA Y SERVICIOS SC,MORAL,APOLINAR ASOCIADOS CONSULTORIA Y SERVICIOS SC,APOLINAR ASOCIADO CONSULTORIO Y SERVICIO SC
8,AAA090601GY4,DE ALBA & ASOCIADOS FIRMA LEGAL S.C.,MORAL,DE ALBA & ASOCIADOS FIRMA LEGAL SC,DE ALBA & ASOCIADOS FIRMA LEGAL SC
9,AAA090924HJ4,"ARGUELLES, ALVAREZ & ASOCIADOS SA DE CV",MORAL,ARGUELLES ALVAREZ & ASOCIADOS SA DE CV,ARGUELL ALVAREZ & ASOCIADOS SA DE CV


In [64]:
# Reorder columns
from pathlib import Path

# Reorder columns
dataFrame = df[['RFC', 'RAZON', 'NOMBRE', 'LEMMA_SPA', 'PERSONA']]

# Save DataFrame
# Path(...).stem gives the input file name without directory or extension.
# It also avoids nesting double quotes inside a double-quoted f-string, which
# is a syntax error before Python 3.12.
file_name = f"NuevosRFC_{Path(csv_file).stem}"
dataFrame.to_csv(f'{file_name}.csv', encoding='utf-8', index=False)

## Spacy Models (Recommended for Spanish)
Download the spaCy model manually from "https://github.com/explosion/spacy-models/releases/tag/es_core_news_sm-3.8.0" (the "es_core_news_sm-3.8.0-py3-none-any.whl" file) and place it in this project's directory. After that, run:
```
%pip install "es_core_news_sm-3.8.0-py3-none-any.whl"
```
Or run:
```
python -m spacy download es_core_news_sm
```


In [65]:
# %pip install "es_core_news_sm-3.8.0-py3-none-any.whl"

In [66]:
import spacy # Recommended for Spanish

In [67]:
text = 'Las asociaciones civil mexicana DE ALBA & ASOCIADOS FIRMA'

# 1. Load the Spanish model
nlp = spacy.load("es_core_news_sm")

# 2. Run the lower-cased text through the pipeline
doc = nlp(text.lower())

# 3. Collect the surface form and the lemma of every token
tokens = [word.text for word in doc]
lemmas = [word.lemma_ for word in doc]

# Lemmas joined back into a single string
limpio = ' '.join(lemmas)

print(f"Tokens: {tokens}")
print(f"Lemmas: {lemmas}")
print(limpio)

Tokens: ['las', 'asociaciones', 'civil', 'mexicana', 'de', 'alba', '&', 'asociados', 'firma']
Lemmas: ['el', 'asociación', 'civil', 'mexicano', 'de', 'alba', '&', 'asociados', 'firma']
el asociación civil mexicano de alba & asociados firma
