# RFC obtaintion (validation and cleaning)
The goal is to read a csv file with teo columns ("RFC" and "RAZON"), then to validate RFCs, to clean column "RAZON" by applying Regex techniques and to add column "Persona" if the RFC corresponds to a "fisica" or "moral" person from one csv file. The cleaned DataFrame is saved as "NuevoRFC.csv".

In [5]:
# Import libraries
import pandas as pd
import re
from typing import Optional
import re

## Read non-processed csv file

In [None]:
csv_file = 'prueba.csv'
column_names = ['RFC','RAZON']
df = pd.read_csv(csv_file, sep=',', header=None, names=column_names)
df.head(5)

Unnamed: 0,RFC,RAZON
0,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C. DE"
1,&PI0405044W6,A&P INTERNATIONAL SERVICES S.A.P.I. DE C.V.
2,&PI0405044W6,A&P INTERNATIONAL SERVICE SAPI DE CV
3,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, A.C"
4,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, AC"
5,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A C"
6,,


In [72]:
initial_rows = df.shape[0]
print(f"El archivo tiene {initial_rows} filas.")

El archivo tiene 7 filas.


In [73]:
# Delete all rows with null values in some columns
df = df.dropna()

# Change column types
df[column_names] = df[column_names].astype('str')

## Validate RFC

In [74]:
# Regular expressions
RFC_FISICA_REGEX = re.compile(
    r"^[A-ZÑ&]{4}"
    r"\d{2}(0[1-9]|1[0-2])"
    r"(0[1-9]|[12]\d|3[01])"
    r"[A-Z0-9]{3}$"
)

RFC_MORAL_REGEX = re.compile(
    r"^[A-ZÑ&]{3}"
    r"\d{2}(0[1-9]|1[0-2])"
    r"(0[1-9]|[12]\d|3[01])"
    r"[A-Z0-9]{3}$"
)

def normalize_rfc(rfc: str) -> str:
    """Limpia espacios y convierte a mayúsculas"""
    return rfc.strip().strip(".").strip().upper()

# Normalize RFC
df['RFC'] = df['RFC'].apply(normalize_rfc)

In [75]:
def is_persona_fisica(rfc: str) -> bool:
    """Verifica si el RFC corresponde a persona física"""
    return bool(RFC_FISICA_REGEX.match(rfc))


def is_persona_moral(rfc: str) -> bool:
    """Verifica si el RFC corresponde a persona moral"""
    return bool(RFC_MORAL_REGEX.match(rfc))


def get_rfc_type(rfc: str) -> Optional[str]:
    """Validate RFC"""

    if is_persona_fisica(rfc):
        return "FISICA"
    if is_persona_moral(rfc):
        return "MORAL"

    return None

# Validate RFC
df['PERSONA'] = df['RFC'].apply(get_rfc_type)

# Filter
df = df[(df['PERSONA'] == 'FISICA') | (df['PERSONA'] == 'MORAL')]

df.sample(5)

Unnamed: 0,RFC,RAZON,PERSONA
4,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, AC",MORAL
0,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C. DE",MORAL
5,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A C",MORAL
2,&PI0405044W6,A&P INTERNATIONAL SERVICE SAPI DE CV,MORAL
1,&PI0405044W6,A&P INTERNATIONAL SERVICES S.A.P.I. DE C.V.,MORAL


In [76]:
final_rows = df.shape[0]
print(f"El archivo tiene {final_rows} RFCs válidos.")

El archivo tiene 6 RFCs válidos.


## Column "RAZON" preprocessing

In [77]:
def normalize_text(text: str, pattern: str, new_value: str) -> str:
    """Limpia puntuación perimetral, espacios y estandariza 'S.A.'"""
    # 1. Limpieza inicial: quitar signos de puntuación y espacios en los extremos
    text = str(text).strip(".,; ").upper()
    text = re.sub(pattern, new_value, text)
    return re.sub(r'\s+', ' ', text).strip()

# Patterns like S.A., C.V.
norm_rules = {
    r'\bS\.?\s?A\.? ': "SA ",
    r'\bS\.?\s?A\.?\b': "SA",
    r'\bC\.?\s?V\.?\b': "CV",
    r'\bS\.?\s?C\.?\b': "SC",
    r'\bS\.?\s?A\.?P\.?\s?I\.?\s?\b': "SAPI ",
    r'\bA\.?\s?C\.? ': "AC ",
    r'\bA\.?\s?C\.?\b': "AC",
}

# Normalize "RAZON"
df['NOMBRE'] = df[column_names[1]]

for pattern, replacement in norm_rules.items():
    df['NOMBRE'] = df['NOMBRE'].apply(
        normalize_text, 
        pattern=pattern, 
        new_value=replacement
    )

df.sample(5)

Unnamed: 0,RFC,RAZON,PERSONA,NOMBRE
0,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C. DE",MORAL,"MC GE MULTISISTE DE CTROL Y GEN ELEC, AC DE"
5,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A C",MORAL,"MC GE MULTISISTE DE CTROL Y GEN ELEC, AC"
1,&PI0405044W6,A&P INTERNATIONAL SERVICES S.A.P.I. DE C.V.,MORAL,A&P INTERNATIONAL SERVICES SAPI DE CV
4,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, AC",MORAL,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, AC"
2,&PI0405044W6,A&P INTERNATIONAL SERVICE SAPI DE CV,MORAL,A&P INTERNATIONAL SERVICE SAPI DE CV


## Sort values

In [78]:
# Sort by 'RFC'
df = df.sort_values(by="RFC", ascending=True).reset_index(drop=True)

df.head(10)

Unnamed: 0,RFC,RAZON,PERSONA,NOMBRE
0,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C. DE",MORAL,"MC GE MULTISISTE DE CTROL Y GEN ELEC, AC DE"
1,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, A.C",MORAL,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, AC"
2,&GM060804SN9,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, AC",MORAL,"MC GE MULTISISTE DE CTROLS Y GEN ELEC, AC"
3,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A C",MORAL,"MC GE MULTISISTE DE CTROL Y GEN ELEC, AC"
4,&PI0405044W6,A&P INTERNATIONAL SERVICES S.A.P.I. DE C.V.,MORAL,A&P INTERNATIONAL SERVICES SAPI DE CV
5,&PI0405044W6,A&P INTERNATIONAL SERVICE SAPI DE CV,MORAL,A&P INTERNATIONAL SERVICE SAPI DE CV


In [79]:
# Reorder columns
dataFrame = df[['RFC','RAZON', 'NOMBRE', 'PERSONA']]

# Save DataFrame
file_name = f"NuevosRFC_{csv_file.split(".")[0]}"
dataFrame.to_csv(f'{file_name}.csv', encoding='utf-8', index=False)

## Spacy Models (Recommended for Spanish)
Download the corresponding model from "https://github.com/explosion/spacy-models/releases/tag/es_core_news_sm-3.8.0" ("es_core_news_sm-3.8.0-py3-none-any.whl" file) manually. After that, run:
```
%pip install "D:\caarteaga\Downloads\es_core_news_sm-3.8.0-py3-none-any.whl"
```
Or run:
```
python -m spacy download es_core_news_sm
```


In [None]:
%pip install "D:\caarteaga\Downloads\es_core_news_sm-3.8.0-py3-none-any.whl"

In [None]:
import spacy # Recommended for Spanish
sm_model_path = ""

In [None]:
text = 'La asocioacion civil mexicana de aviacion sa de cv'
# 1. Cargas el modelo en español
nlp = spacy.load("es_core_news_sm")

# 2. Procesas el texto
doc = nlp(text.lower())

# 3. Tokenización y Lematización automática
tokens = [token.text for token in doc]
lemmas = [token.lemma_ for token in doc]

print(f"Tokens: {tokens}")
print(f"Lemmas: {lemmas}")

OSError: [E050] Can't find model 'es_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

Processing d:\caarteaga\downloads\es_core_news_sm-3.8.0-py3-none-any.whl
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.8.0
Note: you may need to restart the kernel to use updated packages.


In [None]:


nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize(text):
    doc = nlp(text.lower())
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.