# RFC processing (validation and cleaning)
The goal is to validate the RFCs contained in one CSV file, to clean the "Razón social" column with regular expressions (the cleaned text is stored in a new "Nombre" column), and to add a "Persona" column that tells whether each RFC corresponds to a "FISICA" or a "MORAL" person. The cleaned DataFrame is saved as "NuevosRFC_<input file name>.csv".

In [252]:
# Import libraries
import pandas as pd
import re
from typing import Optional
import re

## Read non-processed csv file

In [253]:
# Input file. header=None makes pandas treat the first line as data, so the
# two column names are supplied explicitly through `names`.
csv_file = 'prueba.csv'
df = pd.read_csv(
    csv_file,
    sep=',',
    header=None,
    names=['RFC', 'Razón social'],
)
# Show the first rows as a quick visual check
df.head(10)

Unnamed: 0,RFC,Razón social
0,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C. DE"
1,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C."
2,&PI0405044W6,A&P INTERNATIONAL SERVICE SAPI DE CV
3,&PI0405044W6,A&P INTERNATIONAL SERVICES S.A.P.I. DE C.V.
4,&RD021121TV4,A & R DOORS & WINDOWS SA DE CV
5,A&A010802V67,"Arciniega & Asociados, S.C."
6,A&A970821TZ5,AD & ASOCIADOS SA DE CV SA DE CV
7,A&B0609046F3,A + A & B TRANSPORTES SA DE CV
8,A&C031205B59,AN S & CAR S.A. DE C.V.


In [254]:
# Report how many rows were loaded from the input file
initial_rows = len(df)
print(f"El archivo tiene {initial_rows} filas.")

El archivo tiene 9 filas.


In [255]:
# Drop every row that has a missing value in any column
df = df.dropna()

# Cast both columns to str
df = df.astype({'RFC': 'str', 'Razón social': 'str'})

## Validate RFC

In [256]:
# Regular expressions
# Persona física format: 13 characters in total.
RFC_FISICA_REGEX = re.compile(
    r"^[A-ZÑ&]{4}"            # four characters, each one of A-Z, Ñ or &
    r"\d{2}(0[1-9]|1[0-2])"   # two digits (presumably the year) + month 01-12
    r"(0[1-9]|[12]\d|3[01])"  # day 01-31 (not checked against the month)
    r"[A-Z0-9]{3}$"           # three trailing upper-case letters or digits
)

# Persona moral format: same layout with a three-character prefix,
# 12 characters in total.
RFC_MORAL_REGEX = re.compile(
    r"^[A-ZÑ&]{3}"            # three characters, each one of A-Z, Ñ or &
    r"\d{2}(0[1-9]|1[0-2])"   # two digits (presumably the year) + month 01-12
    r"(0[1-9]|[12]\d|3[01])"  # day 01-31 (not checked against the month)
    r"[A-Z0-9]{3}$"           # three trailing upper-case letters or digits
)

def normalize_rfc(rfc: str) -> str:
    """Return *rfc* trimmed and upper-cased.

    Surrounding whitespace is removed, then leading/trailing periods, then
    any whitespace exposed by removing those periods; the result is
    converted to upper case.
    """
    trimmed = rfc.strip()
    without_dots = trimmed.strip(".")
    return without_dots.strip().upper()

# Clean every RFC value element-wise with normalize_rfc
df['RFC'] = df['RFC'].map(normalize_rfc)

In [257]:
def is_persona_fisica(rfc: str) -> bool:
    """Return True when *rfc* has the persona física RFC format.

    ``fullmatch`` is used so the whole string has to fit the pattern. With
    ``match`` the pattern's trailing ``$`` also succeeds just before a final
    newline, so a value such as ``"GODE561231GR8\\n"`` would be accepted.
    """
    return RFC_FISICA_REGEX.fullmatch(rfc) is not None


def is_persona_moral(rfc: str) -> bool:
    """Return True when *rfc* has the persona moral RFC format.

    ``fullmatch`` is used so the whole string has to fit the pattern. With
    ``match`` the pattern's trailing ``$`` also succeeds just before a final
    newline, so a value such as ``"A&A010802V67\\n"`` would be accepted.
    """
    return RFC_MORAL_REGEX.fullmatch(rfc) is not None


def get_rfc_type(rfc: str) -> Optional[str]:
    """Classify *rfc* by format.

    Returns "FISICA" when is_persona_fisica accepts the value, otherwise
    "MORAL" when is_persona_moral accepts it, otherwise None.
    """
    # Checked in order; the first predicate that accepts the value wins.
    checks = (
        ("FISICA", is_persona_fisica),
        ("MORAL", is_persona_moral),
    )
    for label, predicate in checks:
        if predicate(rfc):
            return label
    return None

# Classify every RFC: get_rfc_type returns "FISICA", "MORAL" or None
df['Persona'] = df['RFC'].apply(get_rfc_type)

# Keep only the rows whose RFC was classified. The explicit copy makes the
# result an independent DataFrame, so adding columns to it later does not
# raise SettingWithCopyWarning.
df = df[df['Persona'].isin(['FISICA', 'MORAL'])].copy()

# Preview up to 5 random rows; asking for more rows than the frame holds
# raises ValueError, so the sample size is capped at the frame length.
df.sample(min(5, len(df)))

Unnamed: 0,RFC,Razón social,Persona
2,&PI0405044W6,A&P INTERNATIONAL SERVICE SAPI DE CV,MORAL
4,&RD021121TV4,A & R DOORS & WINDOWS SA DE CV,MORAL
7,A&B0609046F3,A + A & B TRANSPORTES SA DE CV,MORAL
8,A&C031205B59,AN S & CAR S.A. DE C.V.,MORAL
6,A&A970821TZ5,AD & ASOCIADOS SA DE CV SA DE CV,MORAL


In [258]:
# Report how many rows are left after the RFC format filter
final_rows = len(df)
print(f"El archivo tiene {final_rows} RFCs válidos.")

El archivo tiene 9 RFCs válidos.


## Column "Razón social" preprocessing

In [259]:
def normalize_text(text: str, pattern: str, new_value: str) -> str:
    """Clean *text* and replace every match of *pattern* with *new_value*.

    The value is converted to str, stripped of leading/trailing periods,
    commas, semicolons and spaces, and upper-cased before the substitution.
    Afterwards each run of whitespace is collapsed to a single space and the
    result is trimmed.
    """
    # Edge punctuation is removed first, so the pattern sees e.g. "C.V"
    # rather than "C.V." at the end of the text.
    trimmed = str(text).strip(".,; ")
    substituted = re.sub(pattern, new_value, trimmed.upper())
    collapsed = re.sub(r'\s+', ' ', substituted)
    return collapsed.strip()

# Abbreviation rules (pattern -> canonical form) for suffixes such as
# S.A., C.V., S.C., A.C. and S.A.P.I., written with or without periods and
# with an optional space between the letters.
#
# Each pattern places \b right after the last letter and only then consumes
# an optional period. That way the final period is removed even when more
# text follows ("C.V. Y" -> "CV Y"), and no separate "trailing space"
# variants of the rules are needed.
#
# Order matters (dicts keep insertion order): SAPI goes first so that the
# shorter SA rule cannot consume the beginning of "S.A.P.I.".
norm_rules = {
    r'\bS\.?\s?A\.?\s?P\.?\s?I\b\.?': "SAPI",
    r'\bS\.?\s?A\b\.?': "SA",
    r'\bC\.?\s?V\b\.?': "CV",
    r'\bS\.?\s?C\b\.?': "SC",
    r'\bA\.?\s?C\b\.?': "AC",
}

# Build the cleaned "Nombre" column from "Razón social". assign() returns a
# new DataFrame, so the column is not written into a filtered slice of an
# earlier frame (which can raise SettingWithCopyWarning).
df = df.assign(Nombre=df['Razón social'])

# Apply the abbreviation rules one at a time, in dictionary order. Every
# pass goes through normalize_text, which also trims edge punctuation,
# upper-cases the text and collapses repeated whitespace.
for pattern, replacement in norm_rules.items():
    df['Nombre'] = df['Nombre'].apply(
        normalize_text,
        pattern=pattern,
        new_value=replacement,
    )

# Preview up to 5 random rows; asking for more rows than the frame holds
# raises ValueError, so the sample size is capped at the frame length.
df.sample(min(5, len(df)))

Unnamed: 0,RFC,Razón social,Persona,Nombre
4,&RD021121TV4,A & R DOORS & WINDOWS SA DE CV,MORAL,A & R DOORS & WINDOWS SA DE CV
6,A&A970821TZ5,AD & ASOCIADOS SA DE CV SA DE CV,MORAL,AD & ASOCIADOS SA DE CV SA DE CV
8,A&C031205B59,AN S & CAR S.A. DE C.V.,MORAL,AN S & CAR SA DE CV
7,A&B0609046F3,A + A & B TRANSPORTES SA DE CV,MORAL,A + A & B TRANSPORTES SA DE CV
5,A&A010802V67,"Arciniega & Asociados, S.C.",MORAL,"ARCINIEGA & ASOCIADOS, SC"


In [260]:
# Sort by 'RFC'. sort_values() returns a new object instead of sorting in
# place, so the result has to be assigned back to df. A stable algorithm
# keeps rows that share the same RFC in their current relative order.
df = df.sort_values('RFC', kind='mergesort')

df.head(10)

Unnamed: 0,RFC,Razón social,Persona,Nombre
0,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C. DE",MORAL,"MC GE MULTISISTE DE CTROL Y GEN ELEC, AC DE"
1,&GM060804SN9,"MC GE MULTISISTE DE CTROL Y GEN ELEC, A.C.",MORAL,"MC GE MULTISISTE DE CTROL Y GEN ELEC, AC"
2,&PI0405044W6,A&P INTERNATIONAL SERVICE SAPI DE CV,MORAL,A&P INTERNATIONAL SERVICE SAPI DE CV
3,&PI0405044W6,A&P INTERNATIONAL SERVICES S.A.P.I. DE C.V.,MORAL,A&P INTERNATIONAL SERVICES SAPI DE CV
4,&RD021121TV4,A & R DOORS & WINDOWS SA DE CV,MORAL,A & R DOORS & WINDOWS SA DE CV
5,A&A010802V67,"Arciniega & Asociados, S.C.",MORAL,"ARCINIEGA & ASOCIADOS, SC"
6,A&A970821TZ5,AD & ASOCIADOS SA DE CV SA DE CV,MORAL,AD & ASOCIADOS SA DE CV SA DE CV
7,A&B0609046F3,A + A & B TRANSPORTES SA DE CV,MORAL,A + A & B TRANSPORTES SA DE CV
8,A&C031205B59,AN S & CAR S.A. DE C.V.,MORAL,AN S & CAR SA DE CV


In [261]:
# Reorder columns
dataFrame = df[['RFC','Razón social', 'Nombre', 'Persona']]

# Save DataFrame
file_name = f"NuevosRFC_{csv_file.split(".")[0]}"
dataFrame.to_csv(f'{file_name}.csv', encoding='utf-8', index=False)