# RFC extraction
This notebook gathers RFCs from several source files. The goal is to concatenate the resulting tables (each with only two columns: "RFC" and "Razón social"), validate the RFCs, clean the "Razón social" column with regular expressions, and add a "Persona" column indicating whether each RFC belongs to a "fisica" or a "moral" person.

In [13]:
# Import libraries
import pandas as pd
import re
from typing import Optional

## Cleaning

In [61]:
# Read the CSV named by csv_file. No header row is taken from the file, so the
# two column names are supplied explicitly.
# NOTE(review): the saved sample output contains mojibake (e.g. "ACUÃ‘A");
# confirm the encoding of the source file.
csv_file = 'SAT_DGATIC_2022_Emisor.csv'
df = pd.read_csv(
    csv_file,
    names=['RFC','Razón social'],
    header=None,
    sep=',',
)
# Display ten randomly chosen rows as a quick visual check.
df.sample(n=10)

Unnamed: 0,RFC,Razón social
41421,BEAG880319374,JOSE GUADALUPE BECERRIL ACUÃ‘A
157670,FOOA910709DX0,ANTONIO FORSECK ORENDAIN
192089,GOMJ850514IT4,JONATHAN GOMEZ MOHEDANO
478436,YEFL561106TJ9,LEONARDA YEPEZ FLORES
165183,GARE750427315,Maria Elisa Garcia Rodriguez
506064,MEZL760728JP0,LEONEL MELCHOR ZAMORA
60968,CACJ6804097Q7,JUAN ERMILO CARRILLO CASTUL
223806,HEGS991120259,SIDDHARTHA HERNANDEZ GALAVIZ
290014,MLI0306128K5,MAS LIMPIO
243015,DIGD830201GZ1,DANIELA DIAZ GARCIA


In [62]:
# Keep the row count of the freshly loaded frame and report it.
initial_rows = len(df)
print(f"El archivo tiene {initial_rows} filas.")

El archivo tiene 532850 filas.


In [63]:
# Drop every row that has a null in any column, then cast both columns to
# string, all in one chained expression.
df = df.dropna().astype({'RFC': 'str', 'Razón social': 'str'})

## Validate RFC

In [64]:
# Regular expressions
# Tail shared by both RFC patterns: two digits, a month (01-12), a day (01-31)
# and three trailing alphanumeric characters.
# - `[0-9]` is used instead of `\d` because `\d` also matches non-ASCII
#   Unicode digits in str patterns.
# - `\Z` is used instead of `$` because `$` also matches just before a
#   trailing newline, which would let "…\n" pass validation.
# Month/day combinations are not cross-checked (e.g. "0231" is accepted).
_RFC_DATE_AND_SUFFIX = (
    r"[0-9]{2}(0[1-9]|1[0-2])"
    r"(0[1-9]|[12][0-9]|3[01])"
    r"[A-Z0-9]{3}\Z"
)

# 4 leading characters + shared tail (13 characters in total).
RFC_FISICA_REGEX = re.compile(r"^[A-ZÑ&]{4}" + _RFC_DATE_AND_SUFFIX)

# 3 leading characters + shared tail (12 characters in total).
RFC_MORAL_REGEX = re.compile(r"^[A-ZÑ&]{3}" + _RFC_DATE_AND_SUFFIX)


def normalize_rfc(rfc: str) -> str:
    """Strip surrounding whitespace and convert the RFC to upper case.

    Args:
        rfc: Raw RFC string.

    Returns:
        The trimmed, upper-cased RFC.
    """
    return rfc.strip().upper()


In [65]:
# Normalize RFC: run normalize_rfc over every value of the column.
df['RFC'] = df['RFC'].map(normalize_rfc)

In [66]:
def is_persona_fisica(rfc: str) -> bool:
    """Return True when the RFC matches RFC_FISICA_REGEX, False otherwise."""
    found = RFC_FISICA_REGEX.match(rfc)
    return found is not None


def is_persona_moral(rfc: str) -> bool:
    """Return True when the RFC matches RFC_MORAL_REGEX, False otherwise."""
    found = RFC_MORAL_REGEX.match(rfc)
    return found is not None


def get_rfc_type(rfc: str) -> Optional[str]:
    """Classify an RFC by the pattern it matches.

    Args:
        rfc: RFC string to classify.

    Returns:
        "FISICA" if is_persona_fisica accepts the RFC, "MORAL" if
        is_persona_moral accepts it, or None when neither does.
    """
    # The fisica check runs first; the moral check is only tried if it fails.
    if is_persona_fisica(rfc):
        kind = "FISICA"
    elif is_persona_moral(rfc):
        kind = "MORAL"
    else:
        kind = None
    return kind

In [70]:
# Validate RFC: label every row with the result of get_rfc_type
# ("FISICA", "MORAL" or None).
df['Persona'] = df['RFC'].apply(get_rfc_type)

# Filter: keep only the rows whose label is one of the two accepted values.
df = df.loc[df['Persona'].isin(['FISICA', 'MORAL'])]

# Display ten randomly chosen rows of the filtered frame.
df.sample(n=10)

Unnamed: 0,RFC,Razón social,Persona
104153,CSG080401D91,CENTRO DE SERVICIO G DE MENDOZA DE MATEHUALA S...,MORAL
465874,TSI060630RT1,TITAN SOLUCIONES INTEGRALES,MORAL
453528,HECC820702JE5,CLARA ELENA ZETKIN HERNANDEZ CARDONA,FISICA
342561,PAAB651213A41,BLANCA ESTELA PRADO ALCANTARA,FISICA
12107,AARJ810705TI7,JUAN PABLO ANDRADE ROSAS,FISICA
256051,LOVD8804185W9,DIEGO LOZANO VERDUZCO,FISICA
421106,SAVE911017BE9,EDWIN SANCHEZ VELEZ,FISICA
389913,ROPS980514TM3,SAUL MAURICIO RODRIGUEZ PEREZ,FISICA
344037,PEHA860616FI8,ANA FABIOLA PEREZ HERNANDEZ,FISICA
151301,FOVM930412R56,Marina Flores vidrio,FISICA


In [71]:
# Keep the row count of the filtered frame and report it.
final_rows = len(df)
print(f"El archivo tiene {final_rows} filas.")

El archivo tiene 530418 filas.


In [None]:
# Create DataFrame from CSV file. No header row is taken from the file, so
# the two column names are supplied explicitly.
BASE = 'Catalogo_APF_RFCs.csv'
df_base = pd.read_csv(
    BASE,
    names=['RFC','Razón social'],
    header=None,
    sep=',',
)
# Print the first three rows as plain text.
print(df_base.iloc[:3])

            RFC                                 Razón social
0  ABA930301AZ5       Agroindustrias del Balsas, S.A. de C.V
1  AEM1007317T7                    Agencia Espacial Mexicana
2  AEM230616A93  Aerolínea del Estado Mexicano, S.A. de C.V.


In [None]:
# Report how many rows the base catalogue has.
# NOTE(review): the saved output of this cell says "registros" while this
# message says "filas", so the stored output appears stale; re-run the cell.
print(f"El catálogo base tiene {len(df_base)} filas.")

El catálogo base tiene 789 registros.
