<a href="https://colab.research.google.com/github/felipecasali-usp/mba-tcc-identify-lgpdsensitive-data/blob/main/CPF_RG_CNPJ_CEP_SpaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import spacy
from spacy.training.example import Example

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('/content/sample_data/empregados_m.csv')

# Regular expressions for the punctuated forms of CPF, RG, CNPJ, and CEP numbers
cpf_regex = r'\b\d{3}\.\d{3}\.\d{3}-\d{2}\b'
rg_regex = r'\b\d{2}\.\d{3}\.\d{3}-\d{1}\b'
cnpj_regex = r'\b\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}\b'
cep_regex = r'\b\d{5}-\d{3}\b'

# (label, pattern) pairs, in the order annotations are emitted for each row
ENTITY_PATTERNS = [
    ("CPF", cpf_regex),
    ("RG", rg_regex),
    ("CNPJ", cnpj_regex),
    ("CEP", cep_regex),
]


def find_entities(text):
    """Return the (start, end, label) character spans of every identifier in `text`.

    Spans are grouped by label in ENTITY_PATTERNS order and, within a label,
    listed in order of appearance. Offsets are taken from the regex match
    objects themselves, so a value that occurs more than once in `text` gets
    one distinct span per occurrence (str.find would report the first
    occurrence every time).
    """
    return [
        (m.start(), m.end(), label)
        for label, pattern in ENTITY_PATTERNS
        for m in re.finditer(pattern, text)
    ]


# Training data: one (text, {"entities": [...]}) pair per DataFrame row
TRAIN_DATA = []

for _, row in df.iterrows():
    # Concatenate all values in the row into a single space-separated string
    row_text = ' '.join(map(str, row.values))

    # Add the annotated example to the training data
    TRAIN_DATA.append((row_text, {"entities": find_entities(row_text)}))

In [None]:
import spacy
import random
from spacy.training.example import Example

# Load a blank English model and add an untrained NER pipeline component
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Register the labels that actually occur in TRAIN_DATA's annotations.
# A hard-coded label that never appears in the data (previously "SSN")
# leaves the labels used by the annotations unregistered on the component.
labels = sorted({
    label
    for _, annotation in TRAIN_DATA
    for _, _, label in annotation["entities"]
})
for label in labels:
    ner.add_label(label)

# Initialize the pipeline weights; `initialize` is the spaCy v3 replacement
# for the deprecated `begin_training` and returns the optimizer to train with.
optimizer = nlp.initialize()

# Train for a number of iterations
for itn in range(20):
    # Shuffle the training data (in place) so batches differ between iterations
    random.shuffle(TRAIN_DATA)
    losses = {}

    # Create batches and update the model with each one
    for batch in spacy.util.minibatch(TRAIN_DATA, size=2):
        examples = [
            Example.from_dict(nlp.make_doc(text), annotation)
            for text, annotation in batch
        ]
        nlp.update(examples, sgd=optimizer, losses=losses)
    print("Iteration:", itn, "Losses:", losses)

# Save the trained model
# NOTE(review): a later cell in this notebook also writes to "ssn_ner_model"
# and will overwrite this output.
nlp.to_disk("ssn_ner_model")

In [4]:
import spacy
import random
from spacy.training.example import Example

# Load a blank English model
# NOTE(review): the sample text in this notebook looks Portuguese; consider
# whether spacy.blank("pt") is a better fit — confirm before changing, since
# it alters tokenization.
nlp = spacy.blank("en")

# Add the NER pipeline component
ner = nlp.add_pipe("ner")

# Entity labels used by the annotations in TRAIN_DATA
labels = ["CPF", "RG", "CNPJ", "CEP"]

# Add the labels to the NER component
for label in labels:
    ner.add_label(label)

# Initialize the pipeline weights; `initialize` is the spaCy v3 replacement
# for the deprecated `begin_training` and returns the optimizer to train with.
optimizer = nlp.initialize()

# Train for a number of iterations
for itn in range(14):
    # Shuffle the training data (in place) so batches differ between iterations
    random.shuffle(TRAIN_DATA)
    losses = {}

    # Create batches and update the model with each one
    for batch in spacy.util.minibatch(TRAIN_DATA, size=64):
        examples = [
            Example.from_dict(nlp.make_doc(text), annotation)
            for text, annotation in batch
        ]
        nlp.update(examples, sgd=optimizer, losses=losses)
    print("Iteration:", itn, "Losses:", losses)

# Save the trained model
nlp.to_disk("ssn_ner_model")


Iteration: 0 Losses: {'ner': 38000.63148630018}
Iteration: 1 Losses: {'ner': 2708.761159885138}
Iteration: 2 Losses: {'ner': 1711.3176548783476}
Iteration: 3 Losses: {'ner': 681.7381961791214}
Iteration: 4 Losses: {'ner': 442.19583946377134}
Iteration: 5 Losses: {'ner': 345.4126415976802}
Iteration: 6 Losses: {'ner': 349.7324265278233}
Iteration: 7 Losses: {'ner': 270.638975614015}
Iteration: 8 Losses: {'ner': 210.69144856355544}
Iteration: 9 Losses: {'ner': 246.25540855076972}
Iteration: 10 Losses: {'ner': 248.92106825233557}
Iteration: 11 Losses: {'ner': 162.56041760766294}
Iteration: 12 Losses: {'ner': 141.69182367444122}
Iteration: 13 Losses: {'ner': 138.31533491108877}


In [5]:
import spacy

# Restore the pipeline that was written to disk by the training step
nlp = spacy.load("ssn_ner_model")

# One sample sentence to sanity-check the model
test_text = "O meu número de CPF é 282.503.128-30"

# Run the pipeline and keep the resulting Doc
doc = nlp(test_text)

# Emit one "<text> <label>" line per recognized entity
for ent in doc.ents:
    entity_text, entity_label = ent.text, ent.label_
    print(entity_text, entity_label)

282.503.128-30 CPF


In [7]:

import spacy

# Restore the pipeline that was written to disk by the training step
nlp = spacy.load("ssn_ner_model")

# Sample sentences to run through the model
# NOTE(review): confirm the document numbers below are synthetic before sharing.
test_sentences = [
    "Felipe Casali's identification number is 297.222.108-09",
    "Her wife document is 282.555.128-30",
    "His RG is 30.746.719-3",
    "His RG is 307467193",
    "Felipe mora no 05092-020"
]

# Report, sentence by sentence, which entities the model recognizes
for test_sentence in test_sentences:
    doc = nlp(test_sentence)
    print("Test Sentence:", test_sentence)
    if doc.ents:
        # One "<text> <label>" line per detected entity
        for ent in doc.ents:
            print(ent.text, ent.label_)
    else:
        print("No entities found.")
    print()  # Blank separator line between sentences

Test Sentence: Felipe Casali's identification number is 297.222.108-09
297.222.108-09 CPF

Test Sentence: Her wife document is 282.555.128-30
282.555.128-30 CPF

Test Sentence: His RG is 30.746.719-3
No entities found.

Test Sentence: His RG is 307467193
No entities found.

Test Sentence: Felipe mora no 05092-020
05092-020 CEP

