<a href="https://colab.research.google.com/github/felipecasali-usp/mba-tcc-identify-lgpdsensitive-data/blob/main/CPF_SpaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import re
import spacy
from spacy.training.example import Example

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('/content/sample_data/empregados_m.csv')

# Define a regular expression to identify CPF numbers
cpf_regex = r'\b\d{3}\.\d{3}\.\d{3}-\d{2}\b'

# Prepare training data with examples of annotated text
TRAIN_DATA = []

# Iterate through each row of the DataFrame
for index, row in df.iterrows():
    # Concatenate all values in the row into a single string
    row_text = ' '.join(map(str, row.values))

    # Search for CPF numbers using the regular expression
    cpf_matches = re.findall(cpf_regex, row_text)

    # Annotate the position of CPF numbers
    if cpf_matches:
        start = row_text.find(cpf_matches[0])
        end = start + len(cpf_matches[0])
        entities = [(start, end, "CPF")]
    else:
        entities = []

    # Add the annotated example to the training data
    TRAIN_DATA.append((row_text, {"entities": entities}))

# Output the annotated data
print(TRAIN_DATA)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [4]:
import spacy
import random
from spacy.training.example import Example

# Load a blank English model
nlp = spacy.blank("en")

# Add the NER pipeline component
ner = nlp.add_pipe("ner")

# Define your entity label
ner.add_label("SSN")

# Start the training process
nlp.begin_training()

# Train for a number of iterations
for itn in range(20):
    # Shuffle the training data
    random.shuffle(TRAIN_DATA)
    losses = {}

    # Create batches and train the model
    for batch in spacy.util.minibatch(TRAIN_DATA, size=2):
        texts, annotations = zip(*batch)
        example = []
        # Update the model with the current batch
        for i in range(len(texts)):
            doc = nlp.make_doc(texts[i])
            example.append(Example.from_dict(doc, annotations[i]))
        nlp.update(example, losses=losses)
    print("Iteration:", itn, "Losses:", losses)

# Save the trained model
nlp.to_disk("ssn_ner_model")

Iteration: 0 Losses: {'ner': 826.5058272229269}
Iteration: 1 Losses: {'ner': 7.796192319582059e-11}
Iteration: 2 Losses: {'ner': 3.487956603800758e-15}
Iteration: 3 Losses: {'ner': 2.09136662085791e-17}
Iteration: 4 Losses: {'ner': 3.6408721432834087e-16}
Iteration: 5 Losses: {'ner': 1.8490647134956856e-17}
Iteration: 6 Losses: {'ner': 2.441915775543297e-16}
Iteration: 7 Losses: {'ner': 5.982679088363775e-17}
Iteration: 8 Losses: {'ner': 1.1032500517476287e-16}
Iteration: 9 Losses: {'ner': 1.4598413199724418e-16}
Iteration: 10 Losses: {'ner': 2.6977667809849795e-16}
Iteration: 11 Losses: {'ner': 8.667267534035975e-17}
Iteration: 12 Losses: {'ner': 6.766589634309798e-17}
Iteration: 13 Losses: {'ner': 1.1025064108034637e-16}
Iteration: 14 Losses: {'ner': 5.185695818649276e-17}
Iteration: 15 Losses: {'ner': 4.3176444655833144e-17}
Iteration: 16 Losses: {'ner': 4.6071608035357966e-17}
Iteration: 17 Losses: {'ner': 3.747026258190429e-17}
Iteration: 18 Losses: {'ner': 5.673759995712206e-17}


In [12]:
import spacy

# Load the saved model
nlp = spacy.load("ssn_ner_model")

# Example text for testing
test_text = "O meu número de CPF é 282.503.128-30"

# Process the text using the loaded model
doc = nlp(test_text)

# Print detected entities
for ent in doc.ents:
    print(ent.text, ent.label_)

282.503.128-30 CPF


In [17]:
import spacy

# Load the saved model
nlp = spacy.load("ssn_ner_model")

# List of test sentences
test_sentences = [
    "Felipe Casali's identification number is 297.222.108-09",
    "Her wife document is 282.555.128-30",
    "His RG is 30.746.719-3"
]

# Process each test sentence using the loaded model
for test_sentence in test_sentences:
    doc = nlp(test_sentence)
    print("Test Sentence:", test_sentence)
    if not doc.ents:
        print("No entities found.")
    else:
        # Print detected entities
        for ent in doc.ents:
            print(ent.text, ent.label_)
    print()  # Empty line for readability between sentences


Test Sentence: Felipe Casali's identification number is 297.222.108-09
297.222.108-09 CPF

Test Sentence: Her wife document is 282.555.128-30
282.555.128-30 CPF

Test Sentence: His RG is 30.746.719-3
No entities found.

