In [35]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from typing import List, Optional, Dict, Union, Iterator, Iterable
import collections
from dataclasses import dataclass
import pprint
from collections import defaultdict
import pandas as pd
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import EngineResult


In [83]:
global_analyzer = AnalyzerEngine()

In [37]:
dataset = pd.read_csv('/mnt/c/Users/marti/Downloads/final_testing.csv')
testing_ds = pd.read_csv('/mnt/c/Users/marti/Downloads/sample_2.csv')
testing_ds_second = pd.read_csv('/mnt/c/Users/marti/Downloads/sample3.csv')
testing_ds_third = pd.read_csv('/mnt/c/Users/marti/Downloads/sample4.csv')


In [96]:
from fastapi import FastAPI, File, UploadFile
from io import StringIO

app = FastAPI()

def dataset_analysis(dataset, analyzer=None, cutoff=0.6):
   
    # create analyzer engine
    if analyzer == None:
        analyzer = global_analyzer

    # create a dictionary to store the findings for each column
    all_columns_info = {}

    # analyze each value and record entity scores and counts
    for col in dataset.columns:
        # create dictionaries to store entity scores and counts for each column
        entity_scores = defaultdict(list)
        entity_counts = defaultdict(int)

        total_rows = len(dataset[col])

        for value in dataset[col]:
            # Call analyzer to get results
            try:
                results = analyzer.analyze(text=value, language='en')
                
                if not results:  # If no entities are found
                    entity_counts['NOT PII'] += 1

                # iterate over each result
                for result in results:
                    # add score and count to entity_scores and entity_counts dictionaries
                    entity_scores[result.entity_type].append(result.score)
                    entity_counts[result.entity_type] += 1
            except ValueError:
                pass

        # calculate the total entities found in the column
        total_entities = sum(entity_counts.values())

        # calculate average score, count, and percentage for each entity
        entity_scores_counts = {}
        for entity, scores in entity_scores.items():
            score_avg = sum(scores) / len(scores)
            
            if score_avg < cutoff:
                entity_counts['NOT PII'] += entity_counts.pop(entity)
            else:
                count = entity_counts[entity]
                percentage = (count / total_rows) * 100
                entity_scores_counts[entity] = {'score_average': score_avg, 'number_of_datapoints': count, 'percentage': percentage}

        # Add the 'NOT PII' class and its percentage
        not_pii_count = entity_counts['NOT PII']
        not_pii_percentage = (not_pii_count / total_rows) * 100
        entity_scores_counts['NOT PII'] = {'score_average': 0, 'number_of_datapoints': not_pii_count, 'percentage': not_pii_percentage}

        # store findings for the current column
        all_columns_info[col] = entity_scores_counts

    results_df = pd.DataFrame.from_dict({(i, j): all_columns_info[i][j]
                                        for i in all_columns_info.keys()
                                        for j in all_columns_info[i].keys()},
                                        orient='index')

    results_df.columns = ['score_average', 'n_datapoints', 'percentage']

    # display the results
    print(results_df)


@app.post("/analyze-csv")
async def analyze_csv(file: UploadFile = File(...)):
    # Read the uploaded CSV file
    content = await file.read()
    content_str = content.decode()
    dataset = pd.read_csv(StringIO(content_str))

    # Perform the analysis
    results_df = dataset_analysis(dataset)

    # Convert the results DataFrame to a dictionary and return it
    results_dict = results_df.reset_index().to_dict(orient="records")
    return results_dict

In [99]:
import time
def take_time(dataset, analyzer = None):
    row_count = dataset.shape[0]
    start = time.time() 
    dataset_analysis(dataset, global_analyzer)
    end = time.time()

    print(f'For {row_count} and {dataset.shape[1]} columns, it took {end - start} seconds')


In [100]:
take_time(dataset)

                            score_average  n_datapoints  percentage
address      ADDRESS                 0.70           297        59.4
             LOCATION                0.85            37         7.4
             PERSON                  0.85            17         3.4
             DATE_TIME               0.85            11         2.2
             NRP                     0.85             4         0.8
             NOT PII                 0.00           172        34.4
currency     NOT PII                 0.00           500       100.0
postalZip    DATE_TIME               0.85            56        11.2
             PERSON                  0.85             4         0.8
             NOT PII                 0.00           462        92.4
phone        DATE_TIME               0.85             3         0.6
             UK_NHS                  1.00            18         3.6
             NOT PII                 0.00           487        97.4
name         PERSON                  0.85       

In [103]:
take_time(testing_ds)


                   score_average  n_datapoints  percentage
name     PERSON             0.85            99        99.0
         LOCATION           0.85             1         1.0
         NOT PII            0.00             1         1.0
 age     NOT PII            0.00             0         0.0
 gender  NOT PII            0.00           100       100.0
 country LOCATION           0.85             8         8.0
         PERSON             0.85             4         4.0
         NOT PII            0.00            87        87.0
For 100 and 4 columns, it took 1.4725275039672852 seconds


In [102]:
take_time(testing_ds_second)

                            score_average  n_datapoints  percentage
country      LOCATION                0.85           943        94.3
             NRP                     0.85            37         3.7
             PERSON                  0.85            35         3.5
             NOT PII                 0.00            55         5.5
first_name   PERSON                  0.85           530        53.0
             LOCATION                0.85            49         4.9
             NRP                     0.85            13         1.3
             DATE_TIME               0.85             1         0.1
             NOT PII                 0.00           407        40.7
last_name    PERSON                  0.85           451        45.1
             LOCATION                0.85            46         4.6
             NRP                     0.85            12         1.2
             NOT PII                 0.00           491        49.1
email        EMAIL_ADDRESS           1.00       

In [101]:
take_time(testing_ds_third)
full_ds = pd.read_csv('/mnt/c/Users/marti/Downloads/final_testing.csv')

                        score_average  n_datapoints  percentage
date   DATE_TIME                 0.85          1000       100.0
       NOT PII                   0.00             0         0.0
lorem  PERSON                    0.85            44         4.4
       LOCATION                  0.85            38         3.8
       DATE_TIME                 0.85             4         0.4
       NRP                       0.85             3         0.3
       NOT PII                   0.00           911        91.1
random NOT PII                   0.00             0         0.0
guid   DATE_TIME                 0.85            55         5.5
       PERSON                    0.85            45         4.5
       NRP                       0.85            10         1.0
       LOCATION                  0.85             7         0.7
       MEDICAL_LICENSE           1.00             3         0.3
       NOT PII                   0.00           906        90.6
For 1000 and 4 columns, it took 18.86944

In [109]:
def get_csv_column(df):
    # List the columns with numbers
    print("Columns:")
    for i, col in enumerate(df.columns):
        print(f"{i}: {col}")

    # Get the user input for the column number
    col_num = int(input("Enter the column number you want to select: "))

    # Check if the input is a valid column number
    if 0 <= col_num < len(df.columns):
        # Return the selected column
        selected_column = df[[df.columns[col_num]]]
        print(f"Selected column '{df.columns[col_num]}':")
        print(selected_column)
        return selected_column
    else:
        print("Invalid column number. Exiting.")
        return
def user_column_detection(df, analyzer = global_analyzer):
    selected_column = get_csv_column(df)
    dataset_analysis(selected_column, analyzer)

In [110]:
user_column_detection(dataset, global_analyzer)

Columns:
0: address
1: currency
2: postalZip
3: phone
4: name
5: country
6: region
7: email
8: list
9: alphanumeric
10: numberrange
Selected column 'address':
                           address
0               9503 Curabitur Rd.
1    P.O. Box 143, 2253 Aenean Rd.
2                 914-6334 Sed Av.
3          613-4380 Iaculis Avenue
4              739-678 Lectus. Rd.
..                             ...
495           Ap #974-697 Elit St.
496                290-4464 In Rd.
497                7136 Massa. St.
498         2335 Pellentesque, Rd.
499                  1708 Diam Ave

[500 rows x 1 columns]
                   score_average  n_datapoints  percentage
address ADDRESS             0.70           297        59.4
        LOCATION            0.85            37         7.4
        PERSON              0.85            17         3.4
        DATE_TIME           0.85            11         2.2
        NRP                 0.85             4         0.8
        NOT PII             0.00           

### NER customizable modelling

In [94]:
from typing import List
from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

# Rule based model
class AddressRecognizer(PatternRecognizer):
    PATTERNS = [
        Pattern("Address (Simple Regex)", r"\d{1,5}\s\w+\s(?:St|Ave|Ln|Dr|Rd|Blvd)", 0.7),
    ]

    def __init__(self, patterns: List[Pattern] = None, context: str = None):
        super().__init__(
            supported_entity="ADDRESS",
            patterns=patterns if patterns else self.PATTERNS,
            context=context,
            supported_language="en",
        )

address_recognizer = AddressRecognizer()
registry = RecognizerRegistry()
registry.load_predefined_recognizers()

custom_registry = RecognizerRegistry(recognizers=registry.recognizers)
custom_registry.add_recognizer(address_recognizer)
registry.add_recognizer(address_recognizer)
global_analyzer = AnalyzerEngine(registry=custom_registry)

text = "Roberto lives in Five 10 Broad St."
numbers_results = global_analyzer.analyze(text=text, language="en")
print(numbers_results)

[type: PERSON, start: 0, end: 7, score: 0.85, type: ADDRESS, start: 22, end: 33, score: 0.7]


In [93]:
user_column_detection(dataset, global_analyzer)

Columns:
0: address
1: currency
2: postalZip
3: phone
4: name
5: country
6: region
7: email
8: list
9: alphanumeric
10: numberrange
Selected column 'address':
                           address
0               9503 Curabitur Rd.
1    P.O. Box 143, 2253 Aenean Rd.
2                 914-6334 Sed Av.
3          613-4380 Iaculis Avenue
4              739-678 Lectus. Rd.
..                             ...
495           Ap #974-697 Elit St.
496                290-4464 In Rd.
497                7136 Massa. St.
498         2335 Pellentesque, Rd.
499                  1708 Diam Ave

[500 rows x 1 columns]
                   score_average  n_datapoints  percentage
address ADDRESS             0.70           297        59.4
        LOCATION            0.85            37         7.4
        PERSON              0.85            17         3.4
        DATE_TIME           0.85            11         2.2
        NRP                 0.85             4         0.8
        NOT PII             0.00           

In [122]:
city_list = list(testing_ds_second['city'])
city_recognizer = PatternRecognizer(supported_entity="CITY", deny_list=city_list)
custom_registry.add_recognizer(city_recognizer)
global_analyzer = AnalyzerEngine(registry=custom_registry)
text = "Roberto lives in Five 10 Broad St in Ipoh"
numbers_results = global_analyzer.analyze(text=text, language="en")
print(numbers_results)

[type: CITY, start: 37, end: 41, score: 1.0, type: PERSON, start: 0, end: 7, score: 0.85, type: LOCATION, start: 37, end: 41, score: 0.85, type: ADDRESS, start: 22, end: 33, score: 0.7]


In [73]:
user_column_detection(testing_ds_second)

Columns:
0: country
1: first_name
2: last_name
3: email
4: email2
5: profession
6: city
7: country_code
Selected column 'country':
            country
0             Haiti
1           Eritrea
2             Samoa
3           AndorrA
4           Ireland
..              ...
995       Singapore
996     Isle of Man
997       Lithuania
998      Kyrgyzstan
999  United Kingdom

[1000 rows x 1 columns]
                 score_average  n_datapoints  percentage
country CITY               1.0            28         2.8
        NOT PII            0.0           972        97.2


In [1]:
train_data = []

for address in dataset["address"]:
    train_data.append((address, {"entities": [(0, len(address), "ADDRESS")]}))

import spacy
from spacy.util import minibatch, compounding
from spacy.training.example import Example

# Load the base model
nlp = spacy.blank('en')
ner = nlp.add_pipe("ner", source=spacy.load("en_core_web_sm"))

# Add the new entity label to the NER model
ner.add_label("ADDRESS")

# Initialize the weights of the model
examples = [Example.from_dict(nlp(text), annotations) for text, annotations in train_data]
nlp.initialize(lambda: examples)

# Train the NER model
num_epochs = 30
for epoch in range(num_epochs):
    losses = {}
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(batch, losses=losses)
    print(f"Epoch {epoch + 1}, Loss: {losses['ner']}")

# Save the trained model
#nlp.to_disk("my_custom_ner_model")

#nlp = spacy.load("my_custom_ner_model")

# Test the model on some text


NameError: name 'dataset' is not defined

<spacy.lang.en.English at 0x7fbc4efc7c10>

In [123]:
text = 'Ipoh Atlanta'
numbers_results = global_analyzer.analyze(text=text, language="en")
print(numbers_results)

[type: CITY, start: 0, end: 4, score: 1.0, type: CITY, start: 5, end: 12, score: 1.0, type: LOCATION, start: 0, end: 4, score: 0.85, type: LOCATION, start: 5, end: 12, score: 0.85]


In [113]:
analyzer_2 = AnalyzerEngine(
    nlp_engine=nlp, 
    supported_languages=["en"]
)
text = "Roberto lives in Five 10 Broad St."
numbers_results = analyzer_2.analyze(text=text, language="en")
print(numbers_results)



AttributeError: 'English' object has no attribute 'process_text'

# -----------------------------------------