# PII detection


In [1]:
import pandas as pd

In [2]:
df_all = pd.read_csv("../datasets/local/vast-challenge-2021-mc3/all_tweets.csv")
message_col = df_all["message"]

In [4]:
message_col

0                                   Follow us @POK-Kronos
1       Don't miss a moment!  Follow our live coverage...
2       Come join us in the Park! Music tonight at Abi...
3       POK rally to start in Abila City Park. POK lea...
4       POK rally set to take place in Abila City Park...
                              ...                        
4058    RT @AbilaPost unknown explosion heard from the...
4059    RT @CentralBulletin explosion heard at dancing...
4060    RT @KronosStar There has been an explosion fro...
4061                    RT @redisrad What was that? #boom
4062    RT @CentralBulletin explosion heard at dancing...
Name: message, Length: 4063, dtype: object

In [6]:
from presidio_analyzer import AnalyzerEngine

text = "My phone number is 212-555-5555"

# Set up the engine, loads the NLP module (spaCy model by default)
# and other PII recognizers
analyzer = AnalyzerEngine()

# Call analyzer to get results
results = analyzer.analyze(text=text, entities=["PHONE_NUMBER"], language="en")
print(results)

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m559.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:28[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[type: PHONE_NUMBER, start: 19, end: 31, score: 0.75]


In [13]:
# Full list of entities at: https://microsoft.github.io/presidio/supported_entities/

text = "My name is Will Epperson. My number is 7706330485, and my email is willepp@live.com"

entities = ["EMAIL_ADDRESS", "PERSON", "PHONE_NUMBER", "URL"]


results = analyzer.analyze(
    text=text, entities=entities, language="en", score_threshold=0.6
)

results

[type: EMAIL_ADDRESS, start: 67, end: 83, score: 1.0,
 type: PERSON, start: 11, end: 24, score: 0.85,
 type: PHONE_NUMBER, start: 39, end: 49, score: 0.75]

In [14]:
results = []

entities = ["EMAIL_ADDRESS", "PERSON", "PHONE_NUMBER", "URL"]


def get_pii_results(sentence):
    results = analyzer.analyze(
        text=sentence, entities=entities, score_threshold=0.6, language="en"
    )

    return results


for s in message_col.values:
    r = get_pii_results(s)
    results.append(r)

In [16]:
message_col

0                                   Follow us @POK-Kronos
1       Don't miss a moment!  Follow our live coverage...
2       Come join us in the Park! Music tonight at Abi...
3       POK rally to start in Abila City Park. POK lea...
4       POK rally set to take place in Abila City Park...
                              ...                        
4058    RT @AbilaPost unknown explosion heard from the...
4059    RT @CentralBulletin explosion heard at dancing...
4060    RT @KronosStar There has been an explosion fro...
4061                    RT @redisrad What was that? #boom
4062    RT @CentralBulletin explosion heard at dancing...
Name: message, Length: 4063, dtype: object

In [15]:
results

[[],
 [],
 [],
 [type: PERSON, start: 50, end: 62, score: 0.85],
 [type: PERSON, start: 60, end: 72, score: 0.85],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [type: PERSON, start: 0, end: 17, score: 0.85,
  type: PERSON, start: 74, end: 86, score: 0.85],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [type: PERSON, start: 0, end: 17, score: 0.85,
  type: PERSON, start: 74, end: 86, score: 0.85],
 [type: PERSON, start: 0, end: 17, score: 0.85,
  type: PERSON, start: 74, end: 86, score: 0.85],
 [],
 [],
 [],
 [],
 [],
 [],
 [type: PERSON, start: 11, end: 24, score: 0.85],
 [],
 [],
 [],
 [],
 [],
 [type: PERSON, start: 0, end: 17, score: 0.85,
  type: PERSON, start: 74, end: 86, score: 0.85],
 [type: PERSON, start: 0, end: 17, score: 0.85,
  type: PERSON, start: 74, end: 86, score: 0.85],
 [type: PERSON, start: 0, end: 17, score: 0.85,
  type: PERSON, start: 74, end: 86, score: 0.85],
 [],
 [],
 [],
 [],
 [type: PERSON, start: 0, end: 12, score: 0.85],
 [],
 [],
 [],
 [type: PERSO

In [17]:
num_pii = [len(r) for r in results]

In [18]:
num_pii

[0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 2,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 2,
 1,
 1,
 0,
 2,
 2,
 0,
 4,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 1,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 1,
 2,
 4,
 4,
 2,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 4,
 4,
 0,
 0,
 0,
 0,
 4,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,


In [19]:
dfp = pd.read_parquet("./vast2021.parquet")

In [21]:
df_pii = pd.DataFrame({"message_pii_count": num_pii})

df_pii

Unnamed: 0,message_pii_count
0,0
1,0
2,0
3,1
4,1
...,...
4058,1
4059,0
4060,0
4061,0


In [23]:
df_p = dfp.join(df_pii)

In [25]:
df_p.to_parquet("vast2021.parquet")