In [None]:
pip install st_annotated_text

In [1]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

In [2]:
import pandas as pd
import re

In [3]:
t = "Google News is a news aggregator service developed by Google. It presents a continuous flow of articles organized from thousands of publishers and magazines. Google News is available as an app on Android, iOS, and the Web. Google released a beta version in September 2002 and the official app in January 2006."

In [4]:
nlp_doc = nlp(t)

In [5]:
nlp = spacy.load("en_core_web_sm")

privacy_type_mapping_filename = 'privacy_type_mapping.csv'
privacy_type_mapping = pd.read_csv(
    privacy_type_mapping_filename,
    index_col=0,
    keep_default_na=False,
    converters={"Requirements": lambda x: x.split("\n") if x else None},
).to_dict('index')


def extract_email(text):
    return re.findall('[A-Za-z0-9]+[A-Za-z0-9._%+-]*@\w+.\w{2,4}', text)


def extract_phone(text):
    return re.findall('(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})', text)


def extract_chd(text):
    return re.findall('[0-9]{16}', text)

In [6]:
data_matchings = []
    
for entity in nlp_doc.ents:
    if entity.label_ in privacy_type_mapping and privacy_type_mapping[entity.label_]['Requirements']:
        data_matching_object = {
            'type': entity.label_,
            'value': entity.text,
            'requirements': privacy_type_mapping[entity.label_]['Requirements'],
        }
        data_matchings.append(data_matching_object)
        print(data_matching_object)

for extracted in extract_email(nlp_doc.text) + extract_phone(nlp_doc.text):
    data_matching_object = {
        'type': 'CONTACT',
        'value': extracted,
        'requirements': ['GLBA', 'CCPA', 'PIPEDA'],
    }
    data_matchings.append(data_matching_object)
    print(data_matching_object)

for extracted in extract_chd(nlp_doc.text):
    data_matching_object = {
        'type': 'CHD',
        'value': extracted,
        'requirements': ['PCI'],
    }
    data_matchings.append(data_matching_object)
    print(data_matching_object)

data_result = {
    'match': bool(data_matchings),
    'matchings': data_matchings,
}

{'type': 'PERSON', 'value': 'Android', 'requirements': ['GLBA', 'CCPA ', 'PIPEDA']}
{'type': 'DATE', 'value': 'September 2002', 'requirements': ['GLBA', 'CCPA', 'PIPEDA']}
{'type': 'DATE', 'value': 'January 2006', 'requirements': ['GLBA', 'CCPA', 'PIPEDA']}


In [12]:
def allindices(string, sub, offset=0):
    listindex=[]
    i = string.find(sub, offset)
    while i >= 0:
        listindex.append(i)
        i = string.find(sub, i + 1)
        
    res = [(l, l+len(sub)) for l in listindex]
    return res

In [13]:
allindices(t, "Android")

[(196, 203)]

In [14]:
for m in data_matchings:
    m['offsets'] = allindices(t, m['value'])

In [15]:
data_matchings

[{'type': 'PERSON',
  'value': 'Android',
  'requirements': ['GLBA', 'CCPA ', 'PIPEDA'],
  'offsets': [(196, 203)]},
 {'type': 'DATE',
  'value': 'September 2002',
  'requirements': ['GLBA', 'CCPA', 'PIPEDA'],
  'offsets': [(257, 271)]},
 {'type': 'DATE',
  'value': 'January 2006',
  'requirements': ['GLBA', 'CCPA', 'PIPEDA'],
  'offsets': [(296, 308)]}]

In [24]:
def include_all(l):
    return "(" + ", ".join(l) + ")"

In [20]:
offset2data = {}

In [21]:
all_offsets = []
for m in data_matchings:
    all_offsets += m['offsets']
    for o in m['offsets']:
        offset2data[o] = m

In [22]:
all_offsets.sort()

In [23]:
all_offsets

[(196, 203), (257, 271), (296, 308)]

In [28]:
starting_index = 0
seg = []
for idx, off in enumerate(all_offsets):
    if idx == 0:
        seg.append(t[0:off[0]])
    data = offset2data[off]
    seg.append((data['value'], data['type'] + "|" + include_all(data['requirements']), "#8ef"))
seg.append(t[off[1]:])

In [29]:
seg

['Google News is a news aggregator service developed by Google. It presents a continuous flow of articles organized from thousands of publishers and magazines. Google News is available as an app on ',
 ('Android', 'PERSON|(GLBA, CCPA , PIPEDA)', '#8ef'),
 ('September 2002', 'DATE|(GLBA, CCPA, PIPEDA)', '#8ef'),
 ('January 2006', 'DATE|(GLBA, CCPA, PIPEDA)', '#8ef'),
 '.']

In [9]:
import streamlit as st
from st_annotated_text import annotated_text

"""
# Annotated text example

Below is an example of how to use the annotated_text function:
"""

annotated_text(
    "This ",
    ("is", "verb", "#8ef"),
    " some ",
    ("annotated", "adj", "#faa"),
    ("text", "noun", "#afa"),
    " for those of ",
    ("you", "pronoun", "#fea"),
    " who ",
    ("like", "verb", "#8ef"),
    " this sort of ",
    ("thing", "noun", "#afa"),
)

ModuleNotFoundError: No module named 'st_annotated_text'

In [10]:
pip install st_annotated_text

Looking in indexes: https://registry.affirm-stage.com/artifactory/api/pypi/pypi/simple, https://pypi.org/simple
Looking in links: /var/cache/pip/wheelhouse
You should consider upgrading via the '/Users/jiaruixu/work_space/privacy_api/.env/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import matplotlib


ModuleNotFoundError: No module named 'matplotlib'