# Setup

## Clone Repo

In [None]:
# Fetch the project sources from GitHub.
!git clone https://github.com/d-wang-0/mask_public.git
# Later cells import `utils.*` and use paths relative to the repository root
# (e.g. dataset/input/), so work from inside the clone.
%cd mask_public/
# Switch to the `colab` ref of the repository.
!git checkout colab

## Install Requirements

In [None]:
!pip install gdown
!pip install plotly==5.10.0
!pip install pandas
!pip install nltk>=3.4.5

## Download Dataset
1. Download `2014_training-PHI-Gold-Set1.tar.gz` and `training-PHI-Gold-Set2.tar.gz` from the i2b2 dataset.
2. Upload them to Google Drive.
3. Share the files on Drive so that "Anyone on the Internet with the link can view".
4. Paste the links to the shared files below; the archives will be downloaded, extracted, and placed in the correct locations.

In [None]:
import gdown
url1 = "" #@param {type:"string"}
url2 = "" #@param {type:"string"}
# Download each shared archive to the local file name the extraction cell expects.
# fuzzy=True is passed so that pasted share links are accepted as-is.
for share_link, archive_name in ((url1, "set1.tar.gz"), (url2, "set2.tar.gz")):
    gdown.download(share_link, archive_name, fuzzy=True)

In [None]:
# Remove whatever files are currently in the input folder before extracting.
!rm dataset/input/*
# Extract the named top-level folder of each archive into dataset/input/.
# NOTE(review): --strip=1 is presumably tar's abbreviation of
# --strip-components=1 (drops the leading folder from extracted paths) —
# confirm with the tar version available on the runtime.
!tar -xf set1.tar.gz --strip=1 --directory dataset/input/ training-PHI-Gold-Set1/
!tar -xf set2.tar.gz --strip=1 --directory dataset/input/ training-PHI-Gold-Set2/ 

# Explore Data

## Load Data

In [None]:
from utils.readers import read_i2b2_data
import utils.spec_tokenizers
import plotly.express as px
import pandas as pd
from utils.spec_tokenizers import *
import random

# Read the extracted dataset. Later cells treat each document as a mapping
# with a "text" entry and a "tags" list whose items carry "tag" and "text".
input_dir = 'dataset/input/'
documents = read_i2b2_data(input_dir)

# Tokenize the documents into sequences. Later cells read each token as an
# indexable pair: token[0] is the word and token[1] is its label.
tokenize_to_seq = utils.spec_tokenizers.tokenize_to_seq
tokens_labels = tokenize_to_seq(documents)

## See full tag examples

In [None]:
# Group the text of every annotated span by its tag name:
# {tag name: [span text, span text, ...]}, in order of first appearance.
full_tag_examples = {}
for document in documents:
    for tag in document['tags']:
        # setdefault creates the list the first time a tag name is seen, so each
        # span is appended exactly once. (Previously the first span of every tag
        # was stored twice, inflating each tag's count by one.)
        full_tag_examples.setdefault(tag['tag'], []).append(tag['text'])

In [None]:
tag = 'DATE' #@param ["DATE", "ID", "NAME", "AGE", "CONTACT", "LOCATION", "PROFESSION", "PHI"]
num_examples = 5 #@param {type:"integer"}
# Show a random selection of full tagged spans for the chosen tag.
# A tag that never occurs in the loaded documents yields an empty list instead
# of a KeyError, and the sample size is capped at the number of available
# examples because random.sample raises ValueError when asked for more.
available_examples = full_tag_examples.get(tag, [])
random.sample(available_examples, min(num_examples, len(available_examples)))

## See tagged token examples

In [None]:
# Bucket every token's word under its label: {label: [word, word, ...]}.
# Keys appear in the order the labels are first encountered.
tag_examples = {}
for sequence in tokens_labels:
    for token in sequence:
        word, tag = token[0], token[1]
        # Start a new bucket on the first sighting of a label, then add the word.
        tag_examples.setdefault(tag, []).append(word)

In [None]:
tag = 'NAME' #@param ["DATE", "ID", "NAME", "AGE", "CONTACT", "LOCATION", "PROFESSION", "PHI"]
num_examples = 5 #@param {type:"integer"}
# Show a random selection of individual tokens carrying the chosen label.
# A label with no tokens yields an empty list instead of a KeyError, and the
# sample size is capped at the number of available tokens because
# random.sample raises ValueError when asked for more.
available_tokens = tag_examples.get(tag, [])
random.sample(available_tokens, min(num_examples, len(available_tokens)))

## See number of tags

In [None]:
include_O = True #@param {type:"boolean"}
# Skip the first label unless include_O is set.
# NOTE(review): this assumes the first key of tag_examples is the outside ("O")
# label, i.e. that it is the first label encountered — confirm.
start = 0 if include_O else 1
token_tag_names = list(tag_examples.keys())[start:]
token_tag_counts = [len(tag_examples[x]) for x in tag_examples][start:]
px.bar(
    x=token_tag_names,
    y=token_tag_counts,
    labels={
        "x": "Tags",
        "y": "Token Occurrences"
    },
    title="Number of token occurrences for each tag in the dataset",
    template="plotly_dark",
    width=700,
)

In [None]:
# Bar chart of how many annotated spans each tag has.
# full_tag_examples is keyed by the tag names found in the documents'
# annotations, so every key is plotted. The previous [1:] slice dropped the
# first tag encountered; presumably it was copied from the per-token chart,
# where the first key is skipped as the outside label.
full_tag_names = list(full_tag_examples.keys())
full_tag_counts = [len(full_tag_examples[name]) for name in full_tag_names]
px.bar(
    x=full_tag_names,
    y=full_tag_counts,
    labels={
        "x": "Tags",
        "y": "Tag Occurrences"
    },
    title="Number of tag occurrences for each tag in the dataset",
    template="plotly_dark",
    width=700,
)

In [None]:
include_O = False #@param {type:"boolean"}
# Skip the first label unless include_O is set.
# NOTE(review): this assumes the first key of tag_examples is the outside ("O")
# label — confirm.
start = 0 if include_O else 1
pie_names = list(tag_examples.keys())[start:]
pie_values = [len(tag_examples[x]) for x in tag_examples][start:]
px.pie(
    names=pie_names,
    values=pie_values,
    width=700,
    template="plotly_dark",
)

## Length of documents

In [None]:
# Build per-document statistics as parallel lists (one entry per document):
# stats[<label>] holds how many annotations with that tag name the document has,
# and stats['length'] holds the character length of the document text.
stats = {label: [] for label in tag_examples}
stats['length'] = []
for document in documents:
    stats['length'].append(len(document["text"]))

    # Tally this document's annotations by tag name.
    document_tags_count = {}
    for annotation in document["tags"]:
        tag_name = annotation["tag"]
        document_tags_count[tag_name] = document_tags_count.get(tag_name, 0) + 1

    # Record a count for every known label, using 0 when the document has none,
    # so that all lists stay the same length.
    for label in tag_examples:
        stats[label].append(document_tags_count.get(label, 0))

In [None]:
# One row per document: a column per label plus the 'length' column.
df = pd.DataFrame(stats)
# Distribution of document lengths, measured in characters.
px.box(
    df,
    y="length",
    width=700,
    height=500,
    title="Length of the documents (characters)",
    template="plotly_dark",
    labels={'length': 'Characters'},
)

## Length of sequences

In [None]:
# Number of tokens in each tokenized sequence.
sequence_lengths = list(map(len, tokens_labels))
px.box(
    y=sequence_lengths,
    width=700,
    height=500,
    title="Length of sequences (tokens)",
    template="plotly_dark",
    labels={'y': "Tokens"},
)

## Tag occurrences per document

In [None]:
# Per-document occurrence counts for every label column except the first.
# NOTE(review): this assumes the first key of tag_examples is the outside ("O")
# label — confirm.
tag_columns = list(tag_examples)[1:]
px.box(
    df,
    y=tag_columns,
    width=700,
    height=500,
    title="Tags occurrences in documents",
    template="plotly_dark",
    points=False,
)