# Project Final Code

Objective statement:
- I will create baseline classification standards to predict whether an Android API method manipulates/returns information associated with data types defined in privacy policies by using classical sparse vector space representations of the data with standard linear and non-linear classifiers.
- As a secondary objective I will contrast using nltk for text processing and data exploration with using the newer spaCy and gensim packages. 

## Obtaining the Data
The data required for this project may be obtained as follows.

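The notebook expects the following CSV files in a `proj_data/` directory located alongside it:

- `android_semi_cleaned.csv` – the method documentation used throughout the notebook
- `android_cleaned_ANNOTATED_ONLY.csv` – the annotated subset
- `mappings_cleaned.csv` – the method mappings

TODO: document where these files can be downloaded or how they are generated.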

## Setup

In [292]:
import os
import csv
from collections import Counter
import re
import numpy as np
import pandas as pd
import pprint
import spacy
from spacy.tokens import Doc
import matplotlib.pyplot as plt
from gensim.summarization.textcleaner import get_sentences
from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.tokenize.api.StringTokenizer import tokenize

# Directory holding the CSV inputs (a relative path).
datadir = 'proj_data'
# Names of the input files inside datadir.
source_filename = 'android_semi_cleaned.csv'
annotated_filename = 'android_cleaned_ANNOTATED_ONLY.csv'
mappings_data_filename = 'mappings_cleaned.csv'
# Full paths to each input file, built from datadir and the names above.
source_path = os.path.join(datadir, source_filename)
mappings_path = os.path.join(datadir, mappings_data_filename) 
annotated_path = os.path.join(datadir, annotated_filename)


## Data Preparation

### Accessing data

In [4]:
def access_data(path):
    '''
    Read a CSV file and return its first two columns as parallel lists.

    path: location of a UTF-8 encoded CSV file in the csv module's 'excel' dialect.

    Returns (methods, docs): methods holds column 0 of every row and docs holds
    column 1, in file order.  Blank lines are skipped.  A non-blank row with
    fewer than two columns still raises IndexError.
    '''
    methods = []
    docs = []
    with open(path, 'r', newline='', encoding='utf-8') as f:
        for row in csv.reader(f, dialect='excel'):
            # csv.reader yields an empty list for a blank line; it carries no record
            if not row:
                continue
            methods.append(row[0])
            docs.append(row[1])
    return methods, docs



### Remove HTML

In [13]:
# Matches an opening, closing or self-closing tag.  Attribute values may be
# double-quoted, single-quoted or unquoted; the unquoted form is a run of
# characters that are not quotes, '>' or whitespace.
_HTML_TAG_RE = re.compile(
    r'''</?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>'''
)


def my_html_remover(doc):
    '''
    Return doc with every substring that looks like an HTML tag removed.

    Only the tags themselves are deleted; the text between them is kept.
    Anything of the form <word ...> is treated as a tag, so a string such as
    "<String>" is removed as well.
    '''
    return _HTML_TAG_RE.sub('', doc)

### General Cleaning

In [203]:
def my_cleaner(docs):
    '''
    Normalise whitespace and strip HTML tags from each document.

    docs: iterable of strings.

    Returns a new list with one cleaned string per input document.  Every run
    of whitespace (spaces, tabs, newlines) is first collapsed to a single
    space, then my_html_remover deletes the tags.
    '''
    # whitespace is standardised before the tags are removed
    return [my_html_remover(re.sub(r'\s+', ' ', raw_doc)) for raw_doc in docs]

def get_docs(path):
    '''
    Load the CSV at path and return its cleaned contents as a DataFrame.

    The frame has two columns: 'docs' (documentation text after my_cleaner)
    and 'methods' (the first CSV column, as read by access_data).
    '''
    methods, raw_docs = access_data(path)
    cleaned_docs = my_cleaner(raw_docs)
    frame = pd.DataFrame({'docs': cleaned_docs, 'methods': methods})
    return frame

### Notes
Other cleaning such as punctuation removal and lemmatization will not be done here and will be considered later in the presentation. 

## Data Exploration

For the purpose of demonstration, I will show spaCy's functionality in pieces, but normally text processing with spaCy is pipelined automatically, with options for minor alterations. spaCy's stated philosophy is to offer only the best tool for each job rather than a menu of alternatives.

### Sentence tokenization with NLTK

In [204]:
def sent_toks_nltk(docs):
    '''
    Split every document into sentences with NLTK's sent_tokenize.

    Returns one list of sentences per document.  Colons, periods, spaces and
    double quotes are stripped from both ends of each sentence.
    '''
    tokenized_docs = []
    for doc in docs:
        sentences = sent_tokenize(doc)
        tokenized_docs.append([sentence.strip(':. "') for sentence in sentences])
    return tokenized_docs

### Sentence tokenization with gensim

In [205]:
def sent_toks_gensim(docs):
    '''
    Split every document into sentences with gensim's get_sentences.

    Returns one list of sentences per document.  Colons, periods, spaces,
    double quotes and newlines are stripped from both ends of each sentence.

    NOTE(review): get_sentences comes from gensim.summarization, which is not
    shipped with gensim 4.x -- confirm the pinned gensim version.
    '''
    tokenized_docs = []
    for doc in docs:
        sentences = get_sentences(doc)
        tokenized_docs.append([sentence.strip(':. "\n') for sentence in sentences])
    return tokenized_docs

### Sentence tokenization comparison

In [260]:
def get_differences(orig_docs, nltk_docs, gensim_docs):
    '''
    Find the sentences on which the two sentence tokenizers disagree.

    orig_docs: the original documents.
    nltk_docs, gensim_docs: for each document, its list of sentences from each
        tokenizer, in the same order as orig_docs.

    Returns (diff_cnt, diff_examples, same):
      diff_cnt      -- number of recorded differences
      diff_examples -- DataFrame with columns 'Original', 'nltk' and 'gensim',
                       one row per recorded difference
      same          -- gensim sentences from the positions that were not recorded
    '''
    originals, nltk_side, gensim_side = [], [], []
    same = []
    for orig_doc, nltk_doc, gensim_doc in zip(orig_docs, nltk_docs, gensim_docs):
        # sentences already reported for this document, to prevent over-counting
        reported = []

        # Sentences are compared position by position (up to the shorter list)
        # so the pairing stays aligned; set operations would lose that alignment.
        for nltk_sent, gensim_sent in zip(nltk_doc, gensim_doc):
            disagrees = nltk_sent not in gensim_doc or gensim_sent not in nltk_doc
            # a pair is skipped when either sentence is a substring of one already reported
            if disagrees and not any(
                gensim_sent in earlier or nltk_sent in earlier for earlier in reported
            ):
                reported += [nltk_sent, gensim_sent]
                originals.append(orig_doc)
                nltk_side.append(nltk_sent)
                gensim_side.append(gensim_sent)
            else:
                same.append(gensim_sent)

    diff_examples = pd.DataFrame(
        {'Original': originals, 'nltk': nltk_side, 'gensim': gensim_side}
    )
    return len(nltk_side), diff_examples, same
        
        
def compare_sent_tokenization(docs):
    '''
    Print a report comparing NLTK and gensim sentence tokenization of docs.

    docs: list of document strings.

    The report lists the number of documents, the number of differences found
    by get_differences, the sentence totals per tokenizer, the average
    whitespace-token length of the differing sentences, and -- among the
    documents whose sentence counts differ -- the share in which NLTK produced
    more sentences (the gensim share is its complement).  It ends with up to
    three example differences taken from documents of at most 100
    whitespace-separated tokens.

    Nothing is returned.  When there is nothing to average over, the affected
    figure is printed as nan instead of raising ZeroDivisionError.
    '''
    def ratio(numerator, denominator):
        # nan keeps the report printable when the denominator is zero
        return numerator / denominator if denominator else float('nan')

    nltk_docs_tokenized = sent_toks_nltk(docs)
    gensim_docs_tokenized = sent_toks_gensim(docs)
    diff_cnt, diff_examples, _ = get_differences(docs, nltk_docs_tokenized, gensim_docs_tokenized)

    num_nltk_sents = sum(len(d) for d in nltk_docs_tokenized)
    num_gensim_sents = sum(len(d) for d in gensim_docs_tokenized)
    avg_diff_len_nltk = ratio(sum(len(s.split()) for s in diff_examples['nltk']), diff_cnt)
    avg_diff_len_gensim = ratio(sum(len(s.split()) for s in diff_examples['gensim']), diff_cnt)

    # sentence counts for the documents where the two tokenizers disagree on the count
    diff_sent_lens = [(len(doc1), len(doc2))
                      for doc1, doc2 in zip(nltk_docs_tokenized, gensim_docs_tokenized)
                      if len(doc1) != len(doc2)]
    nltk_greater = sum(1 for n_nltk, n_gensim in diff_sent_lens if n_nltk > n_gensim)
    frac_nltk_greater = ratio(nltk_greater, len(diff_sent_lens))

    print(f"Total number of documents: {len(docs):,}")
    print(f"Number of differences: {diff_cnt:,}")
    print(f"{'NLTK':>65}{'Gensim':>20}")
    print(f"{'Number of senteces:':>45}{num_nltk_sents:>20,}{num_gensim_sents:>20,}")
    print(f"{'Avg. #tokens of differences:':>45}{avg_diff_len_nltk:>20.3f}{avg_diff_len_gensim:>20.3f}")
    print(f"{'Pct. documents with the most senteces:':>45}{frac_nltk_greater:>20.2%}{(1-frac_nltk_greater):>20.2%}")
    print('\nExamples of differences:')

    shown_originals = set()
    printed = 0
    # columns are read by name so the loop does not depend on their order in the frame
    examples = zip(diff_examples['Original'], diff_examples['nltk'], diff_examples['gensim'])
    for orig, nltk_sent, gensim_sent in examples:
        if len(orig.split()) > 100:
            continue
        if printed >= 3:
            break
        # the original document is printed once, ahead of its first difference
        if orig not in shown_originals:
            print('\nORIGINAL:', orig)

        print('NLTK:', nltk_sent)
        print('GENSIM:', gensim_sent)
        shown_originals.add(orig)
        printed += 1
    
# Load and clean the source corpus, then print the NLTK-vs-gensim sentence tokenization report.
df = get_docs(source_path)
compare_sent_tokenization(df['docs'].to_list())

Total number of documents: 29,402
Number of differences: 527
                                                             NLTK              Gensim
                          Number of senteces:              73,158              73,154
                 Avg. #tokens of differences:              18.139              20.767
       Pct. documents with the most senteces:              51.10%              48.90%

Examples of differences:

ORIGINAL: Called by a device admin to set the short support message. This will be displayed to the user in settings screens where funtionality has been disabled by the admin. The message should be limited to a short statement such as "This setting is disabled by your administrator. Contact someone@example.com for support." If the message is longer than 200 characters it may be truncated. If the short support message needs to be localized it is the responsibility of the DeviceAdminReceiver to listen to the Intent#ACTION_LOCALE_CHANGED broadcast and set a new vers

### Tokenization with NLTK

In [219]:
def nltk_tokenize(sent):
    '''
    Tokenize the string sent with NLTK's word_tokenize and return the result.
    '''
    tokens = word_tokenize(sent)
    return tokens

### Tokenization with Spacy
SpaCy provides models that are automatically pipelined, so one call will tokenize, tag, and identify named entities. For the purpose of comparison I will separate them here.

In [220]:
def spacy_tokenize(sent, model):
    '''
    Run model on sent and return the text of every token it produces.

    sent: the string to tokenize.
    model: a callable (here a loaded spaCy pipeline) whose result can be
        iterated to give tokens with a .text attribute.
    '''
    token_texts = []
    for token in model(sent):
        token_texts.append(token.text)
    return token_texts

### Tokenization comparison 

In [229]:
def compare_tokenization(docs):
    '''
    Tokenize every text in docs with both NLTK and spaCy.

    docs: list of strings.

    Returns (nltk_tok_by_sent, spacy_tok_by_sent, diff_docs).  The first two
    hold one token list per input text, in input order.  diff_docs holds the
    (nltk_tokens, spacy_tokens) pairs whose token counts differ.

    The index of every 5000th text is printed as a progress indicator.
    '''
    model = spacy.load('en_core_web_sm')
    nltk_tok_by_sent, spacy_tok_by_sent = [], []

    # the tagger, parser and NER components are switched off for this pass
    pipeline = model.pipe(docs, disable=["tagger", "parser", "ner"])
    for index, doc in enumerate(pipeline):
        if index % 5000 == 0:
            print(index)
        nltk_tok_by_sent.append(nltk_tokenize(doc.text))
        spacy_tok_by_sent.append([tok.text for tok in doc])

    diff_docs = []
    for pair in zip(nltk_tok_by_sent, spacy_tok_by_sent):
        nltk_toks, spacy_toks = pair
        if len(nltk_toks) != len(spacy_toks):
            diff_docs.append(pair)
    return nltk_tok_by_sent, spacy_tok_by_sent, diff_docs
# Reload the cleaned corpus and tokenize every document with both NLTK and spaCy.
df = get_docs(source_path)
nltk_tok_by_sent, spacy_tok_by_sent, diff_docs = compare_tokenization(df['docs'].to_list())

0
5000
10000
15000
20000
25000


In [301]:
def show_diffs(nltk_tok_by_sent, spacy_tok_by_sent, diff_docs):
    '''
    Print summary statistics and short examples of NLTK/spaCy tokenization differences.

    nltk_tok_by_sent, spacy_tok_by_sent: parallel lists of token lists, one per text.
    diff_docs: list of (nltk_tokens, spacy_tokens) pairs for the texts that
        were tokenized differently.

    The report covers overall token totals and averages, the same figures
    restricted to diff_docs, and up to ten example pairs in which both
    tokenizations have at most ten tokens.

    Nothing is returned.  Averages over an empty collection are printed as nan
    instead of raising, so the function also works when diff_docs is empty.
    '''
    def ratio(numerator, denominator):
        # nan keeps the report printable when the denominator is zero
        return numerator / denominator if denominator else float('nan')

    nltk_tot_tokens = sum(len(sent) for sent in nltk_tok_by_sent)
    spacy_tot_tokens = sum(len(sent) for sent in spacy_tok_by_sent)
    nltk_avg_tokens_tot = ratio(nltk_tot_tokens, len(nltk_tok_by_sent))
    spacy_avg_tokens_tot = ratio(spacy_tot_tokens, len(spacy_tok_by_sent))

    # iterate the pairs directly; unzipping with zip(*diff_docs) fails on an empty list
    nltk_tot_toks_of_diff = sum(len(nltk_toks) for nltk_toks, _ in diff_docs)
    spacy_tot_toks_of_diff = sum(len(spacy_toks) for _, spacy_toks in diff_docs)
    nltk_avg_tokens_diff = ratio(nltk_tot_toks_of_diff, len(diff_docs))
    spacy_avg_tokens_diff = ratio(spacy_tot_toks_of_diff, len(diff_docs))

    print(f'Number of documents with different tokenization: {len(diff_docs)} ', end='')
    print(f'Fraction of total: {ratio(len(diff_docs), len(nltk_tok_by_sent)):.2%}')
    print(f"{'NLTK':>65}{'SpaCy':>20}")
    print(f"{'Number of tokens:':>45}{nltk_tot_tokens:>20,}{spacy_tot_tokens:>20,}")
    print(f"{'Avg. tokens per sentence:':>45}{nltk_avg_tokens_tot:>20,.3f}{spacy_avg_tokens_tot:>20,.3f}")
    print(f"{'Number of tokens of differences:':>45}{nltk_tot_toks_of_diff:>20,}{spacy_tot_toks_of_diff:>20,}")
    print(f"{'Avg. #tokens of differences:':>45}{nltk_avg_tokens_diff:>20.3f}{spacy_avg_tokens_diff:>20.3f}")
    print('\nExamples of differences:')

    pp = pprint.PrettyPrinter(indent=2, width=110)
    printed = 0
    for nltk_toks, spacy_toks in diff_docs:
        # only short examples are shown
        if len(nltk_toks) > 10 or len(spacy_toks) > 10:
            continue
        if printed >= 10:
            break
        print('\nNLTK:')
        pp.pprint(nltk_toks)
        print('SpaCy:')
        pp.pprint(spacy_toks)
        printed += 1
# Print the NLTK-vs-spaCy tokenization report for the results computed above.
show_diffs(nltk_tok_by_sent, spacy_tok_by_sent, diff_docs)

Number of documents with different tokenization: 9839 Fraction of total: 33.46%
                                                             NLTK               SpaCy
                            Number of tokens:           1,262,017           1,262,671
                    Avg. tokens per sentence:              42.923              42.945
             Number of tokens of differences:             769,146             769,800
                 Avg. #tokens of differences:              78.173              78.240

Examples of differences:

NLTK:
['Returns', 'the', 'current', 'setStructuredData', '(', 'String', ')', '.']
SpaCy:
['Returns', 'the', 'current', 'setStructuredData(String', ')', '.']

NLTK:
['Returns', 'the', 'system-wide', 'Private', 'DNS', 'host', '.']
SpaCy:
['Returns', 'the', 'system', '-', 'wide', 'Private', 'DNS', 'host', '.']

NLTK:
['Returns', 'the', 'system-wide', 'Private', 'DNS', 'mode', '.']
SpaCy:
['Returns', 'the', 'system', '-', 'wide', 'Private', 'DNS', 'mode', '.']

N

### POS tagging w/ NLTK

### POS tagging w/ SpaCy

### POS tagging comparison

### NER w/ NLTK

### NER w/ SpaCy

### NER comparison

### Semantic similarity w/ NLTK

### Semantic similarity w/ SpaCy

### Semantic similarity comparison

In [None]:
def process_matches(doc, expression, ent_label=None, tok_attrs=None):
    '''
    Turn every regex match in doc.text into a single entity token.

    doc: the parsed document; it is modified in place and also returned.
    expression: regular expression searched for in doc.text.
    ent_label: label given to the span created for each match.
    tok_attrs: attributes passed to the retokenizer for the merged token.

    Matches equal to 'e.g.' or 'i.e.' are skipped, and so are matches for
    which doc.char_span returns None (presumably because the match does not
    line up with token boundaries -- confirm against the spaCy documentation).
    '''
    ignored = ['e.g.', 'i.e.']
    for found in re.finditer(expression, doc.text):
        if found.group(0) in ignored:
            continue
        span = doc.char_span(found.start(), found.end(), label=ent_label)
        if span is None:
            continue
        # register the span as an entity, then collapse its tokens into one
        doc.ents = [*doc.ents, span]
        with doc.retokenize() as retokenizer:
            retokenizer.merge(span, attrs=tok_attrs)
    return doc


def my_retokenizer(doc):
    '''
    Merge dotted names and camelCase identifiers in doc into single tokens.

    Each match is labelled as an 'MT_OR_CL' entity and its merged token is
    given the POS attribute 'PROPN'.  The dotted-name pattern is applied
    before the camelCase pattern.  Returns the document.
    '''
    patterns = (
        r'([A-Za-z]+\.)+[A-Za-z]+\.?',   # letter runs joined by dots, e.g. a.b.c
        r'[A-Z]*[a-z]+([A-Z]+[a-z]*)+',  # lower case followed by an upper-case hump
    )
    for pattern in patterns:
        doc = process_matches(doc, pattern, ent_label='MT_OR_CL', tok_attrs={'POS' : 'PROPN'})
    return doc


def doc_info(docs):
    '''
    Parse each record with spaCy and tally its tokens and POS tags.

    docs: iterable of records whose item 0 is kept as an identifier and whose
        item 1 is the text to parse.

    Returns the parsed documents as (identifier, doc) pairs, the token counter,
    the POS tag counter, and a dict mapping each POS tag to a counter of the
    token texts that received it.

    NOTE(review): passing a plain function to nlp.add_pipe looks like the
    spaCy v2 API -- confirm the installed spaCy version.
    '''
    nlp = spacy.load("en_core_web_sm", disable=["parser", 'ner'])
    # the custom retokenizer runs ahead of the other pipeline components
    nlp.add_pipe(my_retokenizer, first=True)

    tok_cnt, pos_cnt = Counter(), Counter()
    pos_byword_cnt = {}
    parsed_docs = []
    for record in docs:
        parsed = nlp(record[1])
        parsed_docs.append((record[0], parsed))
        for tok in parsed:
            tok_cnt[tok.text] += 1
            pos_cnt[tok.pos_] += 1
            # create the per-tag counter the first time a tag is seen
            pos_byword_cnt.setdefault(tok.pos_, Counter())[tok.text] += 1

    return parsed_docs, tok_cnt, pos_cnt, pos_byword_cnt


def display_info(parsed_docs, method_documents, tok_cnt, pos_cnt, pos_byword_cnt, classes=None):
    '''
    Print corpus statistics, one per line, each followed by '<br>'.

    parsed_docs: list of (method, doc) pairs; doc provides .text, .ents and len().
    method_documents: dict mapping each method to a sized collection; its
        length is reported as that method's sentence count.
    tok_cnt: Counter of token texts.
    pos_cnt: Counter of POS tags.
    pos_byword_cnt: dict mapping a POS tag to a Counter of token texts.
    classes: optional collection of class names; when given (and non-empty)
        the number of distinct classes is printed first.

    Nothing is returned.  A statistic that cannot be computed because its
    input is empty (no documents, no tokens, no 'PROPN' tokens, no
    'MT_OR_CL' entities) is printed as None instead of raising.
    '''
    def top(counter):
        # most common (item, count) pair, or None for an empty counter
        ranked = counter.most_common(1)
        return ranked[0] if ranked else None

    # keep the first parsed doc for each distinct text
    unique_sents = Counter()
    unique_docs = []
    for method, doc in parsed_docs:
        if doc.text not in unique_sents:
            unique_docs.append((method, doc))
        unique_sents[doc.text] += 1

    if classes:
        print('\t>-total number of classes:', len(set(classes)), '<br>')
        print('\t -total number of methods:', len(method_documents), '<br>')
    else:
        print('\t>-total number of methods:', len(method_documents), '<br>')
    print('\t -total records after transform:', len(parsed_docs), '<br>')
    print('\t -number of unique records after transform:', len(unique_sents), '<br>')
    print('\t -method with most sentences:',
          max(((key, len(value)) for key, value in method_documents.items()),
              key=lambda x: x[1], default=None), '<br>')
    print('\t -method with most tokens:',
          max(((method, len(doc)) for method, doc in unique_docs),
              key=lambda x: x[1], default=None), '<br>')
    print('\t -total number of tokens:', sum(tok_cnt.values()), '<br>')
    print("\t -num unique tokens:", len(tok_cnt), '<br>')
    print('\t -most common tokens (with 5 or more chars):'
          , [tup for tup in tok_cnt.most_common() if len(tup[0]) > 4][:3], '<br>')

    top_pos = top(pos_cnt)
    most_freq_pos = top_pos[0] if top_pos else None
    print('\t -most frequent POS tag:', most_freq_pos, '<br>')
    print('\t -most common words in that tag:',
          top(pos_byword_cnt.get(most_freq_pos, Counter())), '<br>')
    print('\t -most frequent proper noun:',
          top(pos_byword_cnt.get('PROPN', Counter())), '<br>')

    method_and_class_toks = [ent.text for p_doc in parsed_docs for ent in p_doc[1].ents
                             if ent.label_ == 'MT_OR_CL']
    # the first figure counts every occurrence, the second only distinct entity texts
    print('\t -total number of domain-specific named entities:', len(method_and_class_toks), '<br>')
    print('\t -number of unique domain-specific named entities:', len(set(method_and_class_toks)), '<br>')
    print('\t -most frequent domain-specific named entity:'
          , top(Counter(method_and_class_toks)), '<br>')
    print()

def process_doc_and_display_attrs(path, mappings_path=None):
    '''
    Parse the documentation in the CSV at path and print its statistics.

    path: CSV file read with access_data (column 0 = method, column 1 = text).
    mappings_path: optional CSV file, also read with access_data.  Its first
        column is treated as method names and its second as class names,
        matching how the previous version indexed each mapping record
        (NOTE(review): confirm against the mappings file).  When given, only
        records whose method appears in the mappings are processed.

    Returns (parsed_docs, tok_cnt) as produced by doc_info.
    '''
    # access_data returns two parallel lists; doc_info expects (method, text) records
    methods, texts = access_data(path)
    docs = list(zip(methods, texts))
    classes = None
    if mappings_path:
        mapped_methods, classes = access_data(mappings_path)
        mapped_methods = set(mapped_methods)  # constant-time membership tests
        docs = [doc for doc in docs if doc[0] in mapped_methods]

    # Group the texts by method.  display_info reports the length of each group
    # as the method's sentence count.
    # NOTE(review): each entry here is a whole documentation record, not a sentence.
    doc_by_method = {}
    for method, text in docs:
        doc_by_method.setdefault(method, []).append(text)

    parsed_docs, tok_cnt, pos_cnt, pos_byword_cnt = doc_info(docs)
    print('INFO FOR', path)
    display_info(parsed_docs, doc_by_method, tok_cnt, pos_cnt, pos_byword_cnt, classes=classes)
    return parsed_docs, tok_cnt

## Modeling

## Presentation Graphic(s)

## Project approach and overall execution
Do not put anything below this cell

## Code Structure and Organization

## Code Commenting