# Project Final Code

Objective statement:
- I will create baseline classification standards to predict whether an Android API method manipulates/returns information associated with data types defined in privacy policies by using classical sparse vector space representations of the data with standard linear and non-linear classifiers.
- As a secondary objective, I will contrast using NLTK for text processing and data exploration with using the newer spaCy and gensim packages.

## Obtaining the Data
The data required for this project may be obtained as follows.

YOUR ANSWER HERE

## Setup

In [None]:
import os
import csv
from collections import Counter
import re
import spacy
from spacy.tokens import Doc
import numpy as np
import matplotlib.pyplot as plt

## Data Preparation

### Accessing data

In [None]:
def access_data(path):
    """Read the first two columns of a CSV file into parallel lists.

    The first column of every row is collected into ``methods`` and the
    second column into ``docs``; any further columns are ignored.

    Args:
        path: Path of a UTF-8 encoded CSV file readable with the ``excel``
            dialect.

    Returns:
        A ``(methods, docs)`` tuple of two equally long lists, where
        ``methods[i]`` and ``docs[i]`` come from the same row.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    methods = []
    docs = []
    with open(path, 'r', newline='', encoding='utf-8') as f:
        for row in csv.reader(f, dialect='excel'):
            # csv.reader yields an empty list for a blank line; skip it
            # rather than failing on row[0].
            if not row:
                continue
            method, doc = row[0], row[1]
            methods.append(method)
            docs.append(doc)
    return methods, docs



### Remove HTML

In [None]:
def my_html_remover(doc):
    """Return ``doc`` with every ``<...>`` tag removed.

    A tag is any run of text that starts with ``<`` and ends at the next
    ``>``; the text between tags is left untouched.

    Args:
        doc: The string to strip tags from.

    Returns:
        A new string with all such tags deleted.
    """
    # re.sub needs (pattern, replacement, string); the empty replacement
    # deletes each matched tag.
    return re.sub(r"<[^>]*>", "", doc)

### General Cleaning

In [None]:
def my_cleaner(docs):
    """Clean every string in ``docs`` and return the results as a new list.

    For each string, every run of whitespace (spaces, tabs, newlines) is
    first collapsed to a single space, and the result is then passed
    through ``my_html_remover``.

    Args:
        docs: Iterable of strings to clean.

    Returns:
        A list with one cleaned string per input string, in input order.
    """
    return [my_html_remover(re.sub(r'\s+', ' ', text)) for text in docs]

### Notes
Other cleaning such as punctuation removal and lemmatization will not be done here and will be considered later in the presentation. 

## Data Exploration

For the purpose of demonstration, I will show spaCy's functionality in pieces, but normally text processing with spaCy is pipelined automatically, with options for minor alterations. The tagline from spaCy is that _____ only the best tools for the job.

### Sentence tokenization with NLTK

### Sentence tokenization with gensim

In [None]:
# Show `docs` as this cell's output.
# NOTE(review): `docs` is not assigned in any cell shown above; presumably it
# holds the output of access_data()/my_cleaner() - confirm before running.
docs

### Sentence tokenization comparison

### Tokenization with NLTK

### Tokenization with spaCy

### Tokenization comparison 

### POS tagging w/ NLTK

### POS tagging w/ SpaCy

### POS tagging comparison

### NER w/ NLTK

### NER w/ SpaCy

### NER comparison

### Semantic similarity w/ NLTK

### Semantic similarity w/ SpaCy

### Semantic similarity comparison

In [None]:
def process_matches(doc, expression, ent_label=None, tok_attrs=None):
    """Turn every regex match in ``doc.text`` into a merged, labelled entity.

    Each match of ``expression`` (other than the literal abbreviations
    ``e.g.`` and ``i.e.``) is converted to a span with ``doc.char_span``.
    When a span is returned, it is appended to ``doc.ents`` and its tokens
    are merged into one through ``doc.retokenize()``.

    Args:
        doc: Object exposing ``text``, ``char_span``, ``ents`` and
            ``retokenize`` (a spaCy ``Doc`` in this file's pipeline).
        expression: Regular expression searched for in ``doc.text``.
        ent_label: Label passed to ``doc.char_span`` for the new span.
        tok_attrs: Attributes passed as ``attrs`` to the merge call.

    Returns:
        The same ``doc`` object, modified in place.
    """
    skipped = ('e.g.', 'i.e.')
    for found in re.finditer(expression, doc.text):
        if found.group(0) in skipped:
            continue
        begin, stop = found.span()
        entity = doc.char_span(begin, stop, label=ent_label)
        # char_span may return None; such matches are left untouched.
        if entity is None:
            continue
        doc.ents = list(doc.ents) + [entity]
        with doc.retokenize() as merger:
            merger.merge(entity, attrs=tok_attrs)
    return doc


def my_retokenizer(doc):
    """Label and merge dotted and mixed-case identifiers in ``doc``.

    Runs ``process_matches`` twice, in order, with the entity label
    ``MT_OR_CL`` and the token attribute ``POS`` set to ``PROPN``.

    Args:
        doc: The document handed to ``process_matches``.

    Returns:
        The document returned by the last ``process_matches`` call.
    """
    patterns = (
        # Letter runs joined by dots, with an optional trailing dot (a.b.c).
        r'([A-Za-z]+\.)+[A-Za-z]+\.?',
        # Lowercase run followed by at least one uppercase group (fooBar).
        r'[A-Z]*[a-z]+([A-Z]+[a-z]*)+',
    )
    for pattern in patterns:
        doc = process_matches(
            doc, pattern, ent_label='MT_OR_CL', tok_attrs={'POS': 'PROPN'}
        )
    return doc


def doc_info(docs):
    '''
    Parse every record with spaCy and tally token and POS statistics.

    Loads the "en_core_web_sm" pipeline with the parser and NER disabled and
    puts my_retokenizer first in the pipeline. Each element of docs is
    indexed as (key, text): element [1] is parsed, element [0] is kept
    alongside the parsed result.

    Returns the parsed documents as (key, parsed_doc) tuples, the token
    counter, the POS tag counter, and a dict mapping each POS tag to a
    counter of the token texts seen with that tag.
    '''
    nlp = spacy.load("en_core_web_sm", disable=["parser", 'ner'])
    nlp.add_pipe(my_retokenizer, first=True)

    parsed_docs = []
    tok_cnt, pos_cnt = Counter(), Counter()
    pos_byword_cnt = {}
    for record in docs:
        parsed = nlp(record[1])
        parsed_docs.append((record[0], parsed))
        for token in parsed:
            tok_cnt[token.text] += 1
            pos_cnt[token.pos_] += 1
            # Create the per-tag counter on first sight of the tag.
            pos_byword_cnt.setdefault(token.pos_, Counter())[token.text] += 1

    return parsed_docs, tok_cnt, pos_cnt, pos_byword_cnt


def display_info(parsed_docs, method_documents, tok_cnt, pos_cnt, pos_byword_cnt, classes=None):
    """Print summary statistics for a parsed corpus.

    Every line is printed with a trailing ``<br>`` marker.

    Args:
        parsed_docs: Sequence of ``(method, doc)`` pairs; each ``doc`` must
            expose ``text``, ``len()`` and ``ents`` (whose items expose
            ``text`` and ``label_``).
        method_documents: Mapping from method to a sized collection; its
            length is reported as the method's sentence count.
        tok_cnt: ``Counter`` of token texts.
        pos_cnt: ``Counter`` of POS tags.
        pos_byword_cnt: Mapping from POS tag to a ``Counter`` of token texts;
            must contain the key ``'PROPN'``.
        classes: Optional collection of class labels; when truthy, the number
            of distinct classes is printed first.

    Raises:
        ValueError: If ``method_documents`` or ``parsed_docs`` is empty.
        IndexError: If a counter needed for a "most frequent" line is empty.
        KeyError: If ``pos_byword_cnt`` has no ``'PROPN'`` entry.
    """
    # Keep only the first record seen for each distinct document text.
    seen_texts = set()
    unique_docs = []
    for method, doc in parsed_docs:
        if doc.text not in seen_texts:
            seen_texts.add(doc.text)
            unique_docs.append((method, doc))

    if classes:
        print('\t>-total number of classes:', len(set(classes)), '<br>')
        print('\t -total number of methods:', len(method_documents), '<br>')
    else:
        print('\t>-total number of methods:', len(method_documents), '<br>')
    print('\t -total records after transform:', len(parsed_docs), '<br>')
    print('\t -number of unique records after transform:', len(seen_texts), '<br>')
    print('\t -method with most sentences:',
          max(((key, len(sents)) for key, sents in method_documents.items()),
              key=lambda pair: pair[1]), '<br>')
    print('\t -method with most tokens:',
          max(((method, len(doc)) for method, doc in unique_docs),
              key=lambda pair: pair[1]), '<br>')
    print('\t -total number of tokens:', sum(tok_cnt.values()), '<br>')
    print("\t -num unique tokens:", len(tok_cnt), '<br>')
    print('\t -most common tokens (with 5 or more chars):',
          [tup for tup in tok_cnt.most_common() if len(tup[0]) > 4][:3], '<br>')
    most_freq_pos = pos_cnt.most_common(1)[0][0]
    print('\t -most frequent POS tag:', most_freq_pos, '<br>')
    print('\t -most common words in that tag:', pos_byword_cnt[most_freq_pos].most_common(1)[0], '<br>')
    print('\t -most frequent proper noun:', pos_byword_cnt['PROPN'].most_common(1)[0], '<br>')
    method_and_class_toks = [ent.text for _, doc in parsed_docs for ent in doc.ents
                             if ent.label_ == 'MT_OR_CL']
    # This line reports every occurrence; the next one reports distinct texts.
    print('\t -total number of domain-specific named entities:', len(method_and_class_toks), '<br>')
    print('\t -number of unique domain-specific named entities:', len(set(method_and_class_toks)), '<br>')
    print('\t -most frequent domain-specific named entity:',
          Counter(method_and_class_toks).most_common()[0], '<br>')
    print()

def process_doc_and_display_attrs(path, mappings_path=None):
    """Load a corpus, parse it, print its statistics and return the parse.

    ``access_data`` returns two parallel lists (first column, second column).
    They are zipped into the ``(method, text)`` pairs that ``doc_info``
    indexes, and grouped per method into the mapping that ``display_info``
    reads.

    Args:
        path: CSV file whose rows hold a method and its text.
        mappings_path: Optional CSV file whose rows hold a method and its
            class. When given, only records whose method appears in this
            file are kept, and the classes are passed on to ``display_info``.

    Returns:
        A ``(parsed_docs, tok_cnt)`` tuple as produced by ``doc_info``.
    """
    methods, texts = access_data(path)
    docs = list(zip(methods, texts))
    classes = None
    if mappings_path:
        mapped_methods, classes = access_data(mappings_path)
        # A set gives O(1) membership tests while filtering.
        mapped = set(mapped_methods)
        docs = [doc for doc in docs if doc[0] in mapped]

    # Group the (already filtered) texts by method for display_info.
    doc_by_method = {}
    for method, text in docs:
        doc_by_method.setdefault(method, []).append(text)

    parsed_docs, tok_cnt, pos_cnt, pos_byword_cnt = doc_info(docs)
    print('INFO FOR',path)
    display_info(parsed_docs, doc_by_method, tok_cnt, pos_cnt, pos_byword_cnt, classes=classes)
    return parsed_docs, tok_cnt

## Modeling

## Presentation Graphic(s)

## Project approach and overall execution
Do not put anything below this cell

## Code Structure and Organization

## Code Commenting