# Project Final Code

Objective statement:
- I will create baseline classification standards to predict whether an Android API method manipulates/returns information associated with data types defined in privacy policies by using classical sparse vector space representations of the data with standard linear and non-linear classifiers.
- As a secondary objective I will contrast using nltk for text processing and data exploration with using the newer spaCy and gensim packages. 

## Obtaining the Data
The data required for this project may be obtained as follows.

YOUR ANSWER HERE

## Setup

In [59]:
import os
import csv
from collections import Counter
import re
import numpy as np
import pandas as pd
import pprint
import spacy
import nltk

from spacy.tokens import Doc
import matplotlib.pyplot as plt
from gensim.summarization.textcleaner import get_sentences
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, ne_chunk

# Fetch the NLTK resource for the named-entity chunker (no-op when already present).
nltk.download('maxent_ne_chunker')

# Input data: directory and CSV file names, joined into full paths below.
datadir = 'proj_data'
source_filename = 'android_semi_cleaned.csv'
annotated_filename = 'android_cleaned_ANNOTATED_ONLY.csv'
mappings_data_filename = 'mappings_cleaned.csv'
source_path = os.path.join(datadir, source_filename)
mappings_path = os.path.join(datadir, mappings_data_filename) 
annotated_path = os.path.join(datadir, annotated_filename)

# Cache directory and per-artifact file names used by load_model_info / save_model_info.
MODEL_COMPARISON_DIR = 'model_comparisons'
nltk_tok_f= 'nltk_tok_by_sent.npy'
spacy_tok_f= 'spacy_tok_by_sent.npy'
tok_diffs_f= 'tok_dif.npy'
nltk_pos_f= 'nltk_pos_by_sent.npy'
spacy_pos_f= 'spacy_pos_by_sent.npy'
nltk_ner_f = 'nltk_ner.npy'
spacy_ner_f= 'spacy_ner.npy'
parsed_docs_f = 'parsed_docs.npy'

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\ChrisCrabtree\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


## Data Preparation

### Accessing data

In [49]:
def access_data(path):
    """Read a two-column CSV of (method, documentation) rows.

    Args:
        path: Path to a UTF-8 CSV file whose first column is the method name
            and whose second column is the documentation text.

    Returns:
        A tuple ``(methods, docs, method_documents)`` where ``methods`` and
        ``docs`` are parallel lists with one entry per row, and
        ``method_documents`` maps each method to the list of its distinct
        documentation strings in first-seen order.
    """
    methods = []
    docs = []
    method_documents = {}
    with open(path, 'r', newline='', encoding='utf-8') as f:
        for row in csv.reader(f, dialect='excel'):
            # Blank lines come back as short rows; they carry no record.
            if len(row) < 2:
                continue
            method, doc = row[0], row[1]
            # setdefault creates the entry on first sight of a method; the
            # previous code only appended to keys that already existed, so
            # the mapping was always empty.
            sentences = method_documents.setdefault(method, [])
            if doc not in sentences:
                sentences.append(doc)
            methods.append(method)
            docs.append(doc)
    return methods, docs, method_documents



### Remove HTML

In [50]:
# Matches an opening, closing or self-closing tag with optional attributes.
# Attribute values may be double-quoted, single-quoted or unquoted; the
# unquoted alternative is the negated class [^'">\s]+ (the earlier pattern
# escaped the caret, which turned it into a positive class and made tags
# with unquoted attribute values slip through).
_HTML_TAG_RE = re.compile(
    r'''</?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>'''
)


def my_html_remover(doc):
    """Return ``doc`` with HTML/XML tags removed.

    Only the tags themselves are deleted; the text between them is kept.
    A ``<`` that is not directly followed by a word character (for example
    in ``a < b``) is left untouched.
    """
    return _HTML_TAG_RE.sub('', doc)

### General Cleaning

In [51]:
def my_cleaner(docs):
    """Normalise whitespace and strip HTML tags from every document.

    Each document goes through three steps, in this order: runs of
    whitespace collapse to a single space, a period surrounded by
    whitespace becomes ``'. '``, and HTML tags are removed with
    ``my_html_remover``.

    Returns a new list; the input is not modified.
    """
    def _clean_one(text):
        collapsed = re.sub(r'\s+', ' ', text)
        tidy_periods = re.sub(r'\s+\.\s+', '. ', collapsed)
        return my_html_remover(tidy_periods)

    return [_clean_one(doc) for doc in docs]

def get_docs(path):
    """Load the CSV at ``path`` and return cleaned docs with their methods.

    Returns:
        A 2-tuple: a DataFrame with columns ``'docs'`` (cleaned text) and
        ``'methods'``, and the method -> documents mapping produced by
        ``access_data``.
    """
    methods, raw_docs, docs_by_method = access_data(path)
    cleaned = my_cleaner(raw_docs)
    frame = pd.DataFrame({'docs': cleaned, 'methods': methods})
    return frame, docs_by_method

### Notes
Other cleaning such as punctuation removal and lemmatization will not be done here and will be considered later in the presentation. 

## Data Exploration

For the purpose of demonstration, I will show SpaCy's functionality in pieces, but normally text processing with SpaCy is pipelined automatically with options for minor alterations. The tag line from SpaCy is that _____ only the best tools for the job

### Sentence tokenization with NLTK

In [7]:
def sent_toks_nltk(docs):
    """Split each document into sentences with NLTK's ``sent_tokenize``.

    Leading/trailing colons, periods, spaces and double quotes are stripped
    from every sentence. Returns one list of sentences per document.
    """
    tokenized = []
    for doc in docs:
        sentences = [sentence.strip(':. "') for sentence in sent_tokenize(doc)]
        tokenized.append(sentences)
    return tokenized

### Sentence tokenization with gensim

In [8]:
def sent_toks_gensim(docs):
    """Split each document into sentences with gensim's ``get_sentences``.

    Leading/trailing colons, periods, spaces and double quotes are stripped
    from every sentence. Returns one list of sentences per document.
    """
    tokenized = []
    for doc in docs:
        sentences = [sentence.strip(':. "') for sentence in get_sentences(doc)]
        tokenized.append(sentences)
    return tokenized

### Sentence tokenization comparison

In [9]:
def get_differences(orig_docs, nltk_docs, gensim_docs):
    """Collect sentence pairs on which the two sentence splitters disagree.

    The sentence lists of each document are walked in lockstep (position by
    position, up to the shorter list) so the location of a disagreement is
    kept; set operations would lose that alignment.

    Returns:
        ``(diff_cnt, diff_examples, same)`` where ``diff_cnt`` is the number
        of recorded disagreements, ``diff_examples`` is a DataFrame with
        columns ``'Original'``, ``'nltk'`` and ``'gensim'``, and ``same`` is
        the list of gensim sentences that were not recorded as differences.
    """
    diff_cnt = 0
    originals, nltk_side, gensim_side = [], [], []
    same = []
    for orig_doc, nltk_doc, gensim_doc in zip(orig_docs, nltk_docs, gensim_docs):
        # Sentences already reported for this document, to avoid over-counting.
        reported = []
        for nltk_sent, gensim_sent in zip(nltk_doc, gensim_doc):
            disagrees = nltk_sent not in gensim_doc or gensim_sent not in nltk_doc
            # A pair counts as already covered when either sentence is a
            # substring of something reported earlier for this document.
            if not disagrees or any(
                gensim_sent in seen or nltk_sent in seen for seen in reported
            ):
                same.append(gensim_sent)
                continue
            diff_cnt += 1
            reported.extend([nltk_sent, gensim_sent])
            originals.append(orig_doc)
            nltk_side.append(nltk_sent)
            gensim_side.append(gensim_sent)

    diff_examples = pd.DataFrame(
        {'Original': originals, 'nltk': nltk_side, 'gensim': gensim_side}
    )
    return diff_cnt, diff_examples, same
        
        
def compare_sent_tokenization(docs):
    """Print a side-by-side summary of NLTK vs. gensim sentence splitting.

    Reports sentence totals, the average token length of the sentences the
    two splitters disagree on, and which splitter yields more sentences
    among documents where the sentence counts differ. Up to three example
    disagreements (from originals of at most 100 whitespace tokens) follow.

    All ratios fall back to 0 when their denominator is 0, so the function
    also works when the splitters agree everywhere or ``docs`` is empty.
    """
    def _ratio(numerator, denominator):
        # Guard against the "no differences" case instead of dividing by zero.
        return numerator / denominator if denominator else 0.0

    nltk_docs_tokenized = sent_toks_nltk(docs)
    gensim_docs_tokenized = sent_toks_gensim(docs)
    diff_cnt, diff_examples, _ = get_differences(docs, nltk_docs_tokenized, gensim_docs_tokenized)

    num_nltk_sents = sum(len(d) for d in nltk_docs_tokenized)
    num_gensim_sents = sum(len(d) for d in gensim_docs_tokenized)
    avg_diff_len_nltk = _ratio(sum(len(d.split()) for d in diff_examples['nltk']), diff_cnt)
    avg_diff_len_gensim = _ratio(sum(len(d.split()) for d in diff_examples['gensim']), diff_cnt)

    # Only documents where the two splitters produced a different number of
    # sentences take part in the "who has more" comparison.
    diff_sent_lens = [(len(doc1), len(doc2))
                      for doc1, doc2 in zip(nltk_docs_tokenized, gensim_docs_tokenized)
                      if len(doc1) != len(doc2)]
    nltk_greater = sum(1 for n_len, g_len in diff_sent_lens if n_len > g_len)
    frac_nltk_greater = _ratio(nltk_greater, len(diff_sent_lens))
    frac_gensim_greater = (1 - frac_nltk_greater) if diff_sent_lens else 0.0

    print(f"Total number of documents: {len(docs):,}")
    print(f"Number of differences: {diff_cnt:,}")
    print(f"{'NLTK':>65}{'Gensim':>20}")
    print(f"{'Number of senteces:':>45}{num_nltk_sents:>20,}{num_gensim_sents:>20,}")
    print(f"{'Avg. #tokens of differences:':>45}{avg_diff_len_nltk:>20.3f}{avg_diff_len_gensim:>20.3f}")
    print(f"{'Pct. documents with the most senteces:':>45}{frac_nltk_greater:>20.2%}{frac_gensim_greater:>20.2%}")
    print('\nExamples of differences:')

    shown_originals = []
    printed = 0
    for orig, nltk_sent, gensim_sent in zip(diff_examples['Original'],
                                            diff_examples['nltk'],
                                            diff_examples['gensim']):
        # Skip very long originals so the examples stay readable.
        if len(orig.split()) > 100:
            continue
        if printed >= 3:
            break
        # Print each original document once, even if it has several differences.
        if orig not in shown_originals:
            print('\nORIGINAL:', orig)

        print('NLTK:', nltk_sent)
        print('GENSIM:', gensim_sent)
        shown_originals.append(orig)
        printed += 1
    


### Tokenization with NLTK

In [10]:
def nltk_tokenize(sent):
    """Return the word tokens of ``sent`` as produced by NLTK's ``word_tokenize``."""
    tokens = word_tokenize(sent)
    return tokens

### Tokenization with Spacy
SpaCy provides models that are automatically pipelined, so one call will tokenize, tag, and identify named entities. For the purpose of comparison I will show how to obtain only the tokens. SpaCy also has a clear API with easy instructions for altering the behavior of its tokenization. I include functions here that handle tokens unique to the Android corpus.

In [32]:
def spacy_tokenize(sent, model):
    """Run ``model`` on ``sent`` and return the text of each resulting token."""
    token_texts = []
    for token in model(sent):
        token_texts.append(token.text)
    return token_texts

# Here I search for instances of a method or class (e.g. setStructuredData(String))
def my_retokenizer_builder():
    """Build a pipeline component that merges code-like spans into one token.

    The returned callable takes a parsed document and runs ``process_matches``
    once per pattern, in the order listed below, labelling every merged span
    as an ``'MT_OR_CL'`` entity with POS ``'PROPN'``.
    """
    patterns = (
        # Method calls, optionally dotted, with at most one argument name,
        # e.g. setStructuredData(String).
        r'[A-Za-z_]+(\.[A-Za-z_]+)* ?\( ?[A-Za-z_\.]* ?\)',
        # Dot-separated identifier chains.
        # NOTE(review): every part is optional, so a bare '.' matches too.
        r'(([A-Za-z_]*)\.)+[A-Za-z_]*',
        # General camel case.
        r'[A-Z]?([a-z]+[A-Z])+[a-z]+',
        # Constants: four or more capitals/underscores.
        r'[A-Z_]{4,}',
    )
    compiled_expressions = [re.compile(pattern) for pattern in patterns]

    def my_retokenizer(doc):
        for compiled in compiled_expressions:
            doc = process_matches(doc, compiled, ent_label='MT_OR_CL', tok_attrs={'POS' : 'PROPN'})
        return doc

    return my_retokenizer


def process_matches(doc, expression, ent_label=None, tok_attrs=None):
    """Merge every regex match in ``doc`` into a single token and entity.

    Args:
        doc: Parsed document exposing ``text``, ``ents``, ``char_span`` and
            ``retokenize`` (a spaCy ``Doc``).
        expression: Regex pattern (string or compiled) searched in ``doc.text``.
        ent_label: Entity label given to each matched span.
        tok_attrs: Token attributes applied to the merged token, if any.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # Abbreviations that the dotted-identifier pattern would otherwise catch.
    ignored = ('e.g.', 'i.e.')
    merge_attrs = tok_attrs if tok_attrs is not None else {}

    for match in re.finditer(expression, doc.text):
        if match.group(0) in ignored:
            continue
        start, end = match.span()
        span = doc.char_span(start, end, label=ent_label)
        # char_span yields None when the match does not line up with token
        # boundaries; such matches cannot be merged.
        if span is None:
            continue
        # Entities may not overlap, so drop only those that intersect the new
        # span. The previous test (start >= span.start or end <= span.end)
        # also discarded entities located entirely before or after the span.
        kept = [ent for ent in doc.ents
                if ent.end <= span.start or ent.start >= span.end]
        doc.ents = kept + [span]
        with doc.retokenize() as retokenizer:
            retokenizer.merge(span, attrs=merge_attrs)
    return doc

### POS tagging w/ NLTK

In [12]:
def nltk_pos(text):
    """Tokenize ``text`` with ``nltk_tokenize`` and return NLTK's POS tagging of the tokens."""
    tokens = nltk_tokenize(text)
    return pos_tag(tokens)

### POS tagging w/ SpaCy

In [13]:
def spacy_pos(sent, model):
    """Run ``model`` on ``sent`` and return the ``pos_`` tag of each token."""
    tags = []
    for token in model(sent):
        tags.append(token.pos_)
    return tags

### NER w/ NLTK

In [14]:
def nltk_ner(text):
    """Return NLTK's named-entity chunk tree for the raw string ``text``.

    ``nltk_pos`` tokenizes its argument itself, so it must receive the raw
    text. Tokenizing here as well handed it a list of tokens, which it then
    tried to tokenize a second time.
    """
    return ne_chunk(nltk_pos(text))

### NER w/ SpaCy

In [15]:
def spacy_ner(sent, model):
    """Run ``model`` on ``sent`` and return ``(text, label_)`` for each entity in ``.ents``."""
    parsed = model(sent)
    entities = []
    for ent in parsed.ents:
        entities.append((ent.text, ent.label_))
    return entities

### Semantic similarity w/ NLTK

### Semantic similarity w/ Gensim

In [None]:
# Scratch check: split the sample string on every character that is not a parenthesis.
print(re.split(r'[^\(\)]', 'hey ho( s)'))

## Functions for Comparisons

In [69]:

def load_model_info():
    """Load the cached comparison artifacts from ``MODEL_COMPARISON_DIR``.

    Returns:
        ``(tok_info, pos_info, ner_info, parsed_docs)`` with
        ``tok_info = (nltk_tok, spacy_tok, tok_diffs)``,
        ``pos_info = (nltk_pos, spacy_pos)`` and
        ``ner_info = (nltk_ner, spacy_ner)``, or ``None`` when the cache
        directory or any of its files is missing. Returning ``None`` for a
        partial cache lets the caller recompute; previously it received a
        tuple filled with ``None`` values.
    """
    if not os.path.isdir(MODEL_COMPARISON_DIR):
        return None

    names = ('nltk_tok', 'spacy_tok', 'tok_diffs', 'nltk_pos',
             'spacy_pos', 'nltk_ner', 'spacy_ner', 'parsed_docs')
    filenames = (nltk_tok_f, spacy_tok_f, tok_diffs_f, nltk_pos_f,
                 spacy_pos_f, nltk_ner_f, spacy_ner_f, parsed_docs_f)
    paths = [os.path.join(MODEL_COMPARISON_DIR, filename) for filename in filenames]
    if not all(os.path.isfile(path) for path in paths):
        return None

    # NOTE: allow_pickle=True unpickles arbitrary objects; only load cache
    # files written by save_model_info on a trusted machine.
    variables = {name: np.load(path, allow_pickle=True).tolist()
                 for name, path in zip(names, paths)}

    tok_info = (variables['nltk_tok'], variables['spacy_tok'], variables['tok_diffs'])
    pos_info = (variables['nltk_pos'], variables['spacy_pos'])
    ner_info = (variables['nltk_ner'], variables['spacy_ner'])
    return tok_info, pos_info, ner_info, variables['parsed_docs']
    
def save_model_info(tok_info, pos_info, ner_info,parsed_docs):
    """Write the comparison artifacts to ``MODEL_COMPARISON_DIR`` as .npy files.

    Args:
        tok_info: ``(nltk_tok, spacy_tok, tok_diffs)``.
        pos_info: ``(nltk_pos, spacy_pos)``.
        ner_info: ``(nltk_ner, spacy_ner)``.
        parsed_docs: List of ``(method, parsed document)`` pairs.
    """
    nltk_tok, spacy_tok, tok_diffs = tok_info[0], tok_info[1], tok_info[2]
    nltk_pos, spacy_pos = pos_info[0], pos_info[1]
    nltk_ner, spacy_ner = ner_info[0], ner_info[1]

    # exist_ok replaces the manual EEXIST check, which referenced the
    # never-imported ``errno`` module and failed when the directory existed.
    os.makedirs(MODEL_COMPARISON_DIR, exist_ok=True)

    filenames = [nltk_tok_f, spacy_tok_f, tok_diffs_f, nltk_pos_f,
                 spacy_pos_f, nltk_ner_f, spacy_ner_f, parsed_docs_f]
    variables = [nltk_tok, spacy_tok, tok_diffs, nltk_pos,
                 spacy_pos, nltk_ner, spacy_ner, parsed_docs]
    for file, variable in zip(filenames, variables):
        # Sentences have different lengths, so the lists are ragged; recent
        # NumPy versions refuse to build such arrays without dtype=object.
        np_var = np.array(variable, dtype=object)
        file_path = os.path.join(MODEL_COMPARISON_DIR, file)
        print('Saving to:', file_path)
        np.save(file_path, np_var)

def model_comparison_info(df):
    '''
    Collect tokens, POS tags and named entities for every sentence in ``df``
    using both NLTK and spaCy.

    The computation is expensive, so a cached result from ``load_model_info``
    is returned when available; otherwise the results are computed, stored
    with ``save_model_info`` and returned as
    ``(tok_info, pos_info, ner_info, parsed_docs)``.
    '''
    cached = load_model_info()
    if cached is not None:
        return cached

    docs = df['docs']
    methods = df['methods']

    nltk_tok, spacy_tok = [], []
    nltk_pos, spacy_pos = [], []
    nltk_ner, spacy_ner = [], []

    model = spacy.load('en_core_web_sm')
    model.add_pipe(my_retokenizer_builder(), first=True)

    # Flatten to one (method, sentence) record per gensim sentence.
    method_sentence_pairs = []
    for method, sentences in zip(methods, sent_toks_gensim(docs)):
        for sentence in sentences:
            method_sentence_pairs.append((method, sentence))
    columns = list(zip(*method_sentence_pairs))
    methods = columns[0]
    docs = columns[1]
    print('Documents to process:', len(docs))

    parsed_docs = []
    ##### THIS IS THE LOOP FOR PROCESSING DOCUMENTS #####
    for idx, parsed in enumerate(model.pipe(docs, disable=["parser"])):
        if idx % 2000 == 0:
            print('Processed', idx)

        # NLTK side: tokens -> POS tags -> named-entity subtrees.
        sent_tokens = nltk_tokenize(parsed.text)
        sent_tags = pos_tag(sent_tokens)
        sent_entities = []
        for node in ne_chunk(sent_tags):
            if type(node) is nltk.tree.Tree:
                words = ' '.join([leaf[0] for leaf in node.leaves()])
                sent_entities.append((words, node.label()))
        nltk_tok.append(sent_tokens)
        nltk_pos.append(sent_tags)
        nltk_ner.append(sent_entities)

        # spaCy side: read everything off the parsed document.
        spacy_tok.append([tok.text for tok in parsed])
        spacy_pos.append([(tok.text, tok.pos_) for tok in parsed])
        spacy_ner.append([(ent.text, ent.label_) for ent in parsed.ents])

        parsed_docs.append((methods[idx], parsed))

    # Sentences on which the two tokenizers produced a different token count.
    tok_diffs = []
    for nltk_sent, spacy_sent in zip(nltk_tok, spacy_tok):
        if len(nltk_sent) != len(spacy_sent):
            tok_diffs.append((nltk_sent, spacy_sent))

    tok_info = (nltk_tok, spacy_tok, tok_diffs)
    pos_info = (nltk_pos, spacy_pos)
    ner_info = (nltk_ner, spacy_ner)
    save_model_info(tok_info, pos_info, ner_info, parsed_docs)
    return tok_info, pos_info, ner_info, parsed_docs

def show_tok_diffs(nltk_tok, spacy_tok, diff_docs):
    """Print token-count statistics for NLTK vs. spaCy tokenization.

    Args:
        nltk_tok: List of token lists, one per sentence, from NLTK.
        spacy_tok: List of token lists, one per sentence, from spaCy.
        diff_docs: List of ``(nltk_tokens, spacy_tokens)`` pairs for the
            sentences the two tokenizers split differently.

    Averages and fractions are reported as 0 when their denominator is 0,
    so empty inputs print a summary instead of raising. Up to ten example
    differences of at most ten tokens each are printed at the end.
    """
    def _ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    nltk_tot_tokens = sum(len(sent) for sent in nltk_tok)
    spacy_tot_tokens = sum(len(sent) for sent in spacy_tok)
    nltk_avg_tokens_tot = _ratio(nltk_tot_tokens, len(nltk_tok))
    spacy_avg_tokens_tot = _ratio(spacy_tot_tokens, len(spacy_tok))
    # Summing over the pairs directly avoids indexing into an empty transpose.
    nltk_tot_toks_of_diff = sum(len(pair[0]) for pair in diff_docs)
    spacy_tot_toks_of_diff = sum(len(pair[1]) for pair in diff_docs)
    nltk_avg_tokens_diff = _ratio(nltk_tot_toks_of_diff, len(diff_docs))
    spacy_avg_tokens_diff = _ratio(spacy_tot_toks_of_diff, len(diff_docs))

    print(f'Number of documents with different tokenization: {len(diff_docs)} ', end='')
    print(f'Fraction of total: {_ratio(len(diff_docs), len(nltk_tok)):.2%}')
    print(f"{'NLTK':>65}{'SpaCy':>20}")
    print(f"{'Number of tokens:':>45}{nltk_tot_tokens:>20,}{spacy_tot_tokens:>20,}")
    print(f"{'Avg. tokens per sentence:':>45}{nltk_avg_tokens_tot:>20,.3f}{spacy_avg_tokens_tot:>20,.3f}")
    print(f"{'Number of tokens of differences:':>45}{nltk_tot_toks_of_diff:>20,}{spacy_tot_toks_of_diff:>20,}")
    print(f"{'Avg. #tokens of differences:':>45}{nltk_avg_tokens_diff:>20.3f}{spacy_avg_tokens_diff:>20.3f}")
    print('\nExamples of differences:')
    pp = pprint.PrettyPrinter(indent=2, width=110)
    printed = 0
    for nltk_toks, spacy_toks in diff_docs:
        # Only short sentences make readable examples.
        if len(nltk_toks) > 10 or len(spacy_toks) > 10:
            continue
        if printed >= 10:
            break
        print('\nNLTK:')
        pp.pprint(nltk_toks)
        print('SpaCy:')
        pp.pprint(spacy_toks)
        printed += 1
          
          



### Run Comparisons

In [70]:
# Load and clean the corpus, then compute (or load from cache) the comparison
# data for the first 100 rows.
df, docs_by_method = get_docs(source_path)
tok_info,pos_info, ner_info, parsed_docs = model_comparison_info(df.iloc[:100])
nltk_toks, spacy_toks, tok_diffs = tok_info
# print(len(tok_diffs))
# show_tok_diffs(nltk_tok, spacy_tok, diff_docs)
# Show the first (method, parsed sentence) record.
print(parsed_docs[0])

['android.app.assist.AssistContent.describeContents', Describe the kinds of special objects contained in this Parcelable instance's marshaled representation]


In [57]:
# First sentence as seen by each toolkit: tokens, POS tags, then named entities.
# Index 0 of pos_info / ner_info is the NLTK result, index 1 the spaCy result.
print(nltk_toks[0])
print(spacy_toks[0])
print(pos_info[0][0])
print(pos_info[1][0])
print(ner_info[0][0])
print(ner_info[1][0])

['Describe', 'the', 'kinds', 'of', 'special', 'objects', 'contained', 'in', 'this', 'Parcelable', 'instance', "'s", 'marshaled', 'representation']
['Describe', 'the', 'kinds', 'of', 'special', 'objects', 'contained', 'in', 'this', 'Parcelable', 'instance', "'s", 'marshaled', 'representation']
[('Describe', 'NNP'), ('the', 'DT'), ('kinds', 'NNS'), ('of', 'IN'), ('special', 'JJ'), ('objects', 'NNS'), ('contained', 'VBN'), ('in', 'IN'), ('this', 'DT'), ('Parcelable', 'JJ'), ('instance', 'NN'), ("'s", 'POS'), ('marshaled', 'JJ'), ('representation', 'NN')]
[('Describe', 'VERB'), ('the', 'DET'), ('kinds', 'NOUN'), ('of', 'ADP'), ('special', 'ADJ'), ('objects', 'NOUN'), ('contained', 'VERB'), ('in', 'ADP'), ('this', 'DET'), ('Parcelable', 'ADJ'), ('instance', 'NOUN'), ("'s", 'PART'), ('marshaled', 'ADJ'), ('representation', 'NOUN')]
[('Parcelable', 'ORGANIZATION')]
[('Parcelable', 'ORG')]


In [72]:
# print(len(nltk_toks))
# # print(nltk_toks)
# tok_diffs = [(nltk_tok_sent,spacy_tok_sent) for nltk_tok_sent, spacy_tok_sent in zip(nltk_toks,spacy_toks)
#                                       if len(nltk_tok_sent) != len(spacy_tok_sent)]
# Print the tokenization summary for the results obtained above.
show_tok_diffs(nltk_toks, spacy_toks, tok_diffs)

Number of documents with different tokenization: 50 Fraction of total: 28.90%
                                                             NLTK               SpaCy
                            Number of tokens:               2,649               2,599
                    Avg. tokens per sentence:              15.312              15.023
             Number of tokens of differences:                 884                 834
                 Avg. #tokens of differences:              17.680              16.680

Examples of differences:

NLTK:
['Can', 'be', 'modified', 'in-place']
SpaCy:
['Can', 'be', 'modified', 'in', '-', 'place']

NLTK:
['Returns', 'the', 'current', 'setStructuredData', '(', 'String', ')']
SpaCy:
['Returns', 'the', 'current', 'setStructuredData(String)']

NLTK:
['Return', 'a', 'Bundle', 'containing', 'optional', 'vendor-specific', 'extension', 'information']
SpaCy:
['Return', 'a', 'Bundle', 'containing', 'optional', 'vendor', '-', 'specific', 'extension', 'information']

NLT

In [44]:


# Tokenize the whole corpus sentence by sentence and compare NLTK with spaCy.
# get_docs returns (DataFrame, docs_by_method); only the frame is needed here.
df, _ = get_docs(source_path)
model = spacy.load('en_core_web_sm')
my_retokenizer = my_retokenizer_builder()
model.add_pipe(my_retokenizer, first=True)
print('starting sentencizer:')
docs = df['docs'].to_list()
docs = [sent for doc in sent_toks_gensim(docs) for sent in doc]
nltk_toks = []
spacy_toks = []
nltk_pos = []
spacy_pos = []
nltk_ner = []
spacy_ner =[]
print('Documents to process:', len(docs))
for i, doc in enumerate(model.pipe(docs, disable=["parser", "ner", 'tagger'])):
    if i %2000 ==0:
        print('Processed',i)

    # NLTK tokens are required for the comparison below; NLTK POS tagging and
    # NER are skipped in this cell (nltk_pos / nltk_ner stay empty).
    nltk_toks.append(nltk_tokenize(doc.text))

    spacy_toks.append([tok.text for tok in doc])
    spacy_pos.append([(tok.text,tok.pos_) for tok in doc])
    spacy_ner.append([(ent.text, ent.label_) for ent in doc.ents])


# Sentences on which the two tokenizers produced a different token count.
tok_diffs = [(nltk_tok_sent,spacy_tok_sent) for nltk_tok_sent, spacy_tok_sent in zip(nltk_toks,spacy_toks)
                                  if len(nltk_tok_sent) != len(spacy_tok_sent)]

show_tok_diffs(nltk_toks, spacy_toks, tok_diffs)

starting sentencizer:
Documents to process: 73154
Processed 0
Processed 2000
Processed 4000
Processed 6000
Processed 8000
Processed 10000
Processed 12000
Processed 14000
Processed 16000


KeyboardInterrupt: 

In [31]:
# Try the custom retokenizer on a single hand-written sentence.
# get_docs returns (DataFrame, docs_by_method); only the frame is needed here.
df, _ = get_docs(source_path)
model = spacy.load('en_core_web_sm')
#model.add_pipe(my_retokenizer, first=True)
print('starting sentencizer:')
docs = df['docs'].to_list()
print(docs[555])
my_retokenizer = my_retokenizer_builder()
# The retokenizer is applied by hand here rather than as a pipeline component.
doc = my_retokenizer(model('Requires android.Manifest.permission.MANAGE_ACCOUNTS'))
print([t for t in doc])
# re.search(r'[A-Z_]{4,}',)
print('z')
# for match in re.finditer(r'([A-Z]+[a-z_]*)*\( ?[A-Za-z_\.]* ?\)',docs[555]):
#     print('blah')

starting sentencizer:

[Requires, android, ., Manifest.permission, ., MANAGE_ACCOUNTS]
z


## Modeling

## Presentation Graphic(s)

## Project approach and overall execution
Do not put anything below this cell

## Code Structure and Organization

## Code Commenting

In [None]:



def doc_info(docs):
    '''
    Parse every document with spaCy and count tokens and POS tags.

    Args:
        docs: Iterable of sequences whose item 0 is the method name and whose
            item 1 is the text to parse.

    Returns:
        ``(parsed_docs, tok_cnt, pos_cnt, pos_byword_cnt)``: the list of
        ``(method, parsed document)`` pairs, a Counter of token texts, a
        Counter of POS tags, and a dict mapping each POS tag to a Counter of
        the token texts seen with that tag.
    '''
    nlp = spacy.load("en_core_web_sm", disable=["parser", 'ner'])
    # Build the component here instead of relying on a notebook-level
    # ``my_retokenizer`` variable that exists only if another cell was run.
    nlp.add_pipe(my_retokenizer_builder(), first=True)
    tok_cnt = Counter()
    pos_cnt = Counter()
    pos_byword_cnt = {}
    parsed_docs = []
    for tup in docs:
        doc = nlp(tup[1])
        parsed_docs.append((tup[0], doc))
        for tok in doc:
            tok_cnt[tok.text] += 1
            pos_cnt[tok.pos_] += 1
            # Create the per-tag counter on first use, then count the word.
            pos_byword_cnt.setdefault(tok.pos_, Counter())[tok.text] += 1

    return parsed_docs, tok_cnt, pos_cnt, pos_byword_cnt


def display_info(parsed_docs, method_documents,tok_cnt, pos_cnt, pos_byword_cnt, classes=None):
    """Print corpus statistics, one ``<br>``-terminated line per statistic.

    Args:
        parsed_docs: List of ``(method, parsed document)`` pairs; each
            document exposes ``text``, ``ents`` and ``len()``.
        method_documents: Mapping of method name to its list of documents.
        tok_cnt: Counter of token texts.
        pos_cnt: Counter of POS tags.
        pos_byword_cnt: Mapping of POS tag to a Counter of token texts.
        classes: Optional list of class labels; when given, the number of
            distinct classes is printed first.

    Statistics that cannot be computed because the data is empty are printed
    as ``None`` instead of raising.
    """
    def _top(counter):
        # Most common (item, count) pair, or None for a missing/empty counter.
        return counter.most_common(1)[0] if counter else None

    # Keep the first parsed document for each distinct sentence text.
    unique_sents = Counter()
    unique_docs = []
    for method, doc in parsed_docs:
        if doc.text not in unique_sents:
            unique_docs.append((method, doc))
        unique_sents[doc.text] += 1

    if classes:
        print('\t>-total number of classes:', len(set(classes)), '<br>')
        print('\t -total number of methods:', len(method_documents.keys()), '<br>')
    else:
        print('\t>-total number of methods:', len(method_documents.keys()), '<br>')
    print('\t -total records after transform:', len(parsed_docs), '<br>')
    print('\t -number of unique records after transform:', len(unique_sents), '<br>')
    print('\t -method with most sentences:',
          max(((key, len(sents)) for key, sents in method_documents.items()),
              key=lambda x: x[1], default=None), '<br>')
    print('\t -method with most tokens:',
          max(((method, len(doc)) for method, doc in unique_docs),
              key=lambda x: x[1], default=None), '<br>')
    print('\t -total number of tokens:', sum(tok_cnt.values()), '<br>')
    print("\t -num unique tokens:", len(tok_cnt.keys()), '<br>')
    print('\t -most common tokens (with 5 or more chars):'
          , [tup for tup in tok_cnt.most_common() if len(tup[0]) > 4][:3], '<br>')

    top_pos = _top(pos_cnt)
    most_freq_pos = top_pos[0] if top_pos else None
    print('\t -most frequent POS tag:', most_freq_pos, '<br>')
    print('\t -most common words in that tag:', _top(pos_byword_cnt.get(most_freq_pos)), '<br>')
    print('\t -most frequent proper noun:', _top(pos_byword_cnt.get('PROPN')), '<br>')

    method_and_class_toks = [ent.text for p_doc in parsed_docs for ent in p_doc[1].ents
                             if ent.label_ == 'MT_OR_CL']
    # This line reports every occurrence; it was previously labelled "unique",
    # duplicating the label of the line after it.
    print('\t -total number of domain-specific named entities:', len(method_and_class_toks), '<br>')
    print('\t -number of unique domain-specific named entities:', len(set(method_and_class_toks)), '<br>')
    print('\t -most frequent domain-specific named entity:'
          , _top(Counter(method_and_class_toks)), '<br>')
    print()

def process_doc_and_display_attrs(path, mappings_path=None):
    """Parse the documents at ``path`` and print their corpus statistics.

    Args:
        path: CSV file of (method, documentation) rows.
        mappings_path: Optional CSV of (method, class) rows. When given, only
            documents whose method appears in it are processed, and its
            second column is passed on as the class labels.

    Returns:
        ``(parsed_docs, tok_cnt)`` as produced by ``doc_info``.
    """
    # access_data returns three values: the parallel method/text lists and
    # the per-method mapping. doc_info expects (method, text) pairs.
    methods, texts, doc_by_method = access_data(path)
    docs = list(zip(methods, texts))
    classes = None
    if mappings_path:
        # In the mappings file the second column holds the class label.
        mapped_methods, classes, _ = access_data(mappings_path)
        mapped = set(mapped_methods)
        docs = [doc for doc in docs if doc[0] in mapped]
        doc_by_method = {key: value for key, value in doc_by_method.items() if key in mapped}

    parsed_docs, tok_cnt, pos_cnt, pos_byword_cnt = doc_info(docs)
    print('INFO FOR',path)
    display_info(parsed_docs, doc_by_method, tok_cnt, pos_cnt, pos_byword_cnt, classes=classes)
    return parsed_docs, tok_cnt