In [8]:
import csv
import os
import random
import re
from collections import Counter

import fasttext
import gensim
import numpy as np
import spacy
from gensim.summarization.textcleaner import clean_text_by_sentences
from gensim.summarization.textcleaner import get_sentences
from nltk.tokenize import sent_tokenize
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

# --- Input data: every CSV lives under proj_data/cleaned_data ---
data_dir = os.path.join("proj_data", "cleaned_data")

source_filename = 'android_semi_cleaned.csv'
annotated_filename = 'android_cleaned_ANNOTATED_ONLY.csv'
mappings_data_filename = 'mappings_cleaned.csv'

source_path, annotated_path, mappings_path = (
    os.path.join(data_dir, name)
    for name in (source_filename, annotated_filename, mappings_data_filename)
)

# --- Output artefacts, written relative to the working directory ---
prepped_file = 'prepped.csv'  # text file later passed to fastText training
save_name = 'ft.bin'          # file name used when saving/loading the fastText model

In [2]:
# with open(source_file, 'r', newline='', encoding='utf-8') as f:
#     reader = csv.reader(f, dialect='excel')
#     docs1 = []
#     docs2 = []
#     methods = []
#     diff = 0
#     for i, row in enumerate(reader):

#         gen = get_sentences(re.sub(r'\s+', ' ', row[1]))
#         doc1 = [d for d in gen]
#         doc2 = sent_tokenize(re.sub(r'\s+', ' ', row[1]))
#         flag = True
#         for i, (sent1, sent2) in enumerate(zip(doc1, doc2)):
#             diff_list=[]
#             if sent1 not in doc2 or  sent2 not in doc1 and sent1 not in diff_list and sent2 not in diff_list:
#                 if flag:
#                     print('ORIGINAL: ', re.sub(r'\s+', ' ', row[1]))
#                     flag = False
#                 print('GENSIM: ', sent1)
#                 print('NLTK: ', sent2)
#                 diff+= 1
#                 diff_list.extend([sent1,sent2])
#         if not flag:
#             print('\n')
#         docs1.extend(doc1)#doc.lower().rstrip('.'))
#         docs2.extend(doc2)
   

In [3]:
def access_data(path):
    '''
    Read a CSV file and split it into three parallel lists.

    Parameters
    ----------
    path : str
        Location of a UTF-8 encoded CSV file, parsed with the 'excel' dialect.

    Returns
    -------
    tuple of (list, list, list)
        ``methods`` holds column 0 of every row, ``docs`` holds column 1 of
        every row, and ``full`` holds every row as a tuple. All three are in
        file order and have the same length.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than two columns.
    '''
    methods = []
    docs = []
    full = []
    with open(path, 'r', newline='', encoding='utf-8') as f:
        for row in csv.reader(f, dialect='excel'):
            # csv.reader yields an empty list for a blank line; skip those
            # instead of failing on row[0].
            if not row:
                continue
            methods.append(row[0])
            docs.append(row[1])
            full.append(tuple(row))
    return methods, docs, full

def process_matches(doc, expression, ent_label=None, tok_attrs=None):
    '''
    Turn every match of ``expression`` in ``doc.text`` into one entity token.

    For each regex match (except the literal abbreviations 'e.g.' and 'i.e.')
    the matching character range is converted to a span with
    ``doc.char_span``. Existing entities that overlap that span are dropped,
    the span is appended to ``doc.ents`` and its tokens are merged into a
    single token.

    Parameters
    ----------
    doc : spaCy Doc
        Document to modify in place.
    expression : str
        Regular expression searched for in ``doc.text``.
    ent_label : str, optional
        Label given to the created entity spans.
    tok_attrs : dict, optional
        Token attributes applied to each merged token. ``None`` means no
        attributes are set.

    Returns
    -------
    The same ``doc`` object, after modification.
    '''
    skipped_matches = ('e.g.', 'i.e.')
    # merge() is handed a mapping; never pass None through to it.
    merge_attrs = tok_attrs if tok_attrs is not None else {}

    for match in re.finditer(expression, doc.text):
        if match.group(0) in skipped_matches:
            continue
        start, end = match.span()
        span = doc.char_span(start, end, label=ent_label)
        if span is None:
            # No span could be built for this character range; leave it alone.
            continue
        # Keep only the entities that lie entirely before or entirely after
        # the new span. The previous test (start >= ... or end <= ...) also
        # discarded entities that did not overlap at all.
        kept_ents = [
            ent for ent in doc.ents
            if ent.end <= span.start or ent.start >= span.end
        ]
        doc.ents = kept_ents + [span]
        with doc.retokenize() as retokenizer:
            retokenizer.merge(span, attrs=merge_attrs)
    return doc


def my_retokenizer(doc):
    '''
    Merge code-like mentions into single 'MT_OR_CL' entity tokens tagged PROPN.

    Two patterns are applied in order through ``process_matches``; both
    require a trailing parenthesised part.
    '''
    patterns = (
        # dotted name: one or more "letters." pieces, letters, optional dot, "( ... )"
        r'([A-Za-z]+\.)+[A-Za-z]+\.?(\( ?[A-Za-z\.]* ?\))',
        # bare name: optional capitals, lowercase run, optional camel-case humps, "( ... )"
        r'[A-Z]*[a-z]+([A-Z]+[a-z]*)*(\( ?[A-Za-z\.]* ?\))',
    )
    for pattern in patterns:
        doc = process_matches(
            doc,
            pattern,
            ent_label='MT_OR_CL',
            tok_attrs={'POS' : 'PROPN'},
        )
    return doc

def process_file(path, mappings_path=None):
    '''
    Load a CSV of documents and, optionally, relabel methods that have no mapping.

    Parameters
    ----------
    path : str
        CSV file whose first column is a method label and whose second column
        is the document text (read with ``access_data``).
    mappings_path : str, optional
        CSV file whose first column lists the mapped methods. When given,
        every label from ``path`` that is not in that column is replaced by
        the string 'NoClass'.

    Returns
    -------
    tuple of (list, list, list)
        ``method_labels``, ``docs`` and ``full_rows`` as parallel lists, one
        entry per row of ``path``. ``full_rows`` always holds the original,
        un-relabelled rows as tuples.
    '''
    method_labels, docs, full_rows = access_data(path)
    if mappings_path:
        mapped_methods, _, _ = access_data(mappings_path)
        # A set gives constant-time membership tests instead of scanning the
        # mapping list once per row.
        known_methods = set(mapped_methods)
        method_labels = [
            label if label in known_methods else 'NoClass'
            for label in method_labels
        ]
    return method_labels, docs, full_rows

def my_sentencizer(docs, labels = None):
    '''
    Split every document into sentences, optionally carrying labels along.

    Sentences come from gensim's ``get_sentences``; empty strings are dropped.

    Parameters
    ----------
    docs : list of str
        Documents to split.
    labels : list, optional
        One label per document. Each label is repeated for every sentence
        produced from its document.

    Returns
    -------
    list
        A flat list of sentences when ``labels`` is None, otherwise a flat
        list of ``(sentence, label)`` tuples.

    Raises
    ------
    ValueError
        If ``labels`` is given and its length differs from ``docs``.
    '''
    # Compare against None rather than relying on truthiness, so that an empty
    # label list is still length-checked and still yields tuples.
    has_labels = labels is not None
    if has_labels and len(labels) != len(docs):
        raise ValueError('Docs and labels are not the same length')

    new_docs = []
    new_labels = []
    for i, doc in enumerate(docs):
        sents = [sent for sent in get_sentences(doc) if sent != '']
        new_docs.extend(sents)
        if has_labels:
            new_labels.extend([labels[i]] * len(sents))

    if has_labels:
        return list(zip(new_docs, new_labels))
    return new_docs

def my_cleaner(docs):
    '''
    Normalise whitespace in a list of documents.

    Every run of whitespace characters (spaces, tabs, newlines) inside a
    document is replaced by a single space. Leading and trailing whitespace is
    collapsed the same way, not stripped. Returns a new list with one cleaned
    string per input document, in the same order.
    '''
    whitespace_run = re.compile(r'\s+')
    return [whitespace_run.sub(' ', doc) for doc in docs]

def load_classification_docs(doc_path, mappings_path, train_frac=0.7, seed=None):
    '''
    Build shuffled train/test sentence sets for classification.

    Documents are loaded with ``process_file``, whitespace-normalised with
    ``my_cleaner``, split into labelled sentences with ``my_sentencizer``,
    shuffled, and divided into a training and a testing part.

    Parameters
    ----------
    doc_path : str
        CSV file with the labelled documents.
    mappings_path : str
        CSV file with the mapped methods (passed through to ``process_file``).
    train_frac : float, optional
        Fraction of the examples placed in the training part (default 0.7).
    seed : int, optional
        When given, the shuffle uses its own ``random.Random(seed)`` so the
        split is reproducible. When None, the module-level ``random.shuffle``
        is used, as before.

    Returns
    -------
    tuple of (list, list, list, list)
        ``trainX, trainY, testX, testY``: sentences and their labels for the
        training part, then for the testing part.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside the range 0..1.
    '''
    if not 0 <= train_frac <= 1:
        raise ValueError('train_frac must be between 0 and 1')

    meth_labels, docs, _ = process_file(doc_path, mappings_path)
    learning_examples = my_sentencizer(my_cleaner(docs), meth_labels)

    if seed is None:
        random.shuffle(learning_examples)
    else:
        random.Random(seed).shuffle(learning_examples)

    # Compute the cut point once so both slices are guaranteed to agree.
    split_at = int(len(learning_examples) * train_frac)
    training = learning_examples[:split_at]
    testing = learning_examples[split_at:]

    trainX = [sentence for sentence, _ in training]
    trainY = [label for _, label in training]
    testX = [sentence for sentence, _ in testing]
    testY = [label for _, label in testing]

    return trainX, trainY, testX, testY
    
    


In [4]:
# Load the spaCy pipeline ('en_core_web_sm' was the alternative noted in the original comment).
spacy_model_name = 'en_core_web_lg'
model = spacy.load(spacy_model_name)

In [5]:
# Sentence-level train/test split built from the annotated documents.
classification_split = load_classification_docs(annotated_path, mappings_path)
trainX, trainY, testX, testY = classification_split

In [6]:

# Register the custom retokenizer at the front of the pipeline. Drop a
# previously registered copy first so this cell can be re-run; checking
# pipe_names avoids treating every ValueError as "already registered".
if 'my_retokenizer' in model.pipe_names:
    model.remove_pipe('my_retokenizer')
model.add_pipe(my_retokenizer, first=True)

# Run the training sentences through the pipeline with the parser, tagger
# and NER components disabled.
processed_docs = list(model.pipe(trainX, disable=['parser', 'tagger', 'ner']))
    

In [11]:
# Flatten the processed documents once, then derive every view from that.
all_tokens = [tok for doc in processed_docs for tok in doc]
word_tokens = [tok for tok in all_tokens if not tok.is_punct]
oov_tokens = [tok for tok in word_tokens if tok.is_oov]

oovs = [tok.text for tok in oov_tokens]
oovs_lem = [tok.lemma_ for tok in oov_tokens]
lemmas = [tok.lemma_ for tok in word_tokens]
toks = [tok.text for tok in all_tokens]
toks_no_punct = [tok.text for tok in word_tokens]

oov_count = Counter(oovs)
oov_lem_count = Counter(oovs_lem)
tok_count = Counter(toks)
tok_no_punct_count = Counter(toks_no_punct)
lemma_count = Counter(lemmas)

# Out-of-vocabulary lemmas that never occur as an out-of-vocabulary surface
# form. Membership is tested on the Counter itself instead of rebuilding a
# list of its keys for every lemma.
lemma_diff_toks_oov = [lemma for lemma in oov_lem_count if lemma not in oov_count]

In [13]:
# Corpus-size summary, followed by the most frequent out-of-vocabulary items.
summary_rows = [
    ('Total tokens', len(toks)),
    ('Total tokens (not punctuation)', len(toks_no_punct)),
    ('Total unique tokens', len(tok_count)),
    ('Total unique tokens (no punctuation)', len(tok_no_punct_count)),
    ('Total unique lemmas', len(lemma_count)),
    ('Out of vocab tokens:', len(oovs)),
    ('Out of vocab unique tokens:', len(oov_count)),
    ('Out of vocab unique lemmas:', len(oov_lem_count)),
    ('Most common out of vocab tokens:', oov_count.most_common(20)),
    ('Most common out of vocab lemmas:', oov_lem_count.most_common(20)),
    ('OOV lemmas not in tokens:', lemma_diff_toks_oov[:20]),
]
for caption, value in summary_rows:
    print(caption, value)

Total tokens 216558
Total tokens (not punctuation) 193132
Total unique tokens 7954
Total unique tokens (no punctuation) 7926
Total unique lemmas 6603
Out of vocab tokens: 8397
Out of vocab unique tokens: 2542
Out of vocab unique lemmas: 2542
Most common out of vocab tokens: [('number', 156), ('Manifest.permission', 140), ('Drawable', 90), ('ID', 89), ('PendingIntent', 85), ('com.android.contacts', 68), ('android.graphics', 43), ('BroadcastReceiver', 40), ('meta', 40), ('android.content', 39), ('dataPosition()', 36), ('ShowAll', 34), ('LayoutParams', 31), ('X', 30), ('root', 29), ('prepare()', 28), ('VERSION_CODES.P', 27), ('\\([0', 27), ('WindowManager', 26), ('mutate()', 26)]
Most common out of vocab lemmas: [('numb', 156), ('Manifest.permission', 140), ('Drawable', 90), ('ID', 89), ('PendingIntent', 85), ('com.android.contacts', 68), ('android.graphics', 43), ('BroadcastReceiver', 40), ('meta', 40), ('android.content', 39), ('dataPosition()', 36), ('ShowAll', 34), ('LayoutParams', 31

In [13]:

# Create a text-categoriser component for the spaCy pipeline held in `model`.
# Any 'textcat' component already in the pipeline is removed first so the
# cell can be re-run.
# NOTE(review): as in the original cell, the new component is only created,
# not added to the pipeline — confirm whether add_pipe should follow.
textcat_config = {"exclusive_classes": False, "architecture": "simple_cnn"}
if 'textcat' in model.pipe_names:
    model.remove_pipe('textcat')
textcat = model.create_pipe("textcat", config=textcat_config)
        
        
    

I
called
equals(Object )
and
received
x


In [31]:
# One space-separated line of tokens per processed document.
prepped_texts = [' '.join(tok.text for tok in doc) for doc in processed_docs]

# NOTE(review): the loop body was missing from the saved cell. It is
# reconstructed here from the recorded output (each line was printed) and
# from the file being opened for writing — confirm this matches the intent.
with open(prepped_file, 'w', encoding='utf-8') as f:
    for line in prepped_texts:
        f.write(line + '\n')
        print(line)

Describe the kinds of special objects contained in this Parcelable instance 's marshaled representation .
For example if the object will include a file descriptor in the output of writeToParcel(android.os . Parcel int ) the return value of this method must include the CONTENTS_FILE_DESCRIPTOR bit .
Return the current setClipData(ClipData) which you can modify in - place .
Return Bundle for extra vendor - specific data that can be modified and examined .
Returns the current setIntent(Intent) if one is set else the default Intent obtained from Activity.getIntent .
Can be modified in - place .
Returns the current setStructuredData(String) .
Return the content 's web URI as per setWebUri(android.net.Uri) or null if there is none .
Returns whether or not the current Intent was explicitly provided in Activity.onProvideAssistContent .
If not the Intent was automatically set based on Activity.getIntent .
Returns whether or not the current getWebUri() was explicitly provided in Activity.onProvi

In [None]:
# Write one document per line to the file used for fastText training.
# NOTE(review): `docs1` is only assigned inside the commented-out comparison
# cell near the top of the notebook, so this raises NameError on a fresh
# kernel — confirm which corpus should be written here.
with open(prepped_file, 'w',encoding='utf-8') as f:
    f.writelines(line + "\n" for line in docs1)
    

In [None]:
# Train unsupervised fastText vectors on the prepared text file.
# Note: this rebinds `model`, which earlier cells use for the spaCy pipeline.
fasttext_options = {'model': 'cbow', 'wordNgrams': 1, 'neg': 10, 'ws': 8}
model = fasttext.train_unsupervised(prepped_file, **fasttext_options)

In [None]:
# Nearest neighbours of the word 'typo' in the trained vector space.
typo_neighbors = model.get_nearest_neighbors('typo')
print(typo_neighbors)

In [None]:
# Nearest neighbours of 'argument'; shown as the cell's output value.
argument_neighbors = model.get_nearest_neighbors('argument')
argument_neighbors

In [None]:
# The return value of this call is discarded (it is not the cell's last expression).
model.get_subwords('subwords')

# Shape of the model's input matrix.
input_matrix = model.get_input_matrix()
print(input_matrix.shape)

In [None]:
from spacy.vocab import Vocab

# Build a spaCy Vocab in which every fastText label gets the mean of the
# input vectors of its subwords.
vocab = Vocab()
for i, key in enumerate(model.labels):
    sws, idxs = model.get_subwords(str(key))
    # Average over however many subwords this label has; the divisor used to
    # be a hard-coded 7.
    pieces = np.mean([model.get_input_vector(int(idx)) for idx in idxs], axis=0)
    if i == 0:
        # Show the first label's subwords, ids and vector as a sanity check.
        # The loop used to `break` here, so set_vector below never ran.
        print(list(sws))
        print(idxs[0])
        print((sws, idxs))
        print(pieces)
    vocab.set_vector(key, pieces)

### print(vocab.get_vector('the'))
print(model.get_output_matrix()[0])
print(model.labels[0])
print(model['the'])
print(model.get_subwords('the'))

In [None]:
# First row of the model's output matrix.
first_output_row = model.get_output_matrix()[0]
print(first_output_row)

In [None]:
# Report whether the loaded model is quantized.
quantized = model.is_quantized()
print(quantized)

In [None]:
# List the attributes and methods available on the model object.
model_attributes = dir(model)
print(model_attributes)

In [None]:
# Introspect the function behind model.get_subwords: its attributes, the
# attributes of its code object, and its default argument values.
get_subwords_func = model.get_subwords.__func__
print(dir(get_subwords_func))
print(dir(get_subwords_func.__code__))
print(get_subwords_func.__defaults__)

In [None]:
# Map every subword id to its subword string for all whitespace-separated
# words in `docs`.
# NOTE(review): `docs` is not assigned at notebook scope in any visible cell
# (only inside functions) — confirm which collection is meant.
labs = model.labels
sw, idx = model.get_subwords(labs[0])
subword_dict = {}
all_words = (lab for doc in docs for lab in doc.split())
for i, lab in enumerate(all_words):
    if i % 10000 == 0:
        # progress marker every 10000 words
        print(i, lab)
    sws, idxs = model.get_subwords(lab)
    subword_dict.update(zip(idxs.tolist(), sws))
print(len(subword_dict))
#print(model.get_input_matrix().shape)

In [None]:
# Number of distinct subword strings collected in subword_dict.
unique_subwords = set(subword_dict.values())
print(len(unique_subwords))

In [None]:
# Same id -> subword mapping as a plain nested loop; a later occurrence of an
# id overwrites an earlier one.
# NOTE(review): `docs` is not assigned at notebook scope in any visible cell.
d = {}
for doc in docs:
    for lab in doc.split():
        sws, idxs = model.get_subwords(lab)
        for sw, idx in zip(sws, idxs):
            d[idx] = sw
            

In [None]:
# Show every subword string stored in the mapping.
subword_strings = d.values()
print(subword_strings)

In [None]:
from spacy.vocab import Vocab

# Store the fastText input vector of every collected subword id in a fresh
# spaCy Vocab, keyed by the subword string.
vocab = Vocab()
for subword_id in d:
    subword = d[subword_id]
    vocab.set_vector(subword, model.get_input_vector(subword_id))

In [None]:
# Save the current model under a scratch file name.
s_spot = 'save_spot.bin'
model.save_model(path=s_spot)

In [None]:
from spacy.language import Language

# Copy the trained fastText word vectors into a blank spaCy pipeline.
# The file written by model.save_model() is fastText's binary model format,
# not a text vector file with a "rows dims" header, so it cannot be parsed
# line by line. Read the vectors from the in-memory model instead.
nlp = Language()
nlp.vocab.reset_vectors(width=model.get_dimension())
for word in model.get_words():
    nlp.vocab.set_vector(word, model.get_word_vector(word))  # add the vectors to the vocab

# test the vectors and similarity
text = "class colspan"
doc = nlp(text)
print(text, doc[0].similarity(doc[1]))

In [None]:
# Show the model's labels, then save the model under the configured name.
vocabulary_labels = model.labels
print(vocabulary_labels)
model.save_model(save_name)

In [None]:
# Reload the fastText model from the file name configured in `save_name`.
reloaded_model = fasttext.load_model(save_name)
model = reloaded_model