In [1]:
import spacy
nlp = spacy.load("en_core_web_lg")
import pandas as pd
import wordninja as wn


In [None]:
def find_words(instring, prefix = '', words = None):
    """Split ``instring`` into dictionary words by exhaustive recursive search.

    Parameters:
        instring: the text that still has to be segmented.
        prefix:   characters already consumed from the left that have not yet
                  been accepted as a word (used by the recursion; callers
                  normally leave it empty).
        words:    container of known words. When None, the word list is read
                  from /usr/share/dict/words on that call.

    Returns:
        A list of words whose concatenation is ``prefix + instring``. Among
        all segmentations, the one whose list of word lengths compares
        greatest is returned (i.e. longest leading words are preferred).

    Raises:
        ValueError: if no segmentation covers every character.
    """
    if not instring:
        if prefix:
            # Input is exhausted but the pending characters never formed a
            # word: this branch is a dead end, not an empty success.
            raise ValueError('no solution')
        return []
    if words is None:
        with open('/usr/share/dict/words') as f:
            words = {line.strip() for line in f}
    if (not prefix) and (instring in words):
        return [instring]
    # Move one character from the remaining input onto the pending prefix.
    prefix, suffix = prefix + instring[0], instring[1:]
    solutions = []
    # Case 1: prefix in solution
    if prefix in words:
        try:
            solutions.append([prefix] + find_words(suffix, '', words))
        except ValueError:
            pass
    # Case 2: prefix not in solution
    try:
        solutions.append(find_words(suffix, prefix, words))
    except ValueError:
        pass
    if not solutions:
        raise ValueError('no solution')
    # max() returns the first maximal element, which is what the previous
    # stable sorted(..., reverse=True)[0] selected.
    return max(solutions, key = lambda solution: [len(word) for word in solution])

In [None]:
from math import log

# Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
# The word at (0-based) position i in the file gets cost log((i+1) * log(len(words))),
# so the file is presumably ordered most-frequent first -- confirm against the data.
with open("words-by-frequency.txt") as _word_file:  # close the handle deterministically
    words = _word_file.read().split()
# log(len(words)) is identical for every word, so compute it once.
_log_vocab_size = log(len(words))
wordcost = dict((k, log((i+1)*_log_vocab_size)) for i,k in enumerate(words))
# Longest known word; bounds how far back infer_spaces has to look.
maxword = max(len(x) for x in words)

def infer_spaces(s):
    """Segment the unspaced string ``s`` into words via dynamic programming.

    Relies on the module-level ``wordcost`` mapping (word -> cost) and
    ``maxword`` (length of the longest known word). Substrings missing from
    ``wordcost`` are charged an infinite cost. Returns the chosen words
    joined by single spaces.
    """

    def best_match(i):
        # Cheapest way to end a word exactly at position i, given that
        # cost[0..i-1] is already filled in. Yields (total_cost, word_length).
        start = max(0, i - maxword)
        options = []
        for back, prior in enumerate(reversed(cost[start:i])):
            piece = s[i - back - 1:i]
            options.append((prior + wordcost.get(piece, 9e999), back + 1))
        return min(options)

    # Forward pass: cost[i] is the minimal cost of segmenting s[:i].
    cost = [0]
    for end in range(1, len(s) + 1):
        cost.append(best_match(end)[0])

    # Backward pass: walk from the end, peeling off the best last word.
    pieces = []
    pos = len(s)
    while pos > 0:
        total, length = best_match(pos)
        assert total == cost[pos]
        pieces.append(s[pos - length:pos])
        pos -= length

    pieces.reverse()
    return " ".join(pieces)
# Usage: result = infer_spaces(s), where s is the text with its spaces removed.

In [None]:
# Known words, one per line of the system dictionary (surrounding whitespace removed).
words = set()
with open('/usr/share/dict/words') as dictionary:
    words.update(entry.strip() for entry in dictionary)

# Memo table used by find_words: input string -> best segmentation found (or None).
solutions = {}
def find_words(instring):
    """Split ``instring`` into the fewest possible dictionary words.

    Uses the module-level ``words`` set as the dictionary and the
    module-level ``solutions`` dict as a memo table.

    Returns a list of words whose concatenation is ``instring``, or None
    when no such segmentation exists.
    """
    # First check if instring is in the dictionary
    if instring in words:
        return [instring]
    # No... But maybe it's a result we already computed
    if instring in solutions:
        return solutions[instring]
    # Nope. Try to split the string at every position to recursively search for results.
    best_solution = None
    # range(1, len(instring)) covers every split point, including the one
    # that leaves only the last character on the right-hand side.
    for i in range(1, len(instring)):
        part1 = find_words(instring[:i])
        # Both parts MUST have a solution; skip the right half if the left fails.
        if part1 is None:
            continue
        part2 = find_words(instring[i:])
        if part2 is None:
            continue
        solution = part1 + part2
        # Is the solution found "better" (fewer words) than the previous one?
        if best_solution is None or len(solution) < len(best_solution):
            best_solution = solution
    # Remember (memoize) this solution to avoid having to recompute it
    solutions[instring] = best_solution
    return best_solution

In [None]:
import re, string, random, glob, operator, heapq
from collections import defaultdict
from math import log10

def memo(f):
    """Wrap ``f`` so results are cached per positional-argument tuple.

    The cache dict is exposed on the wrapper as the ``memo`` attribute.
    Arguments must be hashable; keyword arguments are not supported.
    """
    cache = {}

    def fmemo(*args):
        if args in cache:
            return cache[args]
        value = cache[args] = f(*args)
        return value

    fmemo.memo = cache
    return fmemo

def test(verbose=None):
    """Run some tests, taken from the chapter.
    Since the hillclimbing algorithm is randomized, some tests may fail.

    Executes the doctests stored in 'ngrams-test.txt'; ``verbose`` is passed
    straight through to ``doctest.testfile``.
    """
    import doctest
    # print() call rather than the Python 2 print statement, which is a
    # SyntaxError under Python 3.
    print('Running tests...')
    doctest.testfile('ngrams-test.txt', verbose=verbose)

################ Word Segmentation (p. 223)

@memo
def segment(text):
    """Return the list of words that is the most probable segmentation of text.

    Every (first, remainder) split from ``splits`` is tried; the remainder is
    segmented recursively and the candidate with the highest ``Pwords`` wins.
    """
    if text:
        return max(([head] + segment(tail) for head, tail in splits(text)),
                   key=Pwords)
    return []

def splits(text, L=20):
    """List every (first, rem) split of text with 1 <= len(first) <= L.

    ``first`` is never empty; ``rem`` is empty for the split that takes the
    whole text (when the text is at most L characters long).
    """
    longest = min(L, len(text))
    return [(text[:cut], text[cut:]) for cut in range(1, longest + 1)]

def Pwords(words):
    """Naive Bayes probability of a word sequence: the product of Pw(w) over its words."""
    return product(map(Pw, words))

#### Support functions (p. 224)

def product(nums):
    """Return the product of a sequence of numbers (1 for an empty sequence)."""
    # reduce() is no longer a builtin in Python 3; functools.reduce is
    # available in both Python 2.6+ and Python 3.
    from functools import reduce
    return reduce(operator.mul, nums, 1)

class Pdist(dict):
    """A probability distribution estimated from counts in datafile.

    The dict itself maps key -> accumulated integer count; calling the
    instance with a key returns an estimated probability.

    Parameters:
        data:      iterable of (key, count) pairs; counts are converted with
                   int() and summed per key. The default list is only
                   iterated, never mutated.
        N:         total used as the denominator; when falsy, the sum of all
                   counts is used instead.
        missingfn: callable (key, N) -> probability for keys that are not in
                   the table; defaults to 1/N.
    """
    def __init__(self, data=[], N=None, missingfn=None):
        for key,count in data:
            self[key] = self.get(key, 0) + int(count)
        # values() works on both Python 2 and 3 (itervalues() is Python 2 only).
        self.N = float(N or sum(self.values()))
        self.missingfn = missingfn or (lambda k, N: 1./N)
    def __call__(self, key):
        "Return count/N for a known key, otherwise missingfn(key, N)."
        if key in self: return self[key]/self.N
        else: return self.missingfn(key, self.N)

def datafile(name, sep='\t'):
    """Read key,value pairs from file.

    Generator: yields ``line.split(sep)`` for every line of the file ``name``.
    Lines are not stripped, so the last field keeps its trailing newline.
    """
    # open() replaces the Python 2-only file() builtin; the with-block closes
    # the handle once the generator is exhausted or closed.
    with open(name) as handle:
        for line in handle:
            yield line.split(sep)

def avoid_long_words(key, N):
    """Probability estimate for an unknown word: 10 / (N * 10**len(key)).

    Each extra character divides the estimate by ten.
    """
    length_penalty = 10 ** len(key)
    return 10. / (N * length_penalty)

N = 1024908267229 ## Number of tokens

# Unigram distribution built from count_1w.txt; words missing from the table
# are scored by avoid_long_words.
Pw = Pdist(data=datafile('count_1w.txt'), N=N, missingfn=avoid_long_words)

#### segment2: second version, with bigram counts, (p. 226-227)

def cPw(word, prev):
    """Conditional probability of word, given previous word.

    Computed as the P2w entry for 'prev word' divided by the Pw entry for
    prev; when either lookup raises KeyError the result is Pw(word).
    """
    bigram = prev + ' ' + word
    try:
        estimate = P2w[bigram] / float(Pw[prev])
    except KeyError:
        estimate = Pw(word)
    return estimate

# Bigram counts read from count_2w.txt, with the same denominator N as Pw.
P2w = Pdist(data=datafile('count_2w.txt'), N=N)

@memo
def segment2(text, prev='<S>'):
    """Return (log P(words), words), where words is the best segmentation.

    ``prev`` is the word preceding ``text``. Each candidate is assembled by
    ``combine`` from the log10 conditional probability of the first word, the
    first word itself and the recursive result for the remainder.
    NOTE(review): ``combine`` is not defined in the visible part of this notebook.
    """
    if not text:
        return 0.0, []
    candidates = []
    for first, rem in splits(text):
        log_prob = log10(cPw(first, prev))
        candidates.append(combine(log_prob, first, segment2(rem, first)))
    return max(candidates)

In [2]:

# Candidate category labels that semantic_match compares each API string against.
# Every label is free of surrounding whitespace because the chosen label is
# copied verbatim into the output.
labels = ['Utility','Natural Language Processing','Application Performance Manager','Network','Database','Interpreter','Multi Thread','Error Handling','Logging','Language','Data Structure',
          'Software Development and IT Operations','Internationalization','Setup','Logic','Microservices','Machine Learning','Test','Search','Input and Output','User Interface','Parser','Security',
          'Cloud','Big Data','Event Handling','Application','Geographic Information System']
# Set to 'Y' to make semantic_match print its intermediate values.
debug = 'N'

def semantic_match(api):
    """Pick the entry of ``labels`` most similar to an API description string.

    The string is normalised by turning '.' and '/' into spaces; its last
    space-separated token is then broken into words with ``wn.split`` and the
    result is compared against every label using ``Doc.similarity`` of the
    module-level ``nlp`` pipeline.

    Parameters:
        api: text describing the API (e.g. package, name and class joined by spaces).

    Returns:
        The string '<label>,<similarity rounded to 2 decimals>'. When no label
        scores above 0 the label part is a single space and the score is 0.
    """
    if debug == 'Y':
        print(api, "<-api")

    # Treat package / path separators as word boundaries.
    api_clean = api.replace(".", " ").replace("/", " ")

    # Only the last token is split into words; everything before it is kept
    # as-is. Without any space both parts are the whole string.
    api_class_list = api_clean.rsplit(' ', 1)
    api_class = api_class_list[-1]
    api_begin = api_class_list[0]
    api_list = wn.split(api_class)
    api_classes = ' '.join(api_list)

    api_new = str(api_begin)+ ' '+ str(api_classes)
    if debug == 'Y':
        print('class list:',api_class_list)
        print('class:',api_class)
        print('list:',api_list)
        print('classes:',api_classes)

        print(type(api_new))
        print("api_new:",api_new)

    # The API text is the same for every label, so parse it once instead of
    # once per label.
    doc1 = nlp(api_new)

    max_sim = 0
    api_selected = ' '
    for label in labels:
        doc2 = nlp(label)
        similarity = doc1.similarity(doc2)
        if debug == 'Y':
            print(doc1, "<->", doc2, similarity)

        # Strict comparison: on ties the earliest label in ``labels`` wins.
        if (similarity > max_sim):
            max_sim = similarity
            api_selected = label
            if debug == 'Y':
                print("new max similarity:",similarity, 'doc2:', doc2)

    if debug == 'Y':
        print("max similarity:",max_sim, 'api_selected:', api_selected)

    return api_selected+','+str(round(max_sim,2))
                
            

            


In [3]:

def __main__(input_csv='./api_specific_class.csv', output_csv='./api_specific_expert.csv'):
    """Label every row of the API table with its best-matching category.

    Reads ``input_csv`` (which must provide the columns 'general', 'specific'
    and 'class'), joins those three columns into a 'corpus' string, stores
    ``semantic_match(corpus)`` in a new 'expert' column and writes the
    resulting table to ``output_csv``.

    Parameters:
        input_csv:  path of the CSV to read; defaults to the original
                    hard-coded location.
        output_csv: path of the CSV to write; defaults to the original
                    hard-coded location.
    """
    #csv created by: select general, specific, class, api_name from "API_specific" a, "API" b where a.api_name_fk = b.api_name

    api_specific = pd.read_csv(input_csv)
    api_specific["corpus"] = api_specific["general"].map(str)+ ' ' + api_specific["specific"].map(str)+ ' ' + api_specific["class"].map(str)

    api_specific['expert'] = api_specific['corpus'].apply(semantic_match)

    api_specific.to_csv(output_csv)
    


In [4]:
# Run the labelling step: reads the API CSV, adds the 'expert' column and writes the result CSV.
__main__()

  similarity = doc1.similarity(doc2)


In [5]:
# Reload the labelled table written by __main__().
api_expert = pd.read_csv('./api_specific_expert.csv')

# One row per 'general' value, holding the set of distinct 'expert' strings seen for it.
api_grouped = (
    api_expert
    .groupby('general')['expert']
    .apply(set)
    .reset_index()
)

api_grouped.head()

Unnamed: 0,general,expert
0,AdditionalAnswers,"{Geographic Information System ,0.54}"
1,AdditionalMatchers,"{Event Handling,0.5}"
2,Answers,"{Error Handling,0.52, Software Development and..."
3,ArgumentMatchers,"{Big Data,0.44}"
4,BDDMockito,"{Test,0.45}"


In [6]:
# Export the grouped labels as UTF-8 CSV for manual review.
api_grouped.to_csv( './api_expert_review.csv', encoding='utf-8')


In [10]:
# Raise pandas display limits so long sets and the whole table are shown untruncated.
pd.set_option("display.max_seq_items", 2000)
pd.set_option("display.max_colwidth", 900)
pd.set_option("display.max_rows", 999)

In [11]:
 api_grouped

Unnamed: 0,general,expert
0,AdditionalAnswers,"{Geographic Information System ,0.54}"
1,AdditionalMatchers,"{Event Handling,0.5}"
2,Answers,"{Error Handling,0.52, Software Development and IT Operations,0.58}"
3,ArgumentMatchers,"{Big Data,0.44}"
4,BDDMockito,"{Test,0.45}"
5,ExceededMemoryLimitException,"{Error Handling,0.6}"
6,ExceededSpillLimitException,"{Error Handling,0.53}"
7,JSci,"{Data Structure,0.53}"
8,Matchers,"{Event Handling,0.34}"
9,Mockito,"{Test,0.3, Parser,0.3, Parser,0.23, Parser,0.22}"
