In [1]:
from elasticsearch import Elasticsearch
from utils.csv import get_stopwords

In [2]:
# Shared Elasticsearch client used by every cell below, seeded with three hosts.
# Per the elasticsearch-py client docs, the sniffing options ask the client to
# discover cluster nodes at startup, re-sniff when a connection fails, and
# refresh the node list every 60 seconds.
es = Elasticsearch(['es01','es02','es03'],  
                   sniff_on_start=True,
                   sniff_on_connection_fail=True,
                   sniffer_timeout=60)

In [3]:
def scroll_all_ids(es, index, fields=None, pagesize=25, scroll_timeout="1m", **kwargs):
    """Yield the ids of every document matched in ``index``, one page at a time.

    The first page comes from ``es.search`` (opened with a scroll context) and
    the following pages from ``es.scroll``. Iteration stops at the first page
    that contains no hits. The scroll context is released with
    ``es.clear_scroll`` when the generator finishes, is closed early, or fails.

    Args:
        es: Client object exposing ``search``, ``scroll`` and ``clear_scroll``.
        index: Name of the index to search.
        fields: Source fields passed as ``_source_includes``. ``None`` (the
            default) is treated as an empty list.
        pagesize: Maximum number of hits requested per page (``size``).
        scroll_timeout: How long the scroll context is kept alive between
            requests, e.g. ``"1m"``.
        **kwargs: Extra keyword arguments forwarded to ``es.search``. They are
            also forwarded to ``es.scroll``, except ``body``: the scroll
            request supplies its own body, so forwarding a search body would
            raise ``TypeError`` (duplicate keyword) on the second page.

    Yields:
        list: The ``_id`` of each hit on the current page.
    """
    # Avoid a shared mutable default; an empty list matches the old default.
    if fields is None:
        fields = []
    # The search body (query) only applies to the initial request.
    scroll_kwargs = {key: value for key, value in kwargs.items() if key != "body"}
    scroll_id = None
    try:
        result = es.search(index=index,
                           scroll=scroll_timeout,
                           _source_includes=fields,
                           size=pagesize,
                           **kwargs)
        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            # Stop after no more docs
            if not hits:
                break
            # Yield the ids of this page
            yield [hit['_id'] for hit in hits]
            # Continue scrolling
            result = es.scroll(body={
                    "scroll_id": scroll_id,
                    "scroll": scroll_timeout
                }, **scroll_kwargs)
    finally:
        # Free the server-side scroll context instead of waiting for it to
        # time out. A 404 (context already expired) is not an error here.
        if scroll_id is not None:
            es.clear_scroll(body={"scroll_id": [scroll_id]}, ignore=(404,))
    

In [4]:
def generate_combined_tags(index, ids, fields):
    """Yield ``(term, statistics)`` pairs for a batch of documents.

    Requests term vectors for ``ids`` from the module-level ``es`` client via
    ``mtermvectors`` (positions and offsets disabled, term statistics enabled)
    and flattens every returned document with
    ``combine_tags_with_statistics``.

    Args:
        index: Name of the index holding the documents.
        ids: Document ids to fetch term vectors for.
        fields: Fields whose term vectors are requested.

    Yields:
        tuple: ``(term, statistics)`` as produced by
        ``combine_tags_with_statistics``.
    """
    result = es.mtermvectors(index=index, ids=ids, fields=fields,
                             positions=False,
                             offsets=False,
                             term_statistics=True)
    for doc in result['docs']:
        # Response entries may come back without a 'term_vectors' key (for
        # example a document that was not found); skip them instead of
        # raising KeyError halfway through the stream.
        yield from combine_tags_with_statistics(doc.get('term_vectors', {}))

In [5]:
def combine_tags_with_statistics(term_vectors_by_field):
    """Flatten per-field term vectors into a stream of ``(term, statistics)`` pairs.

    Args:
        term_vectors_by_field: Mapping whose values each hold a ``'terms'``
            mapping of term -> statistics.

    Yields:
        tuple: One ``(term, statistics)`` item per term per field, in mapping
        order. A term present in several fields is yielded once per field.
    """
    for field_vector in term_vectors_by_field.values():
        for term_entry in field_vector['terms'].items():
            yield term_entry

In [6]:
def is_valid_term(term,stop_words=get_stopwords()):
    """Decide whether ``term`` should be kept as a tag.

    A term is rejected when any of the following holds:
      * it is contained in ``stop_words`` or is the literal ``'https'``;
      * what remains after removing every ``'.'``, ``'x'`` and ``':'``
        character is numeric (e.g. ``'1.5'``, ``'10:30'``, ``'1920x1080'``);
      * it starts with ``'0x'``.

    Args:
        term: Candidate term (string).
        stop_words: Collection of words to reject; the default is computed
            once, when the function is defined.

    Returns:
        bool: ``True`` if the term passes every check, ``False`` otherwise.
    """
    is_excluded_word = term in stop_words or term in ['https']
    if is_excluded_word:
        return False
    # Drop the separator-like characters before testing for a pure number.
    stripped = term
    for separator in ('.', 'x', ':'):
        stripped = stripped.replace(separator, '')
    return not (stripped.isnumeric() or term.startswith('0x'))
    

In [7]:
def aggregate_tags(tags, stop_words=get_stopwords()):
    """Merge a stream of ``(term, statistics)`` pairs into one dict per term.

    Terms rejected by ``is_valid_term`` are skipped. For each remaining term
    the statistics of its first occurrence are kept, and the ``'term_freq'``
    of every later occurrence is added to it; the other statistics are left
    at their first-seen values.

    Args:
        tags: Iterable of ``(term, statistics)`` pairs, where ``statistics``
            is a mapping containing at least ``'term_freq'``.
        stop_words: Words to exclude; forwarded to ``is_valid_term``.

    Returns:
        dict: term -> aggregated statistics dict.
    """
    aggregated_tags = {}
    for term, statistics in tags:
        # Honour the caller's stop words (previously accepted but ignored).
        if not is_valid_term(term, stop_words=stop_words):
            continue
        if term in aggregated_tags:
            aggregated_tags[term]['term_freq'] += statistics['term_freq']
        else:
            # Copy so that summing below never mutates the caller's dict.
            aggregated_tags[term] = dict(statistics)
    return aggregated_tags

In [9]:
# Fields of the 'tasks' index whose term vectors are requested below.
task_fields=['task','task_description','project','project_description']

In [10]:
# Lazy stream of (term, statistics) pairs for every document in 'tasks':
# each page of ids from scroll_all_ids is passed to generate_combined_tags.
# Nothing is fetched until the generator is consumed.
task_tags = (tag for task_ids in scroll_all_ids(es,'tasks') 
             for tag in generate_combined_tags(index='tasks', 
                               ids=task_ids, fields=task_fields))

In [11]:
# Consumes the task_tags generator (this is where the requests actually run)
# and collapses it into one statistics dict per valid term.
aggregated_task_tags = aggregate_tags(task_tags)

In [12]:
def sort_by_statistics(terms, ordered_fields=['doc_freq','term_freq','ttf']):
    """Rank terms by their statistics, highest first.

    Args:
        terms: Mapping of term -> statistics mapping.
        ordered_fields: Statistic names used as sort keys, most significant
            first. Each must be present in every statistics mapping.

    Returns:
        list: Tuples of ``(*statistic values, term)`` sorted in descending
        order; ties on all statistics fall back to the term itself.
    """
    rows = (
        tuple(stats[field] for field in ordered_fields) + (term,)
        for term, stats in terms.items()
    )
    return sorted(rows, reverse=True)
            
    

In [13]:
# Display task tags ranked by doc_freq, then term_freq, then ttf (descending);
# each row is (doc_freq, term_freq, ttf, term).
sort_by_statistics(aggregated_task_tags)

[(28, 45, 28, 'driver'),
 (23, 30, 23, 'test'),
 (20, 45, 20, 'schematic'),
 (19, 19, 19, 'usb'),
 (18, 22, 19, 'boot'),
 (17, 20, 17, 'camera'),
 (17, 17, 17, 'solutions'),
 (17, 17, 17, 'reader'),
 (17, 17, 17, 'fingerprint'),
 (16, 86, 16, 'support'),
 (16, 36, 16, 'pcb'),
 (16, 34, 16, 'layout'),
 (15, 21, 15, 'lpc5500'),
 (15, 15, 15, 'healthcare'),
 (13, 14, 13, 'routing'),
 (13, 13, 13, 'clinic'),
 (12, 24, 12, 'mini'),
 (12, 13, 12, 'imx'),
 (12, 12, 12, '8m'),
 (11, 21, 11, 'integration'),
 (11, 16, 11, 'ddr'),
 (11, 13, 11, 'calibration'),
 (11, 13, 11, 'android'),
 (10, 29, 12, 'nxp'),
 (10, 10, 10, 'systems'),
 (10, 10, 10, 'imx8mm'),
 (9, 30, 9, 'linux'),
 (9, 25, 11, 'custom'),
 (9, 15, 9, 'display'),
 (9, 12, 11, 'm4'),
 (9, 10, 9, 'imx6ul'),
 (9, 9, 9, 'simulations'),
 (9, 9, 9, 'peripherial'),
 (8, 13, 8, 'touch'),
 (8, 10, 8, 'wifi'),
 (8, 8, 8, 'cap'),
 (7, 41, 8, 'bsp'),
 (7, 14, 7, 'software'),
 (7, 11, 7, 'gpio'),
 (7, 9, 7, 'ethernet'),
 (7, 8, 7, 'tee'),
 (7, 8,

In [14]:
# Same pipeline as for tasks, applied to the 'text' field of the 'threads' index.
thread_fields=['text']

# Lazy stream of (term, statistics) pairs; nothing is fetched until consumed.
thread_tags = (tag for thread_ids in scroll_all_ids(es,'threads') 
             for tag in generate_combined_tags(index='threads', 
                               ids=thread_ids, fields=thread_fields))

# Consumes the generator and collapses it into one statistics dict per valid term.
aggregated_thread_tags = aggregate_tags(thread_tags)

In [15]:
# Display thread tags ranked by doc_freq, then term_freq, then ttf (descending);
# each row is (doc_freq, term_freq, ttf, term).
sort_by_statistics(aggregated_thread_tags)

[(59, 451, 451, 'board'),
 (53, 240, 240, 'support'),
 (51, 196, 196, 'nxp'),
 (49, 460, 460, 'device'),
 (43, 243, 243, 'linux'),
 (43, 199, 199, 'code'),
 (43, 127, 127, 'system'),
 (43, 81, 81, 'request'),
 (41, 523, 523, 'boot'),
 (41, 94, 94, 'comments'),
 (40, 81, 81, 'keep'),
 (38, 95, 95, 'current'),
 (37, 201, 201, 'test'),
 (37, 121, 121, 'version'),
 (36, 111, 111, 'hardware'),
 (36, 93, 93, 'software'),
 (36, 67, 67, 'confirm'),
 (35, 189, 189, 'c'),
 (34, 203, 203, 'design'),
 (34, 96, 96, 'change'),
 (34, 89, 89, 'provide'),
 (34, 60, 60, 'soon'),
 (32, 214, 214, 'kernel'),
 (32, 203, 203, 'image'),
 (32, 107, 107, 'changes'),
 (32, 85, 85, 'case'),
 (32, 48, 48, 'look'),
 (31, 96, 96, 'configuration'),
 (31, 63, 63, 'well'),
 (30, 134, 134, 'br'),
 (30, 55, 55, 'updates'),
 (29, 162, 162, 'build'),
 (29, 86, 86, 'interface'),
 (29, 52, 52, 'connected'),
 (28, 636, 636, 'usb'),
 (28, 140, 140, 'imx'),
 (28, 57, 57, 'value'),
 (28, 53, 53, 'im'),
 (28, 42, 42, 'close'),
 (

In [15]:
# Spot check: show the first (term, statistics) pair produced for one thread.
# Note that `ids` is given a single id string here rather than a list.
next(generate_combined_tags(index='threads', 
                               ids='hBKP728BJn_G0KX-yeNc', fields=['text']))

('2000', {'doc_freq': 2, 'ttf': 2, 'term_freq': 1})

In [125]:
# Scratch check: the bare prefix '0x' itself matches startswith('0x'),
# so is_valid_term rejects it as well.
'0x'.startswith('0x')

True