
### Join two dicts summing the counts of the same key

In [1]:
from collections import defaultdict
import doctest

In [2]:
def update_dict_counts(old_dict, new_dict):
    """
    Merge two count dicts, adding together the values stored under equal keys.

    Neither input is modified; a new plain ``dict`` is returned. A key that
    is present in only one of the inputs keeps its value unchanged.

    >>> old_dict = {1: 2, 3: 3, 4: 2}
    >>> new_dict = {1: 6, 3: 7}
    >>> update_dict_counts(old_dict, new_dict)
    {1: 8, 3: 10, 4: 2}
    """
    totals = defaultdict(int)

    # Walk the old counts first, then the new ones; a missing key starts at 0.
    for source in (old_dict, new_dict):
        for key, value in source.items():
            totals[key] += value

    # Hand back a plain dict so callers do not inherit the defaultdict's
    # insert-on-missing-key behaviour.
    return dict(totals)

In [3]:
# Same inputs as the docstring example of update_dict_counts.
old_dict = {1: 2, 3: 3, 4: 2}
new_dict = {1: 6, 3: 7}
mer_dict = update_dict_counts(old_dict, new_dict)

In [4]:
# Show the merged counts produced by the previous cell.
mer_dict

{1: 8, 3: 10, 4: 2}

In [5]:
# Run the docstring examples found in the current module (here: the notebook
# namespace); the returned TestResults reports failed / attempted examples.
doctest.testmod()

TestResults(failed=0, attempted=3)

### Remove from a dict the elements that satisfy a condition stored in an array

Suppose you have a dict mapping words to positions, and you want a new dict containing only the words that appeared at least `infimum` times.

The count of each word is stored in an array, indexed by the position that the dict assigns to that word.


In [244]:
%load_ext memory_profiler

import numpy as np
import string
import random
import copy 
import timeit

def get_random_string(length, letters=string.ascii_lowercase):
    """Return a random string of ``length`` characters drawn from ``letters``.

    Every character is picked independently with ``random.choice``, so
    characters may repeat. ``letters`` defaults to the lowercase ASCII
    alphabet. A ``length`` of 0 yields the empty string.
    """
    picked = []
    for _ in range(length):
        picked.append(random.choice(letters))
    return ''.join(picked)

s_len = 8
n_strings = 1_000_000
SEED = 0

# Seed both generators so the benchmark data is identical on every
# Restart & Run All (words come from `random`, counts from `np.random`).
random.seed(SEED)
np.random.seed(SEED)

# word -> position. Random words can collide; a colliding word keeps only its
# last position, so len(d) may end up slightly below n_strings.
d = {get_random_string(s_len): k for k in range(n_strings)}
# One count in [1, 99] for every position 0..n_strings-1.
counts = np.random.randint(1, 100, n_strings)
# Words whose count is below this threshold are removed.
infimum = 20

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [267]:
def remove_lower_than_infimum_1(d, infimum, counts):
    """
    Drop, in place, every word whose count is below ``infimum`` and renumber the rest.

    NOTE: the dict passed in is mutated -- rejected words are deleted from
    it, and the words left in it keep their *old* positions. Pass a copy if
    the original must be preserved. Only the returned dict is renumbered.

    Parameters
    ----------
    d : dict
        Maps each word to its index in ``counts``.
    infimum : number
        Minimum count (inclusive) a word needs in order to be kept.
    counts : indexable
        ``counts[pos]`` is the count of the word stored at position ``pos``.

    Returns
    -------
    (dict, numpy.ndarray)
        A new dict mapping the surviving words to consecutive positions
        starting at 0, and the array of their counts in that same order.

    Also prints a one-line summary of the original size, the new size and
    the percentage of words kept.
    """
    len_original_d = len(d)
    new_counts = []
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating over it directly raises RuntimeError.
    for word in list(d):
        count = counts[d[word]]
        if count < infimum:
            del d[word]
        else:
            new_counts.append(count)

    # Rebinds the local name only; the caller's dict is not renumbered.
    d = {w: p for p, w in enumerate(d)}
    # An empty input would otherwise divide by zero when computing the percentage.
    percentage = round(100*len(d)/len_original_d, 2) if len_original_d else 0.0
    print(f'{len_original_d}/{len(d)}/{percentage}% (original, new, percentage of original)')
    return d, np.array(new_counts)

In [268]:
# remove_lower_than_infimum_1 deletes keys from the dict it receives, so time
# it on a copy and leave `d` intact for the other variant.
d_copy = copy.deepcopy(d)
%time d_new1, new_counts1 = remove_lower_than_infimum_1(d_copy, infimum, counts)

999995/807897/80.79% (original, new, percentage of original)
CPU times: user 485 ms, sys: 7.13 ms, total: 493 ms
Wall time: 494 ms


In [273]:
def remove_lower_than_infimum_2(d, infimum, counts):
    """
    Build a new, renumbered dict holding only the words whose count is at least ``infimum``.

    Unlike the in-place variant, the dict passed in is left untouched.

    Parameters
    ----------
    d : dict
        Maps each word to its index in ``counts``.
    infimum : number
        Minimum count (inclusive) a word needs in order to be kept.
    counts : indexable
        ``counts[pos]`` is the count of the word stored at position ``pos``.

    Returns
    -------
    (dict, numpy.ndarray)
        A new dict mapping the surviving words to consecutive positions
        starting at 0, and the array of their counts in that same order.

    Also prints a one-line summary of the original size, the new size and
    the percentage of words kept.
    """
    len_original_d = len(d)

    new_counts = []
    d_new = {}
    for word, pos in d.items():
        count = counts[pos]
        if count >= infimum:
            new_counts.append(count)
            # Next free position: survivors are numbered in the order they are kept.
            d_new[word] = len(d_new)

    # No `del d` here: it would only unbind the local name, and the caller
    # still holds a reference to the dict, so nothing would be freed.
    # An empty input would otherwise divide by zero when computing the percentage.
    percentage = round(100*len(d_new)/len_original_d, 2) if len_original_d else 0.0
    print(f'{len_original_d}/{len(d_new)}/{percentage}% (original, new, percentage of original)')

    return d_new, np.array(new_counts)

In [274]:
# Fresh copy of `d`, so this timing starts from the same input as the
# previous one (whose call deleted entries from its own copy).
d_copy = copy.deepcopy(d)
%time d_new2, new_counts2 = remove_lower_than_infimum_2(d_copy, infimum, counts)

999995/807897/80.79% (original, new, percentage of original)
CPU times: user 323 ms, sys: 5.92 ms, total: 329 ms
Wall time: 330 ms


What about memory?

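Almost all options use more or less the same amount of memory. Note that in the recorded outputs below, the second measurement reports `807897/807897/100.0%`: it ran on a dict that the first measurement had already filtered in place, so the two increments are not directly comparable.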

In [241]:
%memit remove_lower_than_infimum_1(d_copy, infimum, counts)

999998/807897/80.79% (original, new, percentage of original)
peak memory: 1324.57 MiB, increment: 5.20 MiB


In [242]:
%memit remove_lower_than_infimum_2(d_copy, infimum, counts)

807897/807897/100.0% (original, new, percentage of original)
peak memory: 1319.25 MiB, increment: 0.03 MiB
