In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base folder on the mounted Google Drive that holds this notebook's data files.
# Keep the trailing '/': file paths are built from it by plain string concatenation.
DIRECTORY = '/content/drive/My Drive/Informatics/Sphere@mail.ru/1_Data_Analysis/Lec_6/'

In [None]:
import random

In [None]:
def rand_char(lower='a', upper='z', size=1):
    """Lazily produce ``size`` random characters between ``lower`` and ``upper``.

    Every character is drawn with ``random.randint`` over the inclusive
    code-point range ``[ord(lower), ord(upper)]``, so the sequence is
    reproducible after ``random.seed``.

    Returns a generator: no random numbers are consumed until it is iterated.
    """
    def pick():
        # One uniformly chosen character from the inclusive range.
        return chr(random.randint(ord(lower), ord(upper)))

    return (pick() for _ in range(size))

# Модуль collections 😍

## defaultdict

In [None]:
from collections import defaultdict

In [None]:
random.seed(3476)

a = list(rand_char(lower='a', upper='d', size=12))
a

['d', 'c', 'b', 'a', 'a', 'a', 'a', 'b', 'c', 'b', 'b', 'd']

In [None]:
# Counting with a plain dict: a missing key has to be initialised by hand
# before it can be incremented.
counter = {}

for e in a:
    counter.setdefault(e, 0)
    counter[e] += 1

counter

{'a': 4, 'b': 4, 'c': 2, 'd': 2}

In [None]:
# int() returns 0, so a key seen for the first time starts from zero and
# no explicit initialisation of missing keys is needed.
counter = defaultdict(int)

for e in a:
    counter[e] += 1
    
counter

defaultdict(int, {'a': 4, 'b': 4, 'c': 2, 'd': 2})

При обращении к ключу, которого нет в `defaultdict`, он автоматически создаётся со значением по умолчанию (результатом вызова фабрики, здесь `int()`, то есть `0`).

In [None]:
print(counter['z'])
counter

0


defaultdict(int, {'a': 4, 'b': 4, 'c': 2, 'd': 2, 'z': 0})

`defaultdict` поддерживает интерфейс словарей.

In [None]:
counter.pop('z')
counter

defaultdict(int, {'a': 4, 'b': 4, 'c': 2, 'd': 2})

### Пример: списки смежности

Пусть есть ориентированный граф, заданный списком рёбер — пар вершин (начало, конец).

In [None]:
random.seed(1982)

# 12 random (source, target) pairs with vertices drawn from 'a'..'g';
# each pair is one directed edge, and self-loops / repeated edges are possible.
graph_pair = [ tuple(rand_char(lower='a', upper='g', size=2))
               for _ in range(12) ]
graph_pair

[('g', 'g'),
 ('g', 'a'),
 ('a', 'f'),
 ('e', 'f'),
 ('c', 'a'),
 ('g', 'e'),
 ('d', 'd'),
 ('c', 'g'),
 ('d', 'e'),
 ('c', 'd'),
 ('b', 'e'),
 ('g', 'c')]

In [None]:
# Adjacency list: vertex -> set of vertices reachable by one outgoing edge.
# defaultdict(set) creates the empty set on first access; vertices without
# outgoing edges never get a key.
graph_list = defaultdict(set)

for p_i, p_j in graph_pair:
    graph_list[p_i].add(p_j)
    
graph_list

defaultdict(set,
            {'a': {'f'},
             'b': {'e'},
             'c': {'a', 'd', 'g'},
             'd': {'d', 'e'},
             'e': {'f'},
             'g': {'a', 'c', 'e', 'g'}})

### Пример: label-кодирование элементов

In [None]:
random.seed(4987)

words = [ ''.join(rand_char(lower='a', upper='g', size=1))
          for _ in range(12) ]
words

['c', 'g', 'f', 'c', 'e', 'a', 'c', 'a', 'e', 'f', 'c', 'a']

In [None]:
# The factory runs only for a key that is seen for the first time, and at that
# moment len(categories) equals the number of labels already assigned, so every
# new element receives the next free integer code (0, 1, 2, ...).
categories = defaultdict(lambda : len(categories))

# Each lookup either assigns a code to a new element or reads the existing one.
[categories[e] for e in words]

[0, 1, 2, 0, 3, 4, 0, 4, 3, 2, 0, 4]

Предположим, мы хотим «заморозить» `defaultdict`, чтобы новые ключи больше не создавались автоматически.

In [None]:
categories

defaultdict(<function __main__.<lambda>>,
            {'a': 4, 'c': 0, 'e': 3, 'f': 2, 'g': 1})

In [None]:
# With default_factory set to None a missing key is no longer auto-created:
# the lookup below raises KeyError, just like a plain dict would.
categories.default_factory = None
categories['z']

KeyError: ignored

In [None]:
categories

defaultdict(None, {'a': 4, 'c': 0, 'e': 3, 'f': 2, 'g': 1})

In [None]:
# dict(...) builds a new plain dict with the same items (a shallow copy),
# so the result no longer carries any defaultdict behaviour.

print(repr(categories))
categories = dict(categories)
categories

defaultdict(None, {'c': 0, 'g': 1, 'f': 2, 'e': 3, 'a': 4})


{'a': 4, 'c': 0, 'e': 3, 'f': 2, 'g': 1}

## Counter

In [None]:
from collections import Counter

In [None]:
a = ['a', 'b', 'a', 'a', 'b', 'c']

counter = Counter(a)
counter['d'] += 1
counter

Counter({'a': 3, 'b': 2, 'c': 1, 'd': 1})

`Counter` поддерживает интерфейс словарей и, в отличие от `defaultdict`, не создаёт ключ при чтении отсутствующего элемента — просто возвращается `0`.

In [None]:
print(counter['f'])
counter

0


Counter({'a': 3, 'b': 2, 'c': 1, 'd': 1})

In [None]:
counter.pop('d')
counter

Counter({'a': 3, 'b': 2, 'c': 1})

`Counter` имеет и собственные методы.

In [None]:
counter.most_common()

[('a', 3), ('b', 2), ('c', 1)]

In [None]:
counter.most_common(2)

[('a', 3), ('b', 2)]

In [None]:
# Same seed as in the defaultdict section, so `a` repeats that sequence;
# `b` is drawn next from the same generator state.
random.seed(3476)

a = list(rand_char(lower='a', upper='d', size=12))
b = list(rand_char(lower='a', upper='d', size=12))

counter_a = Counter(a)
counter_b = Counter(b)

# Plant a negative count and an extra key to show how Counter arithmetic treats them.
counter_a['c'] = -1
counter_b['z'] = 3

print(repr(counter_a))
print(repr(counter_b))

Counter({'b': 4, 'a': 4, 'd': 2, 'c': -1})
Counter({'a': 4, 'd': 3, 'b': 3, 'z': 3, 'c': 2})


In [None]:
counter_a + counter_b    # a + b

Counter({'a': 8, 'b': 7, 'c': 1, 'd': 5, 'z': 3})

In [None]:
# Subtraction keeps only positive results: keys whose difference is <= 0 are dropped.

counter_a - counter_b    # a - b

Counter({'b': 1})

In [None]:
# Intersection takes the minimum count per key; results <= 0 are dropped.

counter_a & counter_b    # min

Counter({'a': 4, 'b': 3, 'd': 2})

In [None]:
counter_a | counter_b    # max

Counter({'a': 4, 'b': 4, 'c': 2, 'd': 3, 'z': 3})

## namedtuple

In [None]:
from collections import namedtuple

In [None]:
!cat files/documents.txt

426156754	https://habr.com/ru/company/mailru/blog/463045/	1565611920
4086447006	https://habr.com/ru/company/mailru/blog/462769/	1565877120
3093770339	https://habr.com/ru/company/mailru/blog/463063/	1565345880
1095223504	https://habr.com/ru/company/mailru/blog/462811/	1565169840
3229467448	https://habr.com/ru/company/mailru/blog/461927/	1564500300


In [None]:
def parse(s):
    """Turn one tab-separated line ``doc_id<TAB>url<TAB>ts`` into a ``Document``.

    ``doc_id`` and ``ts`` are converted to ``int``; ``url`` is kept as a string.
    Raises ``ValueError`` if the line does not have exactly three fields or a
    numeric field cannot be converted.
    """
    raw_id, url, raw_ts = s.split('\t')
    return Document(int(raw_id), url, int(raw_ts))

In [None]:
# Immutable record type for one parsed line: document id, URL and `ts`
# (presumably a Unix timestamp — confirm against the data source).
Document = namedtuple("Document", ["doc_id", "url", "ts"])

In [None]:
# Parse every line of the tab-separated file into a Document and show it.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
# NOTE: `doc` intentionally stays bound to the last parsed record after the
# loop; the cells that follow inspect it.
with open(DIRECTORY + "files/documents.txt", "r", encoding="utf-8") as docs_file:
    for doc in map(parse, docs_file):
        print(repr(doc))

Document(doc_id=426156754, url='https://habr.com/ru/company/mailru/blog/463045/', ts=1565611920)
Document(doc_id=4086447006, url='https://habr.com/ru/company/mailru/blog/462769/', ts=1565877120)
Document(doc_id=3093770339, url='https://habr.com/ru/company/mailru/blog/463063/', ts=1565345880)
Document(doc_id=1095223504, url='https://habr.com/ru/company/mailru/blog/462811/', ts=1565169840)
Document(doc_id=3229467448, url='https://habr.com/ru/company/mailru/blog/461927/', ts=1564500300)


In [None]:
doc._fields

('doc_id', 'url', 'ts')

In [None]:
doc.doc_id, doc.url, doc.ts

(3229467448, 'https://habr.com/ru/company/mailru/blog/461927/', 1564500300)

In [None]:
doc[0], doc[1], doc[2]

(3229467448, 'https://habr.com/ru/company/mailru/blog/461927/', 1564500300)

In [None]:
doc._asdict()

OrderedDict([('doc_id', 3229467448),
             ('url', 'https://habr.com/ru/company/mailru/blog/461927/'),
             ('ts', 1564500300)])

Атрибуты защищены от изменения.

In [None]:
# namedtuple instances are immutable: assigning to a field raises AttributeError,
# so the bare `doc` on the next line is never reached.
doc.ts = 0
doc

AttributeError: ignored

In [None]:
doc._replace(url='habr.com', ts=0)

Document(doc_id=3229467448, url='habr.com', ts=0)

`namedtuple` поддерживает значения по умолчанию — через параметр `defaults`, доступный начиная с Python 3.7.

In [None]:
# Document was declared without defaults, so all three fields are required and
# calling it with no arguments raises TypeError.
Document()

TypeError: ignored

In [None]:
# NOTE(review): the `defaults` keyword was added to namedtuple in Python 3.7; the
# TypeError recorded for this cell presumably comes from an older interpreter — confirm.
# The result is not bound to a name, so the existing `Document` class is unchanged.
namedtuple("Document", ["doc_id", "url", "ts"], defaults=(-1, None, 0))

TypeError: ignored