In [1]:
# Put the parent directory on the import path so the `src` package imported in
# the next cell can be found (assumes the notebook runs from its own directory
# — TODO confirm).
import sys
sys.path.append("..")

In [2]:
from src.utils import count_tags, count_templates

In [3]:
# Parquet file fed to both counting passes below; the path is relative to the
# notebook's working directory.
data_file = "../data/1691991229/raw.parquet"

## Count templates

In [4]:
# Slow step (about 3 minutes in the recorded run).
# Judging by how the results are used below: `template_count` is a Counter-like
# tally over the whole dataset, and `temp_counts_by_article` maps an article
# title to that article's own Counter-like tally (presumably
# collections.Counter — confirm in src.utils).
template_count, temp_counts_by_article = count_templates(data_file)

100%|█████████████████████████████████████████| 348/348 [03:10<00:00,  1.83it/s]


In [19]:
# Number of articles that have an entry in the per-article mapping.
len(temp_counts_by_article)

44532

In [5]:
# The 25 templates with the highest total usage across the dataset.
template_count.most_common(25)

[('sfn', 683372),
 ('convert', 191941),
 ('cite book', 66515),
 ('lang', 43685),
 ('short description', 38063),
 ('rp', 36382),
 ('good article', 28501),
 ('sfnp', 28227),
 ('efn', 26423),
 ('main', 25637),
 ("'", 25015),
 ('cite web', 20584),
 ('r', 19211),
 ('cvt', 18996),
 ('cite journal', 17478),
 ('nbsp', 17300),
 ('use dmy dates', 15354),
 ('sfnm', 15331),
 ('jct', 14594),
 ('refn', 12763),
 ('certification table entry', 11620),
 ('use mdy dates', 10686),
 ('see also', 10005),
 ('clear', 9868),
 ('cite news', 9565)]

In [6]:
# Number of distinct template names seen.
len(template_count)

13633

In [7]:
# Total number of template usages, counting every repetition.
# NOTE(review): `.total()` is only available on collections.Counter from
# Python 3.10 — assuming that is the type returned by count_templates.
template_count.total()

1996562

In [23]:
# Rank articles by how many *distinct* template names they use: len() of a
# per-article tally counts its keys, not the total number of usages.
article_temp_counts = sorted(
    ((title, len(counts)) for title, counts in temp_counts_by_article.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
article_temp_counts[:100]

[('Long and short scales', 245),
 ('Richard Nixon', 77),
 ('Pomona College', 68),
 ('Jack Kemp', 67),
 ('Abraham Lincoln', 67),
 ('Karl Marx', 66),
 ('Nikita Khrushchev', 66),
 ('Bill Clinton', 65),
 ('Nancy Pelosi', 65),
 ('Warren G. Harding', 65),
 ('William Howard Taft', 64),
 ('James G. Blaine', 62),
 ('Ronald Reagan', 62),
 ('Isaac Newton', 60),
 ('William McKinley', 60),
 ('International System of Units', 59),
 ('Oscar Wilde', 59),
 ('Benjamin Harrison', 59),
 ('Benjamin Disraeli', 59),
 ('Aarhus', 58),
 ('Mermaid', 58),
 ('Margaret Thatcher', 57),
 ('Woodrow Wilson', 55),
 ('TRAPPIST-1', 54),
 ('Beryl May Dent', 54),
 ('Jim Moran', 53),
 ('Azerbaijan', 51),
 ('Kerala', 51),
 ('Homs', 50),
 ('Srinivasa Ramanujan', 49),
 ('Usain Bolt', 49),
 ('Hannah Arendt', 49),
 ('Sirius', 49),
 ('2009 NBA All-Star Game', 48),
 ('Widener Library', 48),
 ('2001: A Space Odyssey', 48),
 ('Jimi Hendrix', 48),
 ('2006 World Cup of Pool', 47),
 ('Star Wars: Episode I – The Phantom Menace', 47),
 ('B

In [13]:
# Top 20 templates of the article that uses the most distinct templates.
# Passing n to most_common() yields the same leading entries as slicing the
# full ranking, without sorting every template first, and matches how the
# dataset-wide top 25 is requested earlier in the notebook.
temp_counts_by_article["Long and short scales"].most_common(20)

[('lang', 258),
 ('#tag:ref', 12),
 ('block indent', 10),
 ('e', 10),
 ('div col', 9),
 ('div col end', 9),
 ('legend', 6),
 ('reflist', 4),
 ('cite web', 3),
 ('citation needed', 3),
 ('bel', 3),
 ('lang-fr', 2),
 ('can', 2),
 ('eri', 2),
 ('flag', 2),
 ('lang-ms', 2),
 ('mus', 2),
 ('syc', 2),
 ('vut', 2),
 ('ipa-my', 2)]

In [18]:
# Find templates used in the most articles.
# Build template -> set of article titles. Each article's templates are still
# visited in most_common() order so that the first-seen order of templates,
# and therefore the order of ties after the stable sort below, is unchanged.
articles_by_temp = {}
for title, counts in temp_counts_by_article.items():
    for name, _ in counts.most_common():
        articles_by_temp.setdefault(name, set()).add(title)

# Widest reach first: order by the number of articles using each template.
articles_by_temp_ls = sorted(
    articles_by_temp.items(),
    key=lambda item: len(item[1]),
    reverse=True,
)
most_popular_temps = [(name, len(titles)) for name, titles in articles_by_temp_ls]
most_popular_temps[:50]

[('short description', 38062),
 ('good article', 28476),
 ('convert', 17574),
 ('use dmy dates', 15350),
 ('use mdy dates', 10673),
 ('sfn', 10278),
 ('main', 9889),
 ("'", 7904),
 ('reflist', 7788),
 ('efn', 6842),
 ('cite book', 6347),
 ('see also', 5374),
 ('use british english', 4963),
 ('clear', 4906),
 ('cite web', 4174),
 ('featured article', 3634),
 ('col-end', 3553),
 ('refend', 3443),
 ('lang', 3428),
 ('refbegin', 3391),
 ('about', 3370),
 ('use american english', 3312),
 ('blockquote', 3223),
 ('col-2', 3189),
 ('col-begin', 3075),
 ('cite journal', 3022),
 ("'s", 2715),
 ('further', 2674),
 ('citation needed', 2408),
 ('for', 2347),
 ('refn', 2304),
 ('inflation', 2208),
 ('cvt', 2204),
 ('div col', 2119),
 ('nbsp', 2116),
 ('div col end', 2088),
 ('circa', 2007),
 ('certification table top', 1975),
 ('as of', 1971),
 ('certification table entry', 1969),
 ('engvarb', 1946),
 ('certification table bottom', 1894),
 ('rp', 1894),
 ('#tag:ref', 1891),
 ('cite news', 1878),
 ('

In [21]:
# Save most popular temps (templates used by the most articles) to disk, one
# "<template>, <article count>" line per template, most widely used first.
# The encoding is pinned to UTF-8 so template names containing non-ASCII
# characters are written the same way on every platform instead of depending
# on the locale's default encoding.
with open("../metrics/temps_used_by_most_articles.txt", "w", encoding="utf-8") as f:
    for name, n_articles in most_popular_temps:
        f.write(f"{name}, {n_articles}\n")

In [8]:
# Save template count to disk, one "<template> <total uses>" line per
# template, most used first. Template names can themselves contain spaces
# (e.g. "cite book"), so a reader has to split each line on its LAST space.
# The encoding is pinned to UTF-8 so template names containing non-ASCII
# characters are written the same way on every platform instead of depending
# on the locale's default encoding.
with open("../metrics/template_counts_raw.txt", "w", encoding="utf-8") as f:
    for name, uses in template_count.most_common():
        f.write(f"{name} {uses}\n")

## Count tags

In [9]:
# Tag counterpart of the template pass above, run over the same file; also slow
# (about 3 minutes in the recorded run).
# NOTE(review): `tag_counts_by_article` is not used by any cell shown here.
tag_count, tag_counts_by_article = count_tags(data_file)

100%|█████████████████████████████████████████| 348/348 [03:12<00:00,  1.81it/s]


In [13]:
# Every tag with its total usage, most used first.
tag_count.most_common()

[('i', 1321390),
 ('li', 485356),
 ('b', 126186),
 ('ref', 26777),
 ('dd', 20148),
 ('br', 15656),
 ('sub', 13007),
 ('small', 12500),
 ('sup', 11351),
 ('math', 11252),
 ('dt', 9080),
 ('blockquote', 5825),
 ('nowiki', 4077),
 ('gallery', 2462),
 ('span', 2378),
 ('div', 1214),
 ('code', 671),
 ('poem', 500),
 ('hr', 474),
 ('u', 366),
 ('timeline', 300),
 ('onlyinclude', 297),
 ('section', 281),
 ('big', 225),
 ('noinclude', 144),
 ('syntaxhighlight', 129),
 ('references', 127),
 ('score', 123),
 ('chem', 121),
 ('td', 92),
 ('cite', 66),
 ('imagemap', 58),
 ('p', 47),
 ('tr', 47),
 ('abbr', 44),
 ('em', 42),
 ('hiero', 38),
 ('ol', 38),
 ('s', 37),
 ('var', 32),
 ('q', 24),
 ('includeonly', 24),
 ('pre', 17),
 ('th', 17),
 ('mapframe', 15),
 ('ul', 12),
 ('dfn', 10),
 ('ce', 6),
 ('kbd', 5),
 ('table', 5),
 ('graph', 3),
 ('mark', 3),
 ('wbr', 2),
 ('strong', 1),
 ('bdi', 1),
 ('ins', 1),
 ('center', 1)]

In [14]:
# Number of distinct tag names seen.
len(tag_count)

57

In [15]:
# Total number of tag usages, counting every repetition.
tag_count.total()

2073105