In [1]:
import pandas as pd

# Column labels supplied via `names=` for the CSV. The last three columns are
# never referenced later in this notebook, hence their throwaway labels
# (the first two look like latitude/longitude -- TODO confirm against the data source).
country_headers = [
    'name', 'formal_name', 'code2', 'code3',
    'who', 'cares', 'blah',
]
# keep_default_na=False stops pandas from turning strings such as "NA" into NaN,
# so every cell stays a plain string/number exactly as written in the file.
cf = pd.read_csv('countries.csv', names=country_headers, keep_default_na=False)
names = cf['name'].tolist()

names[:5]

['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola']

In [2]:
from Trie import Trie
                
# Small worked example: insert a handful of names, then list each one's
# shortest unique prefix as (prefix, full_name) pairs.
T = Trie()
for word in ("Australia", "Austria", "Aukland", "Albania", "Canada"):
    T.add_string(word)
list(T.shortest_unique_prefixes())

[('Austra', 'Australia'),
 ('Austri', 'Austria'),
 ('Auk', 'Aukland'),
 ('Al', 'Albania'),
 ('C', 'Canada')]

In [3]:
# Fresh trie holding every country name loaded from the CSV.
T = Trie()
for name in names:
    T.add_string(name)

# Materialise the (prefix, name) pairs so they can be sliced and reused below.
pres = [pair for pair in T.shortest_unique_prefixes()]
pres[:5]

[('Af', 'Afghanistan'),
 ('Alb', 'Albania'),
 ('Alg', 'Algeria'),
 ('And', 'Andorra'),
 ('Ang', 'Angola')]

In [4]:
# A prefix of None means "no unique prefix"; it is encoded as a prefix length
# of -1 purely to simplify later processing.
x = [
    (prefix, country, len(prefix) if prefix is not None else -1)
    for prefix, country in pres
]
predf = pd.DataFrame(x, columns=['pre', 'name', 'prelen'])
predf.head()

Unnamed: 0,pre,name,prelen
0,Af,Afghanistan,2
1,Alb,Albania,3
2,Alg,Algeria,3
3,And,Andorra,3
4,Ang,Angola,3


In [5]:
# Attach the remaining country columns (formal name, codes, ...) to each
# prefix row by matching on the shared 'name' column.
df = predf.merge(cf, on='name')
df.head()

Unnamed: 0,pre,name,prelen,formal_name,code2,code3,who,cares,blah
0,Af,Afghanistan,2,Islamic Republic of Afghanistan,AF,AFG,33.982993,66.391594,6
1,Alb,Albania,3,Republic of Albania,AL,ALB,41.000174,19.8717,7
2,Alg,Algeria,3,People's Democratic Republic of Algeria,DZ,DZA,27.898617,3.197712,5
3,And,Andorra,3,Principality of Andorra,AD,AND,42.540571,1.552013,11
4,Ang,Angola,3,Republic of Angola,AO,AGO,-12.164697,16.709336,6


In [6]:
# Number of countries per shortest-unique-prefix length (-1 encodes "no unique prefix").
predf.groupby('prelen').size()

prelen
-1      3
 1      3
 2     56
 3     80
 4     27
 5      4
 6      3
 7      8
 8      4
 9      1
 12     2
 13     2
dtype: int64

In [8]:
import html
import os
from pathlib import Path
"""So maybe I'll just output an html document here rather than going through matplotlib, since
this is basically all text and images - no real charting involved.
"""

# Explanatory blurb; render_doc inserts it only into the non-spoiler document.
PREAMBLE = (
    '<section id="preamble">\n'
    '<em>Example:</em> <b>Belgium</b> is the only country whose name begins with '
    '<b><code>BELG</code></b>. Any shorter prefix, like <code>BEL</code> would be '
    "ambiguous with other countries (Belarus, Belize), so Belgium's shortest unique "
    'prefix length is <b>4</b>.\n'
    '</section>\n'
)

# Footer with the data source and signature; appended to every document.
POSTAMBLE = '\n'.join([
    '<div id="postamble">',
    '<span id="data">English names of UN member states via un.org/en/about-us/member-states</span>',
    '<span id="sig">@halfeatenscone</span>',
    '</div>',
])

def render_doc(df, spoiler=False):
    """Build the complete infographic page and return it as one HTML string.

    df: frame handed unchanged to render_section for every section.
    spoiler: when False, the page links ``nospoil.css`` and includes PREAMBLE;
        when True, it links ``spoil.css``, omits PREAMBLE and asks
        render_section for abbreviated names.

    Sections are emitted for prefix lengths 1 through 8, then a combined
    "9+" section, then the "no unique prefix" (-1) section, followed by
    POSTAMBLE and the closing tags.
    """
    stylesheet_prefix = '' if spoiler else 'no'
    pieces = ["""<html>
    <head>
      <link rel="stylesheet" href="./common.css">
      <link rel="stylesheet" href="./{}spoil.css">
    </head>
    <body>
    <h1>Countries by shortest unique prefix</h1>
    """.format(stylesheet_prefix)]
    if not spoiler:
        pieces.append(PREAMBLE)
    pieces.extend(
        render_section(df, presize, abbrev=spoiler) for presize in range(1, 9)
    )
    pieces.append(render_section(df, 9, plus=True, abbrev=spoiler))
    pieces.append(render_section(df, -1, abbrev=spoiler))
    pieces.append(POSTAMBLE)
    pieces.append("</body></html>")
    return ''.join(pieces)

def render_section(df, prelen, plus=False, abbrev=False):
    """Return the HTML ``<section>`` listing the countries for one prefix length.

        df: frame with a ``prelen`` column; matching rows are passed to render_row.
        prelen: prefix length to select; -1 selects the "no unique prefix" rows
            and is shown as "N/A" (class/id ``lenna``).
        plus: if True, select every row whose prelen is >= ``prelen`` and append
            a '+' to the heading.
        abbrev: forwarded to render_row; if True, names are shown without their
            parenthesised part (e.g. "Venezuela (Bolivarian Republic of)" ->
            "Venezuela") to free up room in the 'spoiler' version.
    """
    is_na = prelen == -1
    classname = "len{}".format('na' if is_na else prelen)
    heading = '<h2>{}{}</h2>\n'.format("N/A" if is_na else prelen, '+' if plus else '')

    mask = (df.prelen >= prelen) if plus else (df.prelen == prelen)

    parts = ['<section class={}>\n'.format(classname), heading, '<ul id="{}">\n'.format(classname)]
    parts.extend(render_row(row, abbrev) for _, row in df[mask].iterrows())
    parts.append('</ul>\n')
    parts.append('</section>\n')
    return ''.join(parts)

# Flag image switch: a truthy value selects the SVG flag directory,
# a falsy one selects the PNG-32 directory.
SVG = 1
img_pre = 'countries/data/flags/SVG/' if SVG else 'countries/data/flags/PNG-32/'
def render_country_data(pre, suff, code):
    """Render one country as an ``<li>``: its flag image followed by its name.

    pre: the part of the name shown in the ``pre`` span (the unique prefix).
    suff: the rest of the name, shown in the ``suff`` span.
    code: string used to build the flag file name (``<code>.svg`` or
        ``<code>-32.png``, depending on the module-level ``SVG`` switch);
        the file is looked up under the module-level ``img_pre`` directory.

    Returns the ``<li>...</li>`` markup terminated by a newline.
    """
    img_fname = code + ('.svg' if SVG else '-32.png')
    # Path.as_uri() yields a well-formed, percent-encoded file: URI. Plain
    # 'file://' + path concatenation is malformed on Windows drive paths and
    # leaves spaces/special characters unencoded.
    img_uri = Path(os.path.abspath(img_pre + img_fname)).as_uri()
    img = '<img class="flag" src="{}" />'.format(img_uri)
    # Escape &, < and > so an unusual name cannot break the markup. quote=False
    # leaves apostrophes untouched, so ordinary names render byte-for-byte as before.
    text = '''<span class="name">
    <span class="pre">{}</span><span class="suff">{}</span>
    </span>
    '''.format(html.escape(str(pre), quote=False), html.escape(str(suff), quote=False))
    return '<li>{}{}</li>\n'.format(img, text)

def render_row(row, abbrev=False):
    """Render one data row as a list item via render_country_data.

    row: a row exposing ``name``, ``pre``, ``prelen`` and ``code2`` (as produced
        by ``DataFrame.iterrows()`` in render_section).
    abbrev: if True, show the name without its parenthesised part, e.g.
        "Venezuela (Bolivarian Republic of)" -> "Venezuela".

    A prelen of -1 ("no unique prefix") renders an empty prefix with the whole
    name as the suffix; otherwise the name is split at ``prelen``.
    """
    # Item access on purpose: on a pandas Series, ``row.name`` is the row label,
    # not the 'name' column.
    name = row['name']
    if abbrev:
        # Cut at the first '(' and strip whatever whitespace precedes it. This
        # does not assume exactly one space before the parenthesis, so names
        # like "X(Y)" or "X  (Y)" are trimmed correctly as well.
        short = name.split('(', 1)[0].rstrip()
        if short:
            # Keep the original when trimming would leave nothing (name starts with '(').
            name = short
    if row.prelen == -1:
        return render_country_data('', name, row.code2)
    return render_country_data(row.pre, name[row.prelen:], row.code2)

# Generate two html docs from the same data. Both link common.css; the default one also links
# nospoil.css and includes the preamble, while the spoiler=True one links spoil.css, omits the
# preamble and uses abbreviated country names.
doca = render_doc(df)
with open('pres.html', 'w') as f:
    f.write(doca)
docb = render_doc(df, spoiler=True)
with open('spoilers.html', 'w') as f:
    f.write(docb)