# Node Merging
This notebook was intended to be a test of merging nodes to reduce graph size.

## Conclusion
Merging won't reduce the size by that much...
Should probably just try to find an efficient algo.

In [12]:
import networkx as nx
import tempfile
import subprocess
import matplotlib.pyplot as plt

# Load country list

In [13]:
with open('source.txt') as file:
    lines = file.readlines()

names = list(map(lambda line: line.split('\t')[1], lines))
print(f"Total number of countries: {len(names)}")

Total number of countries: 195


# Get start/end groups

In [15]:
def getEndSets(words):
    starts = {}
    finishes = {}

    for name in words:
        s = name[0].lower()
        f = name[-1].lower()
        starts[s] = starts.get(s, []) + [name]
        finishes[f] = finishes.get(f, []) + [name]

    return starts, finishes

starts, finishes = getEndSets(names)

# Merge same start/end countries

In [17]:
merges = {}
mergeable = set()
for name in names:
    s = name[0].lower()
    f = name[-1].lower()
    
    if s == f:
        merges[s] = merges.get(s, []) + [name]
        mergeable.add(name)

merged = []
for name in names:
    if name not in mergeable:
        merged.append(name)

for mset in merges.values():
    merged.append(', '.join(mset))
merged = sorted(merged)
print('\t' + '\n\t'.join(merged))

names = merged

	Afghanistan
	Albania, Algeria, Andorra, Angola, Antigua and Barbuda, Argentina, Armenia, Australia, Austria
	Azerbaijan
	Bahamas
	Bahrain
	Bangladesh
	Barbados
	Belarus
	Belgium
	Belize
	Benin
	Bhutan
	Bolivia
	Bosnia and Herzegovina
	Botswana
	Brazil
	Brunei
	Bulgaria
	Burkina Faso
	Burundi
	Cabo Verde
	Cambodia
	Cameroon
	Canada
	Central African Republic
	Chad
	Chile
	China
	Colombia
	Comoros
	Congo
	Costa Rica
	Croatia
	Cuba
	Cyprus
	Czechia
	Côte d'Ivoire
	Democratic Republic of the Congo
	Denmark
	Djibouti
	Dominica
	Dominican Republic
	Ecuador
	Egypt
	El Salvador
	Equatorial Guinea
	Eritrea
	Estonia
	Eswatini
	Ethiopia
	Fiji
	Finland
	France
	Gabon
	Gambia
	Georgia
	Germany
	Ghana
	Greece
	Grenada
	Guatemala
	Guinea
	Guinea-Bissau
	Guyana
	Haiti
	Holy See
	Honduras
	Hungary
	Iceland
	India
	Indonesia
	Iran
	Iraq
	Ireland
	Israel
	Italy
	Jamaica
	Japan
	Jordan
	Kazakhstan
	Kenya
	Kiribati
	Kuwait
	Kyrgyzstan
	Laos
	Latvia
	Lebanon
	Lesotho
	Liberia
	Libya
	Liechtenstein
	Lithuani

In [18]:
print(f"Total number of countries after merger: {len(merged)}")

Total number of countries after merger: 184


# Get sinks and sources

In [20]:
sinks = []
sources = set(names.copy())
edgeCount = 0
for curr in names:
    f = curr[-1].lower()
    children = starts.get(f, [])
    if curr in children:
        children.remove(curr)
    
    if len(children) == 0:
        sinks.append(curr)
    
    for child in children:
        edgeCount += 1
        
        if child in sources:
            sources.remove(child)

sinks = sorted(sinks)
sources = sorted(sources)
print(f"Total sinks: {len(sinks)}")
print('\t' + '\n\t'.join(sinks))
print(f"\nTotal sources: {len(sources)}")
print('\t' + '\n\t'.join(sources))

Total sinks: 0
	

Total sources: 40
	Albania, Algeria, Andorra, Angola, Antigua and Barbuda, Argentina, Armenia, Australia, Austria
	Bahamas
	Bahrain
	Bangladesh
	Barbados
	Belarus
	Belgium
	Belize
	Benin
	Bhutan
	Bolivia
	Bosnia and Herzegovina
	Botswana
	Brazil
	Brunei
	Bulgaria
	Burkina Faso
	Burundi
	Central African Republic
	Fiji
	Finland
	France
	Jamaica
	Japan
	Jordan
	Pakistan
	Palau
	Palestine State
	Panama
	Papua New Guinea
	Paraguay
	Peru
	Philippines
	Poland
	Portugal
	Vanuatu
	Venezuela
	Vietnam
	Zambia
	Zimbabwe


# Find nodes with single child
These can be merged because there will never be an alterate path.

In [44]:
supermerged = []
removedNodes = set()
for key, val in starts.items():
#     print(key, len(val))
    if len(val) == 1:
        
        if len(finishes[key]) == 1:
            removedNodes.add(val[0])
            removedNodes.add(finishes[key][0])
            # newval = val[0] + ', ' + finishes[key][0]
            newval = finishes[key][0] + ', ' + val[0]
            supermerged.append(newval)
#             print(newval)

for name in names:
    if name not in removedNodes:
        supermerged.append(name)
supermerged = sorted(supermerged)

print(f"\nTotal supermerged: {len(supermerged)}")


Total supermerged: 184


In [67]:
names = supermerged
starts, finshes = getEndSets(names)

In [86]:
# def dfs(start):
#     visited = set()
#     stack = [start]
#     path = []
#     while stack:
#         curr = stack.pop()
#         if curr not in visited:
#             visited.add(curr)
#             path.append(curr)
#             f = curr[-1].lower()
#             children = starts[f]
# #             print(f"Children ({curr}): {children}")
#             for child in children:
#                 if child not in visited:
#                     stack.insert(0, child)
#                     break
#     print(f"Built path ({len(path)}):")
#     print('\t' + '\n\t'.join(path))

def dfs(curr, depths, visited):
    visited.add(curr)
    
    f = curr[-1].lower()
    children = starts[f]
    for child in children:
        if child not in visited:
            dfs(child, depths, visited)
        
        depths[curr] = max(depths[curr], 1 + depths[child])




In [87]:
depths = {name: 0 for name in names}
paths = {name: [] for name in names}
visited = set()

for name in names:
    if name not in visited:
        dfs(name, depths, visited)

In [88]:
orderedNames = sorted(names, key=lambda n: depths[n])
for name in orderedNames:
    print(depths[name], name)

19 Cambodia
19 Canada
19 China
19 Colombia
19 Costa Rica
19 Croatia
19 Cuba
19 Czechia
19 Dominica
19 Equatorial Guinea
19 Eritrea
19 Estonia
19 Ethiopia
19 Gambia
19 Georgia
19 Ghana
19 Grenada
19 Guatemala
19 Guinea
19 Guyana
19 India
19 Indonesia
19 Kenya
19 Latvia
19 Liberia
19 Libya
19 Lithuania
19 Malaysia
19 Malta
19 Mauritania
19 Micronesia
19 Moldova
19 Mongolia
19 Nicaragua
19 Nigeria
19 North Korea
19 North Macedonia
19 Romania
19 Russia
19 Rwanda
19 Saint Lucia
19 Samoa
19 Saudi Arabia
19 Serbia
19 Slovakia
19 Slovenia
19 Somalia
19 South Africa
19 South Korea
19 Sri Lanka
19 Syria
19 Tanzania
19 Tonga
19 Tunisia
19 Uganda
19 United States of America
20 Ecuador
20 El Salvador
20 Madagascar
20 Myanmar
20 Niger
20 Qatar
21 Côte d'Ivoire
21 Cabo Verde
21 Chile
21 Greece
21 Iraq
21 Mozambique
21 Sierra Leone
21 Singapore
21 Suriname
21 Timor-Leste
22 Congo
22 Democratic Republic of the Congo
22 Lesotho
22 Mexico
22 Monaco
22 Montenegro
22 Morocco
22 Togo
22 Trinidad and Tobago


In [31]:
# def findLongest(curr, longest, path=[]):
#     path.append(curr)
#     f = curr[-1].lower()
#     children = starts.get(f, [])
#     for child in children:
#         if child not in path:
#             newpath = path + [child]
#             print(f"Made newpath: {newpath}")
#             if len(newpath) > len(longest):
#                 longest = newpath
#                 print(f"Updating longest ({len(longest)}): {longest}")
#             findLongest(child, path=newpath, longest=longest)

# L = []
# findLongest(names[0], L)

def findLongest(curr, seen=None, path=None):
    if seen is None:
        seen = set()
    if path is None:
        path = [curr]
    
    seen.add(curr)
    
    f = curr[-1].lower()
    children = starts.get(f, [])
    
    paths = []
    for child in children:
        if child not in seen:
            newpath = path + [child]
            paths.append(tuple(newpath))
            paths.extend(findLongest(child, seen=seen.copy(), path=newpath))

        print(f"Current paths len: {len(paths)}")
    return paths

In [None]:
findLongest('Afghanistan')

Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths len: 0
Current paths

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [10]:
def topo(namesSet):
    topoSorted = []
    while namesSet:
        start = len(namesSet)
        starts, finishes = getEndSets(namesSet)
        nextSet = []
        for name in namesSet:
            s = name[0].lower()
            parents = finishes.get(s, [])
#             print(len(parents), name)
            if len(parents) == 0:
#                 print(f"Found orphan: {name}")
                topoSorted.append(name)
            else:
                nextSet.append(name)
#         print(len(namesSet), len(nextSet))
        if len(namesSet) == len(nextSet):
            topoSorted += namesSet
            break
        namesSet = nextSet
    
    return topoSorted

In [11]:
topo(names)

['Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Fiji',
 'Finland',
 'France',
 'Jamaica',
 'Japan',
 'Jordan',
 'Pakistan',
 'Palau',
 'Palestine State',
 'Panama',
 'Papua New Guinea',
 'Paraguay',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',
 'Vanuatu',
 'Venezuela',
 'Vietnam',
 'Zambia',
 'Zimbabwe',
 'Haiti',
 'Holy See',
 'Honduras',
 'Hungary',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 "Côte d'Ivoire",
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican R