In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [None]:
# Write a TreeNode class
class TreeNode:
    """A node of a language-family tree.

    Attributes:
        uri: Identifier of the node (the notebook stores the SPARQL ``?node``
            binding value here).
        label: Human-readable name of the node (the ``?nodelbl`` binding value).
        child: List of child ``TreeNode`` objects; an empty list for a leaf.
    """

    def __init__(self, uri, label, child=None):
        """Create a node.

        Args:
            uri: Identifier of the node.
            label: Display name of the node.
            child: Optional list of child nodes. When omitted (or ``None``) a
                new empty list is created for this instance.
        """
        self.uri = uri
        self.label = label
        # Build a fresh list per instance: a shared mutable default would make
        # every node created without explicit children share one list.
        self.child = [] if child is None else child

    def __repr__(self):
        """Return a short debugging representation (does not recurse into children)."""
        return (
            f"TreeNode(uri={self.uri!r}, label={self.label!r}, "
            f"children={len(self.child)})"
        )

In [None]:
## Prerequisites for running this notebook:
## Download the Glottolog dump: https://files.dice-research.org/projects/LOLA/misc/glottolog_language.ttl.gz (backup of https://glottolog.org/meta/downloads, version 5.0)
## Load it into a Virtuoso instance under the following graph name: http://upb.de/dice/lola/language
## To start the Virtuoso server (edit DirsAllowed and the ports in the ini file beforehand): ./virtuoso-t +configfile ../database/virtuoso.ini +f
## To load the file, use Virtuoso's bulk-loading commands: ld_dir(), rdf_loader_run(), and then checkpoint;

# Shared SPARQL client used by every query cell in this notebook.
# Alternative endpoint that was used here: http://porque.cs.upb.de:18890/sparql
sparql_obj = SPARQLWrapper(
    endpoint="http://lola.cs.upb.de:18890/sparql",
    defaultGraph="http://upb.de/dice/lola/language",
)

In [None]:
# Find all top nodes through SPARQL
# Selects every node typed as a GOLD LanguageFamily together with its label.
# The label predicate is written as a full IRI (like the other IRIs in this
# notebook) so the query does not rely on the endpoint predefining `rdfs:`.
top_node_sparql = '''
SELECT DISTINCT ?node ?nodelbl WHERE {
  ?node a <http://purl.org/linguistics/gold/LanguageFamily>.
  ?node <http://www.w3.org/2000/01/rdf-schema#label> ?nodelbl.
}
'''

In [None]:
# Ask the endpoint for every top-level language family and wrap each result
# row in a TreeNode that has no children yet.
lang_trees = []

sparql_obj.setQuery(top_node_sparql)
sparql_obj.setReturnFormat(JSON)
results = sparql_obj.query().convert()
family_rows = results["results"]["bindings"]
print('Results fetched: ', len(family_rows))
lang_trees.extend(
    TreeNode(row["node"]["value"], row["nodelbl"]["value"], [])
    for row in family_rows
)

In [None]:
# Starting at each language family, perform a depth first search and form the tree
# Template for fetching the direct skos:narrower children of one node; the
# single %s placeholder is filled with the parent node's URI.
# The label predicate is written as a full IRI (like skos:narrower above) so
# the query does not rely on the endpoint predefining the `rdfs:` prefix.
child_node_sparql = '''
SELECT DISTINCT ?node ?nodelbl WHERE {<%s> <http://www.w3.org/2004/02/skos/core#narrower> ?node . ?node <http://www.w3.org/2000/01/rdf-schema#label> ?nodelbl.}
'''

In [None]:
# recursive function to find child nodes
def find_child_nodes(tree_node, _ancestors=None):
    """Recursively attach all descendants of ``tree_node`` (depth-first).

    Runs the module-level ``child_node_sparql`` query for ``tree_node.uri``
    through ``sparql_obj``, wraps every returned row in a ``TreeNode``, appends
    it to ``tree_node.child`` and then expands that child the same way. One
    query is issued per visited node.

    Loop detection: a returned node whose URI already lies on the path from the
    root to ``tree_node`` is skipped, so cyclic data cannot cause unbounded
    recursion. Nodes reachable through several different parents are still
    expanded under each parent, exactly as before.

    Args:
        tree_node: Node to expand; its ``child`` list is mutated in place.
        _ancestors: Internal. Frozenset of URIs on the path from the root down
            to the parent of ``tree_node``. Callers should leave it unset.

    Returns:
        None. The result is the populated ``tree_node.child`` hierarchy.
    """
    # URIs on the current root-to-node path, including this node itself.
    path_uris = (_ancestors or frozenset()) | {tree_node.uri}

    sparql_obj.setQuery(child_node_sparql % tree_node.uri)
    sparql_obj.setReturnFormat(JSON)
    results = sparql_obj.query().convert()

    for result in results["results"]["bindings"]:
        uri = result["node"]["value"]
        if uri in path_uris:
            # Back-edge to an ancestor (or to the node itself): following it
            # would recurse endlessly, so do not attach or expand it.
            continue
        label = result["nodelbl"]["value"]
        child_node = TreeNode(uri, label, [])
        tree_node.child.append(child_node)
        # Find the children of the child node
        find_child_nodes(child_node, path_uris)

In [None]:
# Expand every top-level family into a full tree; the nodes already stored in
# lang_trees are filled in place.
# NOTE: whether this terminates on cyclic data depends entirely on find_child_nodes.
for family_root in lang_trees:
    find_child_nodes(family_root)

In [None]:
# Build one sample path through one of the trees by always descending into the
# first child until a leaf is reached. The result is printed in the next cell.
# NOTE(review): 34 looks like an arbitrary sample index; indexing raises
# IndexError if fewer than 35 families were fetched.
tree = lang_trees[34]
path_labels = [tree.label]
while tree.child:
    tree = tree.child[0]
    path_labels.append(tree.label)
path = ' -> '.join(path_labels)


In [None]:
# Show the ' -> '-separated sample path assembled in the previous cell.
print(path)

In [None]:
# languages = ['English', 'Cebuano', 'German', 'Swedish', 'French', 'Dutch', 'Russian', 'Spanish', 'Italian', 'Egyptian Arabic', 'Polish', 'Japanese', 'Mandarin', 'Vietnamese', 'Waray', 'Arabic', 'Ukrainian', 'Portuguese', 'Persian', 'Catalan', 'Serbian', 'Indonesian', 'Korean', 'Bokmål', 'Finnish', 'Hungarian', 'Czech', 'Turkish', 'Chechen', 'Serbo-Croatian', 'Romanian', 'Southern Min', 'Tatar', 'Basque', 'Malay', 'Esperanto', 'Hebrew', 'Armenian', 'Bulgarian', 'Danish', 'South Azerbaijani', 'Slovak', 'Kazakh', 'Estonian', 'Minangkabau', 'Belarusian', 'Simple English', 'Croatian', 'Greek', 'Lithuanian', 'Galician', 'Azerbaijani', 'Slovene', 'Urdu', 'Nynorsk', 'Georgian', 'Hindi', 'Uzbek', 'Thai', 'Tamil', 'Latin', 'Welsh', 'Asturian', 'Macedonian', 'Cantonese', 'Bengali', 'Volapük', 'Latvian', 'Tajik', 'Afrikaans', 'Burmese']

In [None]:
# languages = [
#     "Bengali",
#     "German",
#     "Estonian",
#     "Finnish",
#     "French",
#     "Modern Hebrew",
#     "Hindi",
#     "Standard Indonesian",
#     "Italian",
#     "Japanese",
#     "Korean",
#     "Lithuanian",
#     "Dutch",
#     "Romanian",
#     "Russian",
#     "Thai",
#     "Ukrainian",
#     "Bulgarian",
#     "Irish",
#     "Malayalam",
#     "Macedonian",
#     "Norwegian Bokmål",
#     "Nepali",
#     "Sinhala",
#     "Telugu",
#     "Xhosa"
# ]

In [None]:
# Language names to locate in the family trees. The lookup cell below compares
# each name case-insensitively (via casefold) against node labels; names with
# no exactly matching label are reported as needing a manual search.
languages = [
    "English", "Russian", "Spanish", "German", "French", "Chinese", "Italian", "Portuguese", 
    "Polish", "Dutch", "Japanese", "Vietnamese", "Turkish", "Arabic", "Czech", "Persian", 
    "Greek", "Swedish", "Ukrainian", "Hungarian", "Romanian", "Finnish", "Danish", 
    "Bulgarian", "Indonesian", "Thai", "Korean", "Hindi", "Norwegian", "Slovak", 
    "Catalan", "Lithuanian", "Bangla", "Estonian", "Slovenian", "Latvian", "Albanian", 
    "Azerbaijani", "Tamil", "Hebrew", "Serbian", "Nepali", "Georgian", "Armenian", 
    "Macedonian", "Urdu", "Kazakh", "Malayalam", "Icelandic", "Marathi", "Mongolian", 
    "Telugu", "Galician", "Belarusian", "Basque", "Kannada", "Gujarati", "Khmer", 
    "Burmese", "Afrikaans", "Sinhala", "Punjabi", "Kyrgyz", "Welsh", "Tajik", 
    "Croatian", "Esperanto", "Pashto", "Filipino", "Irish", "Kurdish", "Cebuano", 
    "Amharic", "Malay", "Western Frisian", "Tatar", "Lao", "Central Kurdish", 
    "Luxembourgish", "Odia", "Maltese", "Yiddish", "Norwegian Nynorsk", "Malagasy", 
    "Sindhi", "Uzbek", "Bashkir", "Egyptian Arabic", "Divehi", "Swahili", "Tibetan", 
    "Assamese", "Latin", "Uyghur", "Serbian (Latin)", "Breton", "South Azerbaijani", 
    "Waray", "Chuvash", "Sakha", "Chechen", "Sanskrit", "Western Panjabi", 
    "Low German", "Turkmen", "Occitan", "Mingrelian", "Asturian", "Ossetic", 
    "Scottish Gaelic", "Eastern Mari", "Piedmontese", "Swiss German", "Volapük", 
    "Bishnupriya", "Newari", "Upper Sorbian", "Lombard", "Aragonese", "Iloko", 
    "Javanese", "Mazanderani", "Lezghian", "Karachay-Balkar", "Sundanese", 
    "Minangkabau", "Komi", "Walloon", "Lojban", "Bosnian", "Quechua", "Ido", 
    "Western Mari", "Goan Konkani", "Interlingua", "Avaric", "Bihari languages", 
    "Wu Chinese", "Limburgish", "Yoruba", "Nahuatl languages", "Venetian", 
    "Guarani", "Russia Buriat", "Cornish", "Maithili", "Emiliano-Romagnol", 
    "Lower Sorbian", "Kalmyk", "Northern Luri", "Somali", "Neapolitan", 
    "Romansh", "Tuvinian", "Sicilian", "Haitian Creole", "Northern Frisian", 
    "Mirandese", "Erzya", "Interlingue", "Pampanga", "Bavarian", "Yue Chinese", 
    "Chavacano", "Central Bikol", "West Flemish", "Rusyn"
]



In [None]:
# Maps each language name from `languages` to the ', '-separated path of node
# labels leading to it; filled by the lookup loop at the end of this cell.
path_map = {}

# depth-first search for a language label below a tree node
def find_language_path(treenode, path, lang):
    """Return the label path to the first descendant of ``treenode`` named ``lang``.

    Descendants are visited in depth-first pre-order (a child is checked before
    its own children, and before its later siblings). Any node can match, not
    only leaves. ``treenode`` itself is never compared against ``lang``.

    Args:
        treenode: Node whose descendants are searched; nodes need ``label`` and
            ``child`` attributes.
        path: Path text accumulated so far for ``treenode``; each step down
            appends ``', ' + label``.
        lang: Language name to look for. Compared case-insensitively: it is
            casefolded here, so callers may pass it in any case (passing an
            already casefolded value still works).

    Returns:
        The extended path string ending in the matching node's label, or
        ``None`` when no descendant label matches.
    """
    target = lang.casefold()
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped in their original order, preserving pre-order traversal.
    pending = [(child, path) for child in reversed(treenode.child)]
    while pending:
        node, parent_path = pending.pop()
        node_path = parent_path + ', ' + node.label
        if node.label.casefold() == target:
            return node_path
        pending.extend((child, node_path) for child in reversed(node.child))
    return None
# Look up every requested language in the family trees; the first tree that
# contains a matching label supplies the path stored in path_map.
for lang in languages:
    lang_key = lang.casefold()  # same for every tree, so compute it once
    lang_path = None
    for tree in lang_trees:
        lang_path = find_language_path(tree, tree.label, lang_key)
        if lang_path:
            path_map[lang] = lang_path
            print(lang_path)
            break
    if not lang_path:
        # Name the language so unmatched entries can be identified in the output.
        print('>> Needs manual searching:', lang)