In [1]:
# To read the .csv files
import pandas as pd 
# To store the direct inheritance relations as a graph
import networkx as nx

# Data analysis notebook

This notebook provides a few simple methods to rapidly analyse EtymDB's database.
(It is very likely that there are more efficient ways to process its information, feel free to try new things.)

## Utils

In [2]:
def add_val_to_ctr_dict(cur_dict, key):
    """Increment the counter stored under `key` in `cur_dict`.

    A key that is not in the dictionary yet is inserted with a count of 1.
    The dictionary is modified in place; nothing is returned.
    """
    cur_dict[key] = cur_dict.get(key, 0) + 1

# I. Data extraction
This step is quite slow: building the relation trees can take up to half an hour per relation tree. It helps the navigation later on, though.

In [3]:
# Locations of the two tab-separated, header-less EtymDB exports
path_values = "../data/split_etymdb/etymdb_values.csv"
path_link = "../data/split_etymdb/etymdb_links_info.csv"

# Lexeme table: one row per lexeme, indexed by the lexeme id.
# NOTE(review): with read_csv's default NA handling, cells spelled like "nan",
# "NA" or "null" are read as missing values - confirm this is intended for
# language codes and lexemes.
df_values = pd.read_csv(
    path_values,
    sep='\t',
    names=["id", "lang", "field", "lexeme", "meaning"],
    dtype={"id": int, "lang": str, "field": int, "meaning": str},
).set_index("id")

# Link table: one row per (relation type, child id, parent id) triple
df_link = pd.read_csv(
    path_link,
    sep='\t',
    names=["relation_type", "child", "parent"],
    dtype={"relation_type": str, "child": int, "parent": int},
)

# One sub-table per relation type: inheritance, borrowing and cognacy
df_inher = df_link[df_link["relation_type"] == "inh"]
df_bor = df_link[df_link["relation_type"] == "bor"]
df_cog = df_link[df_link["relation_type"] == "cog"]

In [4]:
def get_graph(df: pd.DataFrame, df_values: pd.DataFrame, oriented: bool = True):
    """Build a graph of the links between the different database lexemes.

    Each lexeme becomes a node named "<lexeme>_<lang>" carrying the attributes
    `lang`, `lexeme`, `meaning` and `id`; each link becomes an edge going from
    the parent node to the child node.

    df: inheritance/borrowing/cognacy links dataframe, with "child" and
        "parent" columns holding lexeme ids
    df_values: dataframe containing all the lexemes, indexed by their ids
    oriented: True to build a directed graph (nx.DiGraph), False to build an
        undirected one (nx.Graph)

    Returns the populated graph. Links whose child or parent id is negative
    are skipped. Lexemes sharing the same lexeme and language end up in the
    same node, whose attributes are those of the last such lexeme encountered.
    """
    tree = nx.DiGraph() if oriented else nx.Graph()

    # Row lookups in df_values dominate the running time, so each lexeme id is
    # resolved at most once and its (name, attributes) description is reused.
    described = {}

    def describe(node_ix):
        """Return the (node name, node attributes) pair of a lexeme id."""
        if node_ix not in described:
            entry = df_values.loc[node_ix]
            lexeme, lang = str(entry.lexeme), str(entry.lang)
            described[node_ix] = (
                lexeme + "_" + lang,
                {"lang": lang, "lexeme": lexeme, "meaning": str(entry.meaning), "id": int(node_ix)},
            )
        return described[node_ix]

    # Only the two id columns are needed, so iterate over them directly
    # instead of materialising a Series per row with `iterrows`.
    for child_ix, parent_ix in zip(df["child"], df["parent"]):
        if not (child_ix >= 0 and parent_ix >= 0):  # We ignore compositional relationships in this demo
            continue
        child_name, child_attrs = describe(child_ix)
        parent_name, parent_attrs = describe(parent_ix)
        # Nodes are (re-)added for every link, child first, so that nodes
        # shared by several lexeme ids keep the attributes of the last one seen.
        tree.add_node(child_name, **child_attrs)
        tree.add_node(parent_name, **parent_attrs)
        tree.add_edge(parent_name, child_name)

    return tree

In [5]:
# Directed graph of the inheritance links
inher_graph = get_graph(df=df_inher, df_values=df_values, oriented=True)

In [6]:
# Size of the inheritance graph: node and edge counts
print(
    f"This database contains {inher_graph.number_of_nodes()} words "
    f"linked by {inher_graph.number_of_edges()} inheritance relations"
)

This database contains 418329 words linked by 320524 inheritance relations


In [7]:
# Undirected graph of the cognacy links
cog_graph = get_graph(df=df_cog, df_values=df_values, oriented=False)

In [8]:
# Size of the cognacy graph: node and edge counts
print(
    f"This database contains {cog_graph.number_of_nodes()} words "
    f"linked by {cog_graph.number_of_edges()} cognacy relations"
)

This database contains 155084 words linked by 144072 cognacy relations


In [35]:
# Directed graph of the borrowing links
bor_graph = get_graph(df=df_bor, df_values=df_values, oriented=True)

In [36]:
# Size of the borrowing graph: node and edge counts
print(
    f"This database contains {bor_graph.number_of_nodes()} words "
    f"linked by {bor_graph.number_of_edges()} borrowing relations"
)

This database contains 153340 words linked by 95920 borrowing relations


# II. General data analysis

## II. 1 Data coverage

### Total number of languages

In [11]:
# Number of lexemes per language code
langs = {}

# Iterating over the "lang" column alone visits the same values in the same
# order as `iterrows`, without building a Series for every row of the table.
for lang_code in df_values["lang"]:
    add_val_to_ctr_dict(langs, lang_code)

print(f"Total number of languages in the base {len(langs)}")
# The ten languages with the most lexemes, in decreasing order of count
print("Language with the most lexical items",
      {key: langs[key] for key in sorted(langs, key=langs.get, reverse=True)[:10]})

Total number of languages in the base 2536
Language with the most lexical items {'en': 911086, 'la': 69224, 'fr': 34488, 'it': 31295, 'de': 27009, 'grc': 25874, 'fi': 24682, 'ru': 24187, 'es': 22213, 'nl': 21863}


### Languages with more than 100 lexemes

In [12]:
# Languages with more than 100 lexemes, in decreasing order of lexeme count.
# Non-string keys and codes containing "-pro" (rough filter for proto-languages)
# are left out.
more_than_100 = {
    code: count
    for code, count in sorted(langs.items(), key=lambda item: item[1], reverse=True)
    if isinstance(code, str) and "-pro" not in code and count > 100
}
#print(f"Get languages with more than a 100 lexical units, which are not proto languages (rough approx): {more_than_100}")

### Words with gloss

In [13]:
# Every row of the table is one word
total_ctr = len(df_values)

# A gloss is considered missing when the cell renders as "" or "nan" once
# converted to a string. Only the "meaning" column is needed, so iterate over
# it directly rather than over whole rows with `iterrows`.
word_has_gloss = 0
for meaning in df_values["meaning"]:
    if str(meaning) not in ["", "nan"]:
        word_has_gloss += 1

# Guard the ratio so that an empty table does not raise ZeroDivisionError
gloss_ratio = word_has_gloss / total_ctr if total_ctr else 0.0
print(f"Total number of words with gloss {word_has_gloss} - Percentage of glossed words {gloss_ratio}")

Total number of words with gloss 1129032, - Percentage of glossed words 0.5989222887201038


### Percentage of words from a given language

In [14]:
lang = "en" # For example, English

# Count the lexemes whose language code matches `lang`. Only the "lang" column
# is needed, so iterate over it directly rather than over whole rows.
word_is_lang = 0
for lang_code in df_values["lang"]:
    if str(lang_code) == lang:
        word_is_lang += 1

# Guard the ratio so that an empty table does not raise ZeroDivisionError
lang_ratio = word_is_lang / len(df_values) if len(df_values) else 0.0
print(f"Percentage of words from {lang}: {lang_ratio}. Total number: {word_is_lang}")

Percentage of words from en: 0.48330756997219254. Total number: 911086


## II.2 Relations between languages

### Languages with the absolute most relations

In [38]:
def get_linked_languages(key, link_dict, tree):
    """Count, per language, how many links go to each neighbouring language.

    key: name of the relation being counted; the counts are stored under
        link_dict[key]. For a directed tree, a key containing
        "children => parent" counts the predecessors of each node, any other
        key counts its successors. For an undirected tree all neighbours are
        counted.
    link_dict: dictionary of results, updated in place. link_dict[key] is
        created when missing.
    tree: networkx graph whose nodes carry a "lang" attribute

    Returns link_dict, where link_dict[key][lang][neighbour_lang] is the number
    of links found between nodes of `lang` and neighbours of `neighbour_lang`.
    Every language present in the tree gets an entry, even without any link.
    """
    lang_links = link_dict.setdefault(key, {})

    # The traversal direction only depends on the graph type and on the key,
    # so it is chosen once rather than for every node.
    if not tree.is_directed():
        neighbours_of = tree.neighbors
    elif "children => parent" in key:
        neighbours_of = tree.predecessors
    else:
        neighbours_of = tree.successors

    for node_ix in tree.nodes:
        lang = tree.nodes[node_ix]["lang"]
        lang_counts = lang_links.setdefault(lang, {})
        for ngbr_ix in neighbours_of(node_ix):
            add_val_to_ctr_dict(lang_counts, tree.nodes[ngbr_ix]["lang"])
    return link_dict

# Each analysed relation (with its reading direction for the directed graphs)
# and the graph it is computed from
relation_graphs = {
    "Inheritance: children => parent": inher_graph,
    "Inheritance: parent => children": inher_graph,
    "Cognacy": cog_graph,
    "Borrowing: children => parent": bor_graph,
    "Borrowing: parent => children": bor_graph,
}

# Start from one empty counter table per relation, then fill them one by one
linked_languages = {relation: {} for relation in relation_graphs}
for key, graph in relation_graphs.items():
    linked_languages = get_linked_languages(key, linked_languages, graph)
    

In [39]:
# Total number of links per language, for every relation type
relation_totals = {
    rel_type: {lang_code: sum(friends.values()) for lang_code, friends in rel_dict.items()}
    for rel_type, rel_dict in linked_languages.items()
}

# First the five languages with the most links, then the five with the least
for most in (True, False):
    print(f"------ Languages with the {'most' if most else 'least'} relations in the tree.")
    for rel_type, totals in relation_totals.items():
        print(rel_type, sorted(totals, key=totals.get, reverse=most)[:5])

------ Languages with the most relations in the tree.
Inheritance: children => parent ['en', 'it', 'enm', 'fr', 'la']
Inheritance: parent => children ['la', 'gem-pro', 'ine-pro', 'sla-pro', 'grc']
Cognacy ['en', 'de', 'nl', 'la', 'ang']
Borrowing: children => parent ['en', 'pt', 'hi', 'fr', 'ru']
Borrowing: parent => children ['en', 'la', 'fr', 'grc', 'sa']
------ Languages with the least relations in the tree.
Inheritance: children => parent ['kmb', 'euq-pro', 'qfa-cka-pro', 'map-pro', 'ath-pro']
Inheritance: parent => children ['pap', 'fax', 'gmw-zps', 'pnt', 'lld']
Cognacy ['jpx-ryu-pro', 'mhu', 'trk', 'bds', 'tgn']
Borrowing: children => parent ['fro-nor', 'kmb', 'la-ecc', 'lng', 'hbo']
Borrowing: parent => children ['yly', 'mic', 'zpq', 'lb', 'zos']


### Get the n most frequent neighbors for a given language

In [41]:
n = 5

# For every relation type and every language, keep only the n neighbouring
# languages with the highest link counts, in decreasing order of count.
linked_languages_max = {}
for relation, per_lang in linked_languages.items():
    top_neighbours = {}
    for lang_code, neighbours in per_lang.items():
        ranked = sorted(neighbours.items(), key=lambda item: item[1], reverse=True)
        top_neighbours[lang_code] = dict(ranked[:n])
    linked_languages_max[relation] = top_neighbours

In [43]:
max_lang = "fro"
print("Most frequent neighbors for lang", max_lang)
for rel_type, rel_dict in linked_languages_max.items():
    # Relation types in which the language does not appear are skipped silently
    if max_lang in rel_dict:
        print(rel_type, rel_dict[max_lang])

Most frequent neighbors for lang fro
Inheritance: children => parent {'la': 3886, 'la-vul': 680, 'la-lat': 464, 'frk': 310, 'la-med': 272}
Inheritance: parent => children {'fr': 2001, 'nrf': 1554, 'frm': 1532, 'en': 1308, 'enm': 1299}
Cognacy {'en': 161, 'fro': 50, 'it': 42, 'pro': 32, 'fr': 27}
Borrowing: children => parent {'la': 354, 'frk': 50, 'la-lat': 23, 'la-med': 12, 'non': 10}
Borrowing: parent => children {'enm': 1039, 'en': 369, 'it': 67, 'gl': 46, 'ga': 40}


### Total number of relations of each type for a given language

In [45]:
# For every relation type, the total number of links of each language,
# all neighbouring languages taken together
linked_languages_all = {
    rel_type: {
        lang_code: sum(neighbours.values())
        for lang_code, neighbours in rel_dict.items()
    }
    for rel_type, rel_dict in linked_languages.items()
}

In [48]:
sum_lang = "sa"
print("Total number of relations of a type for lang", sum_lang)

# The accumulator is deliberately not called `sum`: rebinding that name would
# shadow the builtin for the rest of the kernel session and break any cell
# calling sum(...) that is run afterwards.
total_relations = 0
for rel_type, rel_dict in linked_languages_all.items():
    # A language absent from a relation type has no relation of that type
    lang_relations = rel_dict.get(sum_lang, 0)
    print(f"{lang_relations} relations of type {rel_type}")
    total_relations += lang_relations
print(total_relations)

Total number of relations of a type for lang sa
1724 relations of type Inheritance: children => parent
7877 relations of type Inheritance: parent => children
4710 relations of type Cognacy
45 relations of type Borrowing: children => parent
4513 relations of type Borrowing: parent => children
18869
