In [19]:
from fos.entity import load_entities
from fos.vectors import load_fasttext, load_tfidf, load_field_keys
from fos.settings import ASSETS_DIR

import pandas as pd

In [20]:
import re

In [21]:
def clean_text(text):
    # Normalize hyphens
    text = re.sub(r'[–—]', '-', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

In [22]:
def to_title(text):
    # We assume all dashes are hyphens and each whitespace is a space 
    #   which requires clean_text first
    tokens = re.split(r'([ \-\(\)])', text)
    cased_tokens = []
    for i, token in enumerate(tokens):
        # Two split-pattern chars in a row yields an empty string between the matches
        if len(token) == 0:
            continue
        # Don't change abbreviation casing
        elif len(token) > 1 and token.isupper():
            cased_tokens.append(token)
        # Also don't change e.g. 'eWLB'
        elif not token[0].isupper() and len(token) > 1 \
            and any(c.isalpha() and c.isupper() for c in token[1:]):
                cased_tokens.append(token)
        # Lowercase prepositions, unless they're the first word
        elif i > 0 and token.lower() in ['a', 'an', 'the', 'of', 'and', 'or', 'but', 'for', 'nor', 'on', 'at', 'to', 'from', 'by', 'with', 'in', 'through', 'via']:
            cased_tokens.append(token.lower())
        # Uppercase the first letter of other words
        elif len(token) == 1:
            cased_tokens.append(token[0].upper())
        else:
            cased_tokens.append(token[0].upper() + token[1:])
    text = ''.join(cased_tokens)
    return text

assert to_title('Computer science') == 'Computer Science', to_title('Computer science')
assert to_title('Neuro-symbolic AI') == 'Neuro-Symbolic AI'
assert to_title('text-to-speech') == 'Text-to-Speech'
assert to_title('Text-To-Image Models') == 'Text-to-Image Models'
assert to_title('System On A Chip') == 'System on a Chip'
assert to_title('Zero-Knowledge Proofs') == 'Zero-Knowledge Proofs'
assert to_title('Law and society') == 'Law and Society'
assert to_title('The Law and society') == 'The Law and Society'
assert to_title('GPU') == 'GPU'
assert to_title('Brain-computer interfacing') == 'Brain-Computer Interfacing'
assert to_title('eWLB packaging') == 'eWLB Packaging'
assert to_title('System in a package') == 'System in a Package'
assert to_title('Risk analysis (Engineering)') == 'Risk Analysis (Engineering)'

In [23]:
keys = load_field_keys("en")
len(keys)

1109

In [24]:
keys[:5]

['Environmental science', 'Geology', 'Chemistry', 'History', 'Engineering']

In [25]:
keys = [to_title(clean_text(key)) for key in keys]

# Going to need to fix this -- on inspection this appears in the DB too. We have one
# record with a dash and another with an en-dash.
pd.Series(keys)[pd.Series(keys).duplicated()]

370    Human-Computer Interaction
dtype: object

In [26]:
db = pd.read_sql_table('pages', 'sqlite:///../wiki/data/wiki.db')
db.head()

Unnamed: 0,id,level,display_name,normalized_name,en_title_1,page_id_1,en_html_1,wiki_title_1_section,wiki_title_2_section,en_title_2,page_id_2,wiki_title_3_section,en_title_3,page_id_3,en_html_2,en_html_3,en_text
0,1,0,Environmental science,environmental science,Environmental science,64919,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",,,,,,,,,,Environmental science is an interdisciplinary...
1,2,0,Geology,geology,Geology,12207,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",,,,,,,,,,Geology (from Ancient Greek γῆ (gê) 'earth' ...
2,3,0,Chemistry,chemistry,Chemistry,5180,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",,,,,,,,,,Chemistry is the scientific study of the prop...
3,4,0,History,history,History,10772350,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",,,,,,,,,,History (derived from Ancient Greek ἱστορία ...
4,5,0,Engineering,engineering,Engineering,9251,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",,,,,,,,,,Engineering is the practice of using natural ...


Take a moment to confirm that our key index -- the list of field names that identifes rows in field embedding matrices -- is consistent with the field names in the database.  

In [27]:
for display_name in db['display_name'].sort_values():
    display_name = clean_text(display_name)
    if display_name != to_title(display_name):
        print(display_name, '->', to_title(display_name))

3D integrated circuit -> 3D Integrated Circuit
3D microfabrication -> 3D Microfabrication
AI alignment -> AI Alignment
AI safety -> AI Safety
Abductive reasoning -> Abductive Reasoning
Access control -> Access Control
Active contour model -> Active Contour Model
Active learning -> Active Learning
Active-pixel sensor -> Active-Pixel Sensor
Activity recognition -> Activity Recognition
Actuarial science -> Actuarial Science
Adversarial machine learning -> Adversarial Machine Learning
Aerospace engineering -> Aerospace Engineering
Affective neuroscience -> Affective Neuroscience
Agent architectures -> Agent Architectures
Agent-based modeling -> Agent-Based Modeling
Agent-based social simulation -> Agent-Based Social Simulation
Agricultural biotechnology -> Agricultural Biotechnology
Agricultural economics -> Agricultural Economics
Agricultural engineering -> Agricultural Engineering
Agricultural science -> Agricultural Science
Algorithmic bias -> Algorithmic Bias
Alignment-free sequence an

In [28]:
# Same field names in each?
db['display_name'] = db['display_name'].apply(lambda x: to_title(clean_text(x)))
assert set(keys) == set(db['display_name'])

In [29]:
# Same order?
assert all(pd.Series(keys) == db['display_name'])

## First approach

Below I load `all_fields_hierarchy.jsonl` because I thought it contained the field metadata (i.e., levels) and parent-child links we need for BQ, but this turns out to be a detour. The metadata in it isn't final or consistent with what we settled on.

In [32]:
# Now load the parent/child link information from all_fields_hierarchy.jsonl
links = pd.read_json(ASSETS_DIR / "fields/all_fields_hierarchy.jsonl", lines=True)
links['display_name'] = links['display_name'].apply(lambda x: to_title(clean_text(x)))
links['child_display_name'] = links['child_display_name'].apply(lambda x: to_title(clean_text(x)))
links.head()

Unnamed: 0,normalized_name,display_name,parent_level,child_normalized_name,child_display_name,child_level
0,environmental science,Environmental Science,0,environmental engineering,Environmental Engineering,1
1,engineering,Engineering,0,environmental engineering,Environmental Engineering,1
2,environmental science,Environmental Science,0,environmental planning,Environmental Planning,1
3,geography,Geography,0,environmental planning,Environmental Planning,1
4,environmental science,Environmental Science,0,environmental resource management,Environmental Resource Management,1


In [33]:
links.shape

(1162, 6)

In [34]:
# Quick consistency check: all the links file field names are in the database
assert links['display_name'].isin(db['display_name']).all()

In [35]:
# Similarly, all the db field names can be found either as parents or children in the links file
assert (db['display_name'].isin(links['display_name']) | db['display_name'].isin(links['child_display_name'])).all()

In [36]:
# What levels are linked here?
links[['parent_level', 'child_level']].value_counts(sort=False, dropna=False)

parent_level  child_level
0             1              351
1             2              106
2             3              705
dtype: int64

In [37]:
# The idea here is to populate the dictionaries with the children of each field at each level
l0_l1 = {}
l0_l2 = {}
l0_l3 = {}
l1_l2 = {}
l1_l3 = {}
l2_l3 = {}

In [38]:
# Get rows containing children of each L0; all L0s have children so we can iterate over rows with 'parent_level' == 0
for l0_parent, l1_children in links.loc[links['parent_level'] == 0].groupby('display_name'):
    # For each L0, we want to know what their L1 children are
    assert (l1_children['child_level'] == 1).all()
    l0_l1[l0_parent] = l1_children['child_display_name'].tolist()
    print('[0]', l0_parent, l1_children.shape[0])

[0] Art 6
[0] Biology 29
[0] Business 13
[0] Chemistry 21
[0] Computer Science 17
[0] Economics 40
[0] Engineering 45
[0] Environmental Science 8
[0] Geography 11
[0] Geology 18
[0] History 7
[0] Materials Science 7
[0] Mathematics 20
[0] Medicine 45
[0] Philosophy 7
[0] Physics 27
[0] Political Science 3
[0] Psychology 14
[0] Sociology 13


In [39]:
surprises = []
for l0, l1s in l0_l1.items():
    for l1 in l1s:
        # print(l0, '->', l1)
        l1_l2_links = links.loc[links['display_name'] == l1]
        for _, row in l1_l2_links.iterrows():
            if row['child_level'] != 2:
                surprises.append(row['child_display_name'])
            l2_name = row['child_display_name']
            l1_l2[l1] = l2_name
            l0_l2[l0] = l2_name

The parent_level and child_level fields in this file don't look consistent -- below, AI has a parent level of `2` where it appears as a parent of active learning, but a child level of `1` where it appears as a child of computer science.

In [41]:
len(surprises)

892

In [42]:
links.loc[links['child_display_name'] == 'Active Learning']

Unnamed: 0,normalized_name,display_name,parent_level,child_normalized_name,child_display_name,child_level
441,artificial intelligence,Artificial Intelligence,2,active learning,Active Learning,3


In [43]:
links.loc[links['child_display_name'] == 'Artificial Intelligence']

Unnamed: 0,normalized_name,display_name,parent_level,child_normalized_name,child_display_name,child_level
319,computer science,Computer Science,0,artificial intelligence,Artificial Intelligence,1


## Final approach

We have a Google Sheet called `Consolidated revisions` [here](https://docs.google.com/spreadsheets/d/1ic_ckG2zujuQXBIdasWtPXVE8yQUW9s_IXtiVczduao/edit?gid=0#gid=0) containing everything we changed as part of the L2 project. (This includes changing some L1s.) Then, we have in the `fields_of_study` dataset in BQ the original L0 and L1 fields. Together these have all the information we need to reliably define the current field taxonomy: L0-L2 names, levels, and relations/links between them.

In [330]:
# Grab the L0-L1 names and levels from BQ
import pandas_gbq as pbq

meta = pbq.read_gbq('select name, level from fields_of_study.field_meta where level between 0 and 1')
meta['name'] = meta['name'].apply(lambda x: to_title(clean_text(x)))
meta.head()

Downloading: 100%|[32m██████████[0m|


Unnamed: 0,name,level
0,Political Science,0
1,Psychology,0
2,Mathematics,0
3,Chemistry,0
4,Art,0


In [331]:
meta.shape

(311, 2)

In [332]:
meta['name'].str.lower().duplicated().any()

False

In [333]:
# Grab the parent-child relations for the above L0-L1 fields
children = pbq.read_gbq('''\
select 
  l0_meta.name as parent_name,
  l1_meta.name as child_name
from fields_of_study.field_meta as l0_meta
inner join fields_of_study.field_children using(field_id)
inner join fields_of_study.field_meta as l1_meta on field_children.child_field_id = l1_meta.field_id
where l0_meta.level = 0
order by 
  parent_name, 
  child_name
''')
children['parent_name'] = children['parent_name'].apply(lambda x: to_title(clean_text(x)))
children['child_name'] = children['child_name'].apply(lambda x: to_title(clean_text(x)))
children

Downloading: 100%|[32m██████████[0m|


Unnamed: 0,parent_name,child_name
0,Art,Aesthetics
1,Art,Art History
2,Art,Classics
3,Art,Humanities
4,Art,Literature
...,...,...
365,Sociology,Pedagogy
366,Sociology,Political Economy
367,Sociology,Regional Science
368,Sociology,Social Science


In [334]:
from openpyxl import load_workbook

# This is the above `Consolidated revisions` sheet saved to a local Excel file
path = str(ASSETS_DIR / "fields/l2_revisions.xlsx")
wb = load_workbook(path)
wb.sheetnames

['Computer Science L1s',
 'Biology L1s',
 'AI L2s',
 'AI L3s',
 'Additional Eng L1',
 'Semiconductor L2s',
 'Semiconductor L3s',
 'Biotech L2s',
 'Biotech L3s',
 'Genetics L2s',
 'Genetics L3s',
 'Immunology L2s',
 'Immunology L3s',
 'Neuroscience L2s',
 'Neuroscience L3s',
 'Virology L2s',
 'Virology L3s',
 'Bioinformatics L2s',
 'Bioinformatics L3s',
 'Cybersecurity L2s',
 'Cybersecurity L3s',
 'All consolidated']

## DRY
We need to do this about a dozen more times, so let's abstract.


In [335]:
def load_revisions(sheet_name):
    df = pd.read_excel(path, sheet_name=sheet_name)
    df = df.rename(columns={'display_name': 'child_name'})
    df['parent_name'] = df['parent_name'].apply(lambda x: to_title(clean_text(x)))
    df['child_name'] = df['child_name'].apply(lambda x: to_title(clean_text(x)))
    assert df['parent_name'].nunique() == 1
    assert not df['child_name'].duplicated().any()
    assert df['level'].nunique() == 1
    assert (df['parent_name'] != '').all()
    assert (df['child_name'] != '').all()
    assert df[['parent_name', 'child_name']].duplicated().sum() == 0, df.loc[df[['parent_name', 'child_name']].duplicated(), :]
    print('Read sheet', sheet_name, 'with', df.shape[0], 'rows')
    return df

def drop_children_meta(meta, parent_name):
    n1 = meta.shape[0]
    assert parent_name in meta['name'].values, parent_name
    meta = meta.loc[~meta['name'].isin(children.loc[children['parent_name'] == parent_name, 'child_name'])]
    n2 = meta.shape[0]
    print('Dropped', n1 - n2, 'rows from meta')
    return meta

def drop_children_children(children, parent_name):
    n1 = children.shape[0]
    assert parent_name in children['parent_name'].values
    children = children.loc[children['parent_name'] != parent_name]
    n2 = children.shape[0]
    print('Dropped', n1 - n2, 'rows from children')
    return children

def add_children(children, revisions):
    children = pd.concat([children, revisions[['parent_name', 'child_name']]], ignore_index=True)
    assert not children.duplicated().any(), children.loc[children.duplicated(), :]
    print('Added', revisions.shape[0], 'rows to children')
    return children

def add_meta(meta, revisions, level):
    new_meta = revisions[['child_name']].rename(columns={'child_name': 'name'}).assign(level=level)
    new_meta = new_meta.loc[~new_meta['name'].isin(meta['name'])]
    meta = pd.concat([meta, new_meta], ignore_index=True)
    print('Added', revisions.shape[0], 'rows to meta')
    return meta

def update_fields(meta, children, revisions):
    l0_name = revisions['parent_name'].iloc[0]
    level = revisions['level'].iloc[0]
    meta = drop_children_meta(meta, l0_name)
    children = drop_children_children(children, l0_name)
    children = add_children(children, revisions)
    meta = add_meta(meta, revisions, level)
    return meta, children

def add_fields(meta, children, revisions):
    level = revisions['level'].iloc[0]
    children = add_children(children, revisions)
    meta = add_meta(meta, revisions, level)
    return meta, children

In [336]:
for sheet_name in wb.sheetnames:
    if sheet_name == 'All consolidated':
        continue
    print(sheet_name)
    revisions = load_revisions(sheet_name)
    level = revisions['level'].iloc[0]
    if level == 1 and not sheet_name.startswith('Additional'):
        meta, children = update_fields(meta, children, revisions)
    else:
        meta, children = add_fields(meta, children, revisions) 

Computer Science L1s
Read sheet Computer Science L1s with 18 rows
Dropped 34 rows from meta
Dropped 34 rows from children
Added 18 rows to children
Added 18 rows to meta
Biology L1s
Read sheet Biology L1s with 31 rows
Dropped 32 rows from meta
Dropped 32 rows from children
Added 31 rows to children
Added 31 rows to meta
AI L2s
Read sheet AI L2s with 12 rows
Added 12 rows to children
Added 12 rows to meta
AI L3s
Read sheet AI L3s with 195 rows
Added 195 rows to children
Added 195 rows to meta
Additional Eng L1
Read sheet Additional Eng L1 with 1 rows
Added 1 rows to children
Added 1 rows to meta
Semiconductor L2s
Read sheet Semiconductor L2s with 15 rows
Added 15 rows to children
Added 15 rows to meta
Semiconductor L3s
Read sheet Semiconductor L3s with 81 rows
Added 81 rows to children
Added 81 rows to meta
Biotech L2s
Read sheet Biotech L2s with 7 rows
Added 7 rows to children
Added 7 rows to meta
Biotech L3s
Read sheet Biotech L3s with 45 rows
Added 45 rows to children
Added 45 rows t

In [337]:
meta.duplicated().any()

False

In [338]:
meta.shape

(1101, 2)

In [339]:
meta.head()

Unnamed: 0,name,level
0,Political Science,0
1,Psychology,0
2,Mathematics,0
3,Chemistry,0
4,Art,0


In [340]:
meta['name'].str.lower().duplicated().any()

False

In [341]:
children.duplicated().any()

False

In [342]:
meta.head()

Unnamed: 0,name,level
0,Political Science,0
1,Psychology,0
2,Mathematics,0
3,Chemistry,0
4,Art,0


In [343]:
children.head()

Unnamed: 0,parent_name,child_name
0,Art,Aesthetics
1,Art,Art History
2,Art,Classics
3,Art,Humanities
4,Art,Literature


In [344]:
meta.to_json(ASSETS_DIR / "fields/field_meta.jsonl", orient='records', lines=True)
children.to_json(ASSETS_DIR / "fields/field_children.jsonl", orient='records', lines=True)

In [345]:
children['child_name'].isin(meta['name']).all()

False

In [346]:
children.loc[~children['child_name'].isin(meta['name'])]

Unnamed: 0,parent_name,child_name
27,Chemistry,Food Science
89,Engineering,Computer Engineering
95,Engineering,Embedded System
117,Engineering,Simulation
121,Engineering,Telecommunications
124,Environmental Science,Agricultural Science
125,Environmental Science,Agroforestry
176,Mathematics,Algorithm
195,Medicine,Anatomy
206,Medicine,Endocrinology


In [347]:
meta.loc[(~meta['name'].isin(children['child_name'])) & (meta['level'] != 0)]

Unnamed: 0,name,level


In [348]:
from io import StringIO
missing_children = pd.read_csv(StringIO("""\
parent_name,child_name
Chemistry,Food Science
Engineering,Computer Engineering
Engineering,Embedded System
Engineering,Telecommunications
Environmental Science,Agricultural Science
Environmental Science,Agroforestry
Mathematics,Algorithms
Medicine,Anatomy
Medicine,Endocrinology
"""))
missing_children['parent_name'] = missing_children['parent_name'].apply(lambda x: to_title(clean_text(x)))
missing_children['child_name'] = missing_children['child_name'].apply(lambda x: to_title(clean_text(x)))
missing_children

Unnamed: 0,parent_name,child_name
0,Chemistry,Food Science
1,Engineering,Computer Engineering
2,Engineering,Embedded System
3,Engineering,Telecommunications
4,Environmental Science,Agricultural Science
5,Environmental Science,Agroforestry
6,Mathematics,Algorithms
7,Medicine,Anatomy
8,Medicine,Endocrinology


In [349]:
children = children.loc[children['child_name'].isin(meta['name'])]
children = pd.concat([children, missing_children], ignore_index=True)

children.loc[~children['child_name'].isin(meta['name'])]

Unnamed: 0,parent_name,child_name
1204,Chemistry,Food Science
1205,Engineering,Computer Engineering
1206,Engineering,Embedded System
1207,Engineering,Telecommunications
1208,Environmental Science,Agricultural Science
1209,Environmental Science,Agroforestry
1211,Medicine,Anatomy
1212,Medicine,Endocrinology


In [350]:
# Add to meta
missing_meta = pd.read_csv(StringIO("""\
name,level
Food Science,1
Computer Engineering,1
Embedded System,1
Telecommunications,1
Agricultural Science,1
Agroforestry,1
Anatomy,1
Endocrinology,1
"""))
missing_meta['name'] = missing_meta['name'].apply(lambda x: to_title(clean_text(x)))
missing_meta.head()                    

Unnamed: 0,name,level
0,Food Science,1
1,Computer Engineering,1
2,Embedded System,1
3,Telecommunications,1
4,Agricultural Science,1


In [351]:
meta = pd.concat([meta, missing_meta], ignore_index=True)
meta.shape

(1109, 2)

In [388]:
def test_meta(meta):
    # No duplicate field names
    assert not meta['name'].duplicated().any()
    # Still no duplicate field names after lowercasing and stripping whitespace
    assert not meta['name'].str.lower().str.strip().duplicated().any()
    # No field names should have leading/trailing whitespace
    assert (meta['name'].str.strip() == meta['name']).all()
    # Field names should be title-cased
    assert (meta['name'].apply(lambda x: to_title(clean_text(x))) == meta['name']).all()
    # No nulls
    assert not meta['name'].isnull().any()
    # No empty strings
    assert not (meta['name'] == '').any()

test_meta(meta)

In [389]:
def test_children(children):
    # No duplicate parent-child pairs
    assert not children.duplicated().any()
    # No nulls
    assert not children['parent_name'].isnull().any()
    assert not children['child_name'].isnull().any()
    # No empty strings
    assert not (children['parent_name'] == '').any()
    assert not (children['child_name'] == '').any()
    
test_children(children)    

In [356]:
def test_children_in_meta(meta, children):
    # All children should be in meta
    assert children['child_name'].isin(meta['name']).all()
    # All parents should be in meta
    assert children['parent_name'].isin(meta['name']).all()
    
test_children_in_meta(meta, children)

In [357]:
def test_meta_in_children(meta, children):
    # All fields in meta should be in children
    assert (meta['name'].isin(children['child_name']) | meta['name'].isin(children['parent_name'])).all()

test_meta_in_children(meta, children)

In [358]:
children['parent_name'].nunique()

28

In [359]:
children['child_name'].duplicated().sum()

123

We can expect child names to appear more than once in the parent-child table because children can have multiple parents. For example, "Computer Engineering" is a child of both "Engineering" and "Computer Science".

In [360]:
dupe_children_names = children.loc[children['child_name'].duplicated(), 'child_name']
children.loc[children['child_name'].isin(dupe_children_names)].sort_values(['child_name', 'parent_name'])[['child_name', 'parent_name']]

Unnamed: 0,child_name,parent_name
6,Accounting,Business
39,Accounting,Economics
7,Actuarial Science,Business
40,Actuarial Science,Economics
0,Aesthetics,Art
...,...,...
795,Viral Metagenomics,Genetics
837,Viral Phylodynamics,Immunology
1002,Viral Phylodynamics,Virology
332,Virology,Biology


What we shouldn't find is any children whose parents are at different levels. This suggests we might've moved a child field under a new parent without removing the old parent-child link.

Reviewing the above table manually, for example I see "Computer Simulation" appears as a child of both "Artificial Intelligence" and "Computer Science". We actually wanted to move it from being a child of the latter to that of the former.

Let's check for this programmatically.

In [361]:
# Filter the parent-child table to only include children that appear more than once, i.e.,
# have multiple parents
dupe_children = children.loc[children['child_name'].isin(dupe_children_names)].\
    sort_values(['child_name', 'parent_name'])[['child_name', 'parent_name']]
# Join with the meta table to get the levels of the parents
dupe_meta = meta.merge(dupe_children, left_on='name', right_on='parent_name', how='inner').\
    drop(columns=['name']).\
    rename(columns={'level': 'parent_level'})
dupe_meta.head()

Unnamed: 0,parent_level,child_name,parent_name
0,0,Clinical Psychology,Psychology
1,0,Communication,Psychology
2,0,Criminology,Psychology
3,0,Mathematics Education,Psychology
4,0,Neuroscience,Psychology


In [362]:
# Get frequency counts over parent-child pairs for the number of unique parent levels
#   (we want all 1)
dupe_meta.groupby('child_name').agg({'parent_level': 'nunique'})['parent_level'].value_counts()

1    113
2      7
Name: parent_level, dtype: int64

We have 7 child fields with 2 parents at levels.

In [363]:
inconsistent_dupes = dupe_meta.groupby('child_name').\
    agg(parent_levels = ('parent_level', lambda x: x.tolist()),
        parent_names = ('parent_name', lambda x: x.tolist()),
        n_levels = ('parent_level', 'nunique')).\
    loc[lambda x: x['n_levels'] > 1]

inconsistent_dupes

Unnamed: 0_level_0,parent_levels,parent_names,n_levels
child_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Biomedical Engineering,"[0, 0, 1]","[Medicine, Engineering, Biotechnology]",2
Computational Biology,"[0, 1]","[Biology, Bioinformatics]",2
Computer Simulation,"[0, 1]","[Computer Science, Artificial Intelligence]",2
Human-Computer Interaction,"[0, 1]","[Computer Science, Artificial Intelligence]",2
Mathematical Optimization,"[0, 1]","[Mathematics, Artificial Intelligence]",2
Synthetic Biology,"[0, 1]","[Biology, Biotechnology]",2
Systems Biology,"[0, 1]","[Biology, Bioinformatics]",2


The six cases where one of the parents is an L0 and one of the parents is an L1 are easy fixes -- in each of these cases, we wanted to move the child field from L1 to L2. The solution is to drop the L0-L1 parent-child link.

The correct placement of Biomedical Engineering isn't immediately clear to me. Its placement as an L2 under Biotechnology must've been intentional, but should it not also appear (somewhere) under Medicine or Engineering?

For now, I'll keep it under Biotechnology and drop it from Medicine and Engineering.

In [364]:
children.shape

(1213, 2)

In [365]:
# Get the subset of the parent-child table where a child has inconsistent parent levels
# and just keep the rows where the parent level isn't 0, since those happen to be the
# parents we want to drop. We should have 6 resulting rows (see above), and we do.
to_keep = pd.merge(
    children.loc[children.child_name.isin(inconsistent_dupes.index)],
    dupe_meta.loc[dupe_meta.parent_level != 0], how='inner').drop(columns=['parent_level'])
to_keep

Unnamed: 0,parent_name,child_name
0,Artificial Intelligence,Mathematical Optimization
1,Artificial Intelligence,Computer Simulation
2,Artificial Intelligence,Human-Computer Interaction
3,Biotechnology,Biomedical Engineering
4,Biotechnology,Synthetic Biology
5,Bioinformatics,Computational Biology
6,Bioinformatics,Systems Biology


In [366]:
# We can build the result we want now: the parent-child table, with only the rows we kept
# above for the children with inconsistent parent levels
children = pd.concat(
    [children.loc[~children.child_name.isin(inconsistent_dupes.index)], to_keep],
    ignore_index=True) 
children.shape

(1205, 2)

We should've dropped 8 rows (see above) and we did.

In [385]:
test_meta(meta)
test_children(children)
test_children_in_meta(meta, children)
test_meta_in_children(meta, children)

Almost good to go. Let's check we have field text for each field.

In [401]:
meta.loc[~meta['name'].isin(db['display_name'])]

Unnamed: 0,name,level
1037,Buffer Overflow Attacks,3


In [402]:
db.loc[db['display_name'].str.contains('Buffer')]

Unnamed: 0,id,level,display_name,normalized_name,en_title_1,page_id_1,en_html_1,wiki_title_1_section,wiki_title_2_section,en_title_2,page_id_2,wiki_title_3_section,en_title_3,page_id_3,en_html_2,en_html_3,en_text
1073,1074,3,Buffer Overflow Protection,buffer overflow protection,Buffer overflow protection,608625,"<!DOCTYPE html>\n<html prefix=""dc: http://purl...",,,,,,,,,,Buffer overflow protection is any of various ...


In [404]:
meta.loc[meta['name'].str.contains('Buffer')]

Unnamed: 0,name,level
1037,Buffer Overflow Attacks,3
1065,Buffer Overflow Protection,3


In [406]:
meta = meta.loc[meta['name'] != 'Buffer Overflow Attacks']

In [407]:
children.loc[children['child_name'].str.contains('Buffer')]

Unnamed: 0,parent_name,child_name
1125,Computer Security,Buffer Overflow Attacks
1153,Computer Security,Buffer Overflow Protection


In [409]:
children = children.loc[children['child_name'] != 'Buffer Overflow Attacks']

In [410]:
test_meta(meta)
test_children(children)
test_children_in_meta(meta, children)
test_meta_in_children(meta, children)

In [411]:
def test_meta_in_db(meta, db):
    assert meta['name'].isin(db['display_name']).all()
    assert db['display_name'].isin(meta['name']).all()

test_meta_in_db(meta, db)

In [412]:
meta.to_json(ASSETS_DIR / "fields/field_meta.jsonl", orient='records', lines=True)
children.to_json(ASSETS_DIR / "fields/field_children.jsonl", orient='records', lines=True)