In [None]:
import ipytest
import numpy as np
import pandas as pd
import pytest

In [None]:
# Command-line flags handed to pytest when the notebook's tests are run.
PYTEST_OPTIONS = [
    "-v",
    "--color=yes",
]

In [None]:
@pytest.fixture
def expected_num_years_lived_dead():
    """Return a one-element ``num_years_lived`` series holding 87."""
    years_lived = pd.Series(data=87, name='num_years_lived')
    return years_lived


@pytest.fixture
def expected_num_years_lived_alive():
    """Return a one-element ``num_years_lived`` series holding 55."""
    years_lived = pd.Series(data=55, name='num_years_lived')
    return years_lived


@pytest.fixture
def expected_is_theoretical_physicist_categories():
    """Return the two yes/no flags expected by the categories test."""
    flags = ['yes', 'no']
    return pd.Series(data=flags, name='is_theoretical_physicist')


@pytest.fixture
def expected_is_theoretical_physicist_field():
    """Return the three yes/no flags expected by the field test."""
    flags = ['yes', 'no', 'no']
    return pd.Series(data=flags, name='is_theoretical_physicist')


@pytest.fixture
def expected_is_experimental_physicist_description():
    """Return the three yes/no flags expected by the description test."""
    flags = ['yes', 'no', 'no']
    return pd.Series(data=flags, name='is_experimental_physicist')


@pytest.fixture
def expected_is_astronomer_comment():
    """Return the three yes/no flags expected by the comment test."""
    flags = ['yes', 'no', 'no']
    return pd.Series(data=flags, name='is_astronomer')


@pytest.fixture
def expected_num_laureates():
    """Return the per-row laureate counts ``[2, 0, 0]`` for academic advisors."""
    counts = [2, 0, 0]
    return pd.Series(data=counts,
                     name='num_physics_laureate_academic_advisors')


@pytest.fixture
def expected_num_laureates_name():
    """Return the per-row laureate counts ``[1, 0]`` for spouses."""
    counts = [1, 0]
    return pd.Series(data=counts, name='num_chemistry_laureate_spouses')


@pytest.fixture
def expected_places_codes():
    """Return one list of alpha-3 country codes per input row.

    Empty lists mark the rows for which no code is expected.
    """
    codes_per_row = [
        ['DNK'],
        [],
        ['GBR', 'IND', 'PAK'],
        ['AZE', 'GEO', 'RUS'],
        ['DEU', 'UKR', 'USA'],
        [],
    ]
    return pd.Series(data=codes_per_row, name='workplaces_alpha_3_codes')


@pytest.fixture
def expected_citizenship_codes_citizenship():
    """Return the citizenship code/count frame expected by the citizenship test."""
    columns = [
        'citizenship_country_alpha_3_codes',
        'num_citizenship_country_alpha_3_codes',
        'citizenship_continent_codes',
        'num_citizenship_continent_codes',
    ]
    # Each row: country codes, their count, continent codes, their count.
    rows = [
        [['USA'], 1, ['NA'], 1],
        [['USA'], 1, ['NA'], 1],
        [['DEU', 'GBR', 'USA'], 3, ['EU', 'NA'], 2],
    ]
    return pd.DataFrame(data=rows, columns=columns)


@pytest.fixture
def expected_citizenship_codes_nationality():
    """Return the citizenship code/count frame expected by the nationality test."""
    columns = [
        'citizenship_country_alpha_3_codes',
        'num_citizenship_country_alpha_3_codes',
        'citizenship_continent_codes',
        'num_citizenship_continent_codes',
    ]
    # Each row: country codes, their count, continent codes, their count.
    rows = [
        [['IND'], 1, ['AS'], 1],
        [['RUS'], 1, ['EU'], 1],
        [['IRN', 'USA'], 2, ['AS', 'NA'], 2],
    ]
    return pd.DataFrame(data=rows, columns=columns)


@pytest.fixture
def expected_citizenship_codes_description():
    """Return the citizenship code/count frame expected by the description test."""
    columns = [
        'citizenship_country_alpha_3_codes',
        'num_citizenship_country_alpha_3_codes',
        'citizenship_continent_codes',
        'num_citizenship_continent_codes',
    ]
    # Each row: country codes, their count, continent codes, their count.
    rows = [
        [['GBR'], 1, ['EU'], 1],
        [['GBR'], 1, ['EU'], 1],
        [['DEU', 'USA'], 2, ['EU', 'NA'], 2],
    ]
    return pd.DataFrame(data=rows, columns=columns)


@pytest.fixture
def expected_citizenship_codes_multiple():
    """Return the citizenship code/count frame expected by the multiple-source test."""
    columns = [
        'citizenship_country_alpha_3_codes',
        'num_citizenship_country_alpha_3_codes',
        'citizenship_continent_codes',
        'num_citizenship_continent_codes',
    ]
    # Each row: country codes, their count, continent codes, their count.
    rows = [
        [['AUS', 'GBR'], 2, ['EU', 'OC'], 2],
        [['FRA', 'GBR'], 2, ['EU'], 1],
        [['RUS', 'UKR'], 2, ['EU'], 1],
        [['COL', 'ESP', 'USA'], 3, ['EU', 'NA', 'SA'], 3],
    ]
    return pd.DataFrame(data=rows, columns=columns)


@pytest.fixture
def expected_citizenship_codes_not_found():
    """Return a single-row frame with empty code lists and zero counts."""
    columns = [
        'citizenship_country_alpha_3_codes',
        'num_citizenship_country_alpha_3_codes',
        'citizenship_continent_codes',
        'num_citizenship_continent_codes',
    ]
    rows = [
        [[], 0, [], 0],
    ]
    return pd.DataFrame(data=rows, columns=columns)


@pytest.fixture
def expected_binary_features_train():
    """Return the 2 x 29 yes/no frame expected from binarizing the train rows.

    The frame is indexed ``[1, 2]`` and built from a NumPy string array.
    """
    column_names = [
        'born_in_GBR', 'born_in_JPN', 'born_in_AS', 'born_in_EU',
        'lived_in_GBR', 'lived_in_JPN', 'lived_in_USA',
        'lived_in_AS', 'lived_in_EU', 'lived_in_NA',
        'alumnus_of_Columbia_University', 'alumnus_of_Harvard_University',
        'alumnus_in_USA', 'alumnus_in_NA',
        'worked_at_Columbia_University', 'worked_at_RIKEN',
        'worked_at_Stanford_University',
        'worked_in_JPN', 'worked_in_USA', 'worked_in_AS', 'worked_in_NA',
        'citizen_of_GBR', 'citizen_of_JPN', 'citizen_of_NZL', 'citizen_of_USA',
        'citizen_in_AS', 'citizen_in_AU', 'citizen_in_EU', 'citizen_in_NA',
    ]

    first_row = [
        'yes', 'no', 'no', 'yes',                                # born
        'yes', 'no', 'yes', 'no', 'yes', 'yes',                  # lived
        'yes', 'yes', 'yes', 'yes',                              # alumnus
        'yes', 'no', 'yes', 'no', 'yes', 'no', 'yes',            # worked
        'yes', 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes',    # citizen
    ]
    second_row = [
        'no', 'yes', 'yes', 'no',                                # born
        'no', 'yes', 'no', 'yes', 'no', 'no',                    # lived
        'yes', 'yes', 'no', 'no',                                # alumnus
        'no', 'yes', 'no', 'yes', 'no', 'yes', 'no',             # worked
        'no', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no',        # citizen
    ]

    values = np.array([first_row, second_row])
    return pd.DataFrame(values, columns=column_names, index=[1, 2])


@pytest.fixture
def expected_binary_features_test_less_categories_than_train():
    """Return the 2 x 29 yes/no frame expected by the ``pad_features`` test.

    ``born_in_GBR`` is the last column and is 'no' in both rows. The frame
    is indexed ``[1, 2]`` and built from a NumPy string array.
    """
    column_names = [
        'born_in_JPN', 'born_in_AS', 'born_in_EU',
        'lived_in_GBR', 'lived_in_JPN', 'lived_in_USA',
        'lived_in_AS', 'lived_in_EU', 'lived_in_NA',
        'alumnus_of_Columbia_University', 'alumnus_of_Harvard_University',
        'alumnus_in_USA', 'alumnus_in_NA',
        'worked_at_Columbia_University', 'worked_at_RIKEN',
        'worked_at_Stanford_University',
        'worked_in_JPN', 'worked_in_USA', 'worked_in_AS', 'worked_in_NA',
        'citizen_of_GBR', 'citizen_of_JPN', 'citizen_of_NZL', 'citizen_of_USA',
        'citizen_in_AS', 'citizen_in_AU', 'citizen_in_EU', 'citizen_in_NA',
        'born_in_GBR',
    ]

    first_row = [
        'no', 'no', 'yes',                                       # born
        'yes', 'no', 'yes', 'no', 'yes', 'yes',                  # lived
        'yes', 'yes', 'yes', 'yes',                              # alumnus
        'yes', 'no', 'yes', 'no', 'yes', 'no', 'yes',            # worked
        'yes', 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes',    # citizen
        'no',                                                    # born_in_GBR
    ]
    second_row = [
        'yes', 'yes', 'no',                                      # born
        'no', 'yes', 'no', 'yes', 'no', 'no',                    # lived
        'yes', 'yes', 'no', 'no',                                # alumnus
        'no', 'yes', 'no', 'yes', 'no', 'yes', 'no',             # worked
        'no', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no',        # citizen
        'no',                                                    # born_in_GBR
    ]

    values = np.array([first_row, second_row])
    return pd.DataFrame(values, columns=column_names, index=[1, 2])
    
    
@pytest.fixture
def expected_binary_features_train_threshold():
    """Return the 2 x 13 yes/no frame expected by the train threshold test.

    Several column names use ``*`` placeholders instead of concrete
    category names. The frame is indexed ``[1, 2]``.
    """
    column_names = [
        'born_in_***', 'born_in_**',
        'lived_in_***', 'lived_in_**',
        'alumnus_of_Columbia_University', 'alumnus_of_Harvard_University',
        'alumnus_in_***', 'alumnus_in_**',
        'worked_at_*', 'worked_in_***', 'worked_in_**',
        'citizen_of_***', 'citizen_in_**',
    ]

    first_row = [
        'yes', 'yes',               # born
        'yes', 'yes',               # lived
        'yes', 'yes', 'yes', 'yes', # alumnus
        'yes', 'yes', 'yes',        # worked
        'yes', 'yes',               # citizen
    ]
    second_row = [
        'yes', 'yes',               # born
        'yes', 'yes',               # lived
        'yes', 'yes', 'no', 'no',   # alumnus
        'yes', 'yes', 'yes',        # worked
        'yes', 'yes',               # citizen
    ]

    values = np.array([first_row, second_row])
    return pd.DataFrame(values, columns=column_names, index=[1, 2])


@pytest.fixture
def expected_binary_features_test():
    """Return the 2 x 32 yes/no frame expected from binarizing the test rows.

    Besides the named categories it holds the ``born_in_***``,
    ``alumnus_of_*`` and ``citizen_in_**`` placeholder columns. The frame is
    indexed ``[1, 2]`` and built from a NumPy string array.
    """
    column_names = [
        'born_in_GBR', 'born_in_JPN', 'born_in_***', 'born_in_AS',
        'born_in_EU',
        'lived_in_GBR', 'lived_in_JPN', 'lived_in_USA',
        'lived_in_AS', 'lived_in_EU', 'lived_in_NA',
        'alumnus_of_Columbia_University', 'alumnus_of_Harvard_University',
        'alumnus_of_*', 'alumnus_in_USA', 'alumnus_in_NA',
        'worked_at_Columbia_University', 'worked_at_RIKEN',
        'worked_at_Stanford_University',
        'worked_in_JPN', 'worked_in_USA', 'worked_in_AS', 'worked_in_NA',
        'citizen_of_GBR', 'citizen_of_JPN', 'citizen_of_NZL', 'citizen_of_USA',
        'citizen_in_AS', 'citizen_in_AU', 'citizen_in_EU', 'citizen_in_NA',
        'citizen_in_**',
    ]

    first_row = [
        'yes', 'no', 'yes', 'no', 'yes',                               # born
        'yes', 'no', 'yes', 'no', 'yes', 'yes',                        # lived
        'yes', 'yes', 'yes', 'yes', 'yes',                             # alumnus
        'yes', 'no', 'yes', 'no', 'yes', 'no', 'yes',                  # worked
        'yes', 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no',    # citizen
    ]
    second_row = [
        'no', 'yes', 'no', 'yes', 'no',                                # born
        'no', 'yes', 'no', 'yes', 'no', 'no',                          # lived
        'yes', 'yes', 'yes', 'no', 'no',                               # alumnus
        'no', 'yes', 'no', 'yes', 'no', 'yes', 'no',                   # worked
        'no', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no', 'yes',       # citizen
    ]

    values = np.array([first_row, second_row])
    return pd.DataFrame(values, columns=column_names, index=[1, 2])



@pytest.fixture
def train_counts():
    """Return a training frame with three count columns and two text columns.

    The frame is built with inferred dtypes and only the count columns are
    then pinned to ``int64``. Passing ``dtype='int64'`` to the constructor
    cannot be honoured for the ``name`` and ``gender`` columns; older pandas
    silently ignored it for those columns, newer releases raise instead.
    """
    counts = pd.DataFrame(
        [
            [1, 'Sheldon Cooper', 2, 'male', 0],
            [1, 'Amy Fowler', 1, 'female', 0],
            [1, 'Dexter', 2, 'male', 0],
        ],
        columns=['count1', 'name', 'count2', 'gender', 'count3'])
    # Cast just the numeric columns; the text columns keep their inferred dtype.
    return counts.astype(
        {'count1': 'int64', 'count2': 'int64', 'count3': 'int64'})


@pytest.fixture
def expected_count_ratios_train():
    """Return the ratio frame expected when converting the training counts.

    The frame is built with inferred dtypes and only the ratio columns are
    then pinned to ``float64``. Passing ``dtype='float64'`` to the
    constructor cannot be honoured for the ``name`` and ``gender`` columns;
    older pandas silently ignored it for those columns, newer releases raise
    instead.
    """
    ratios = pd.DataFrame(
        [
            ['Sheldon Cooper', 'male', 1.0, 1.2],
            ['Amy Fowler', 'female', 1.0, 0.6],
            ['Dexter', 'male', 1.0, 1.2],
        ],
        columns=['name', 'gender', 'ratio_count1', 'ratio_count2'])
    # Cast just the numeric columns; the text columns keep their inferred dtype.
    return ratios.astype(
        {'ratio_count1': 'float64', 'ratio_count2': 'float64'})



@pytest.fixture
def expected_count_ratios_test():
    """Return the ratio frame expected when converting the test counts.

    The frame is built with inferred dtypes and only the ratio columns are
    then pinned to ``float64``. Passing ``dtype='float64'`` to the
    constructor cannot be honoured for the ``name`` and ``gender`` columns;
    older pandas silently ignored it for those columns, newer releases raise
    instead.
    """
    ratios = pd.DataFrame(
        [
            ['Albert Einstein', 'male', 3.0, 3.0],
            ['Emmy Noether', 'female', 2.0, 0.0],
            ['Michael Green', 'male', 1.0, 0.6],
        ],
        columns=['name', 'gender', 'ratio_count1', 'ratio_count2'])
    # Cast just the numeric columns; the text columns keep their inferred dtype.
    return ratios.astype(
        {'ratio_count1': 'float64', 'ratio_count2': 'float64'})

In [None]:
# Drop test functions left over from earlier cell runs, one name pattern
# at a time, before the tests below are (re)defined.
for _stale_test_pattern in (
        "test_build_features*",
        "test_binarize_list_features*",
        "test_get_alma_mater_or_workplaces*",
        "test_convert_counts_to_ratios*",
):
    ipytest.clean_tests(_stale_test_pattern)


def test_build_features_full_name():
    """The built ``full_name`` column must equal the input ``fullName`` column."""
    built = build_features(
        train_physicists, nobel_physicists, nobel_chemists, places,
        nationalities)
    assert built.full_name.equals(train_physicists.fullName)

    
def test_build_features_gender():
    """The built ``gender`` column must equal the input ``gender`` column."""
    built = build_features(
        train_physicists, nobel_physicists, nobel_chemists, places,
        nationalities)
    assert built.gender.equals(train_physicists.gender)


def test_build_features_years_lived_dead(expected_num_years_lived_dead):
    """Years lived is checked for a birth date paired with a death date."""
    born = pd.Series('1908-11-12')
    died = pd.Series('1996-1-19')
    years_lived = _build_num_years_lived(born, died)
    assert years_lived.equals(expected_num_years_lived_dead)
    
    
def test_build_features_years_lived_alive(expected_num_years_lived_alive):
    """Years lived is checked for a birth date whose death date is NaN.

    NOTE(review): the expected value is a fixed number while no death date
    is supplied; if the helper measures up to the current date this test is
    time-dependent -- confirm against ``_build_num_years_lived``.
    """
    born = pd.Series('1963-05-07')
    not_dead = pd.Series(np.nan)
    years_lived = _build_num_years_lived(born, not_dead)
    assert years_lived.equals(expected_num_years_lived_alive)
    

def test_build_features_physics_subfield_categories(
        expected_is_theoretical_physicist_categories):
    """Subfield flags are checked when only the categories series has data."""
    search_terms = {'categories': 'Theoretical physicists',
                    'others': 'theoretical physic'}
    categories = pd.Series(['Nuclear Physicist|Theoretical physicists',
                            '1932 deaths|American physicists'])
    # The three remaining text inputs are empty string-typed series.
    empty_second = pd.Series(dtype=str)
    empty_third = pd.Series(dtype=str)
    empty_fourth = pd.Series(dtype=str)

    subfield = _build_physics_subfield(
        categories, empty_second, empty_third, empty_fourth, search_terms)
    assert subfield.equals(expected_is_theoretical_physicist_categories)
    
    
def test_build_features_physics_subfield_field(
        expected_is_theoretical_physicist_field):
    """Subfield flags are checked when only the field series has data."""
    search_terms = {'categories': 'Theoretical physicists',
                    'others': 'theoretical physic'}
    field = pd.Series(['physics|Theoretical physics',
                       'Philosophy|Mathematics|Quantum field theory',
                       np.nan])
    empty_categories = pd.Series(dtype=str, index=field.index)
    # The two trailing inputs are all-NaN series of the same length.
    nan_third = pd.Series(np.full(len(field), np.nan))
    nan_fourth = pd.Series(np.full(len(field), np.nan))

    subfield = _build_physics_subfield(
        empty_categories, field, nan_third, nan_fourth, search_terms)
    assert subfield.equals(expected_is_theoretical_physicist_field)
                     

def test_build_features_physics_subfield_description(
        expected_is_experimental_physicist_description):
    """Subfield flags are checked when only the description series has data."""
    search_terms = {'categories': 'Experimental physicists',
                    'others': 'experimental physic'}
    description = pd.Series(['Marie Curie was chair of experimental physics',
                             'Lise Meitner was a nuclear physicist',
                             np.nan])
    empty_categories = pd.Series(dtype=str, index=description.index)
    # The second and fourth inputs are all-NaN series of the same length.
    nan_second = pd.Series(np.full(len(description), np.nan))
    nan_fourth = pd.Series(np.full(len(description), np.nan))

    subfield = _build_physics_subfield(
        empty_categories, nan_second, description, nan_fourth, search_terms)
    assert subfield.equals(expected_is_experimental_physicist_description)
    

def test_build_features_physics_subfield_comment(
    expected_is_astronomer_comment):
    """Subfield flags are checked when only the comment series has data.

    The comment series is passed as the fourth positional argument. The
    sibling tests pass categories, field and description in positions one to
    three, so position four is presumably the comment input.
    NOTE(review): confirm against the ``_build_physics_subfield`` signature.
    Previously the comment series was passed third, the position used by the
    description test, so this test did not exercise a distinct input.
    """
    comment = pd.Series(['Antony Hewish was an astronomer',
                         'Jocelyn Bell Burnell is an Astrophysicist',
                         np.nan])
    assert _build_physics_subfield(
        pd.Series(dtype=str, index=comment.index),
        pd.Series(np.full(len(comment), np.nan)),
        pd.Series(np.full(len(comment), np.nan)),
        comment,
        {'categories': 'astronomers',
         'others': 'astronom'}).equals(
        expected_is_astronomer_comment)
    
    
def test_build_features_num_laureates(expected_num_laureates):
    """Laureate counts are checked for pipe-separated, single and NaN advisors."""
    advisors = pd.Series([
        'J. J. Thomson|William Henry Bragg',
        'Joe Bloggs',
        np.nan,
    ])
    laureate_counts = _build_num_laureates(
        advisors, nobel_physicists.Laureate, nobel_physicists.name)
    assert laureate_counts.equals(expected_num_laureates)
    

def test_build_features_num_laureates_name(expected_num_laureates_name):
    """Laureate counts are checked for two spouse names against the chemists."""
    spouse_names = pd.Series(['Marie Skłodowska-Curie', 'Pierre Curie'])
    laureate_counts = _build_num_laureates(
        spouse_names, nobel_chemists.Laureate, nobel_chemists.name)
    assert laureate_counts.equals(expected_num_laureates_name)

    
def test_build_features_places_codes(expected_places_codes):
    """Country codes are checked for single, NaN, multi-place and unknown cells."""
    # Assembled with join only to keep the source lines short; the resulting
    # cell is one pipe-separated string like the others.
    many_institutions = '|'.join([
        'California Institute of Technology',
        'Institute for Advanced Study',
        'Kharkiv Polytechnic Institute',
        'Leipzig University',
        'Los Angeles Bureau of Power and Light',
        'University of Cincinnati',
        'Xavier University',
    ])
    place_cells = pd.Series([
        'Copenhagen',
        np.nan,
        'Jhang|Presidencies and provinces of British India|Punjab Province (British India)',
        'Elisabethpol Governorate|Ganja, Azerbaijan|Russian Empire|Tbilisi',
        many_institutions,
        'No possible chance of being found in places dataframe',
    ])

    codes = _build_places_codes(
        place_cells, places.fullName, places.countryAlpha3Code)
    assert codes.equals(expected_places_codes)
    
    
def test_get_alma_mater_or_workplaces_endswith_comma_oxford():
    """A place ending in ', Oxford' is expected back as 'University of Oxford'."""
    cell = 'Harvard University|Pembroke College, Oxford'
    found = _get_alma_mater_or_workplaces(cell)
    assert found == ['Harvard University', 'University of Oxford']
    
    
def test_get_alma_mater_or_workplaces_ends_with_comma_oxford():
    """A place ending in ', Cambridge' is expected back as 'University of Cambridge'.

    Despite the function name, the input used here ends in ', Cambridge',
    not ', Oxford'.
    """
    cell = 'Pembroke College, Cambridge|University of Bristol'
    found = _get_alma_mater_or_workplaces(cell)
    assert found == ['University of Bristol', 'University of Cambridge']
    

def test_get_alma_mater_or_workplaces_duplicate():
    """A college and its university in one cell are expected to give one entry."""
    cell = 'Pembroke College, Cambridge|University of Cambridge'
    found = _get_alma_mater_or_workplaces(cell)
    assert found == ['University of Cambridge']


def test_get_alma_mater_or_workplaces_no_oxford_cambridge():
    """A single place without an Oxford/Cambridge suffix is expected back as is."""
    cell = 'University of Dublin'
    found = _get_alma_mater_or_workplaces(cell)
    assert found == ['University of Dublin']


def test_get_alma_mater_or_workplaces_float():
    """A NaN (float) cell is expected to give an empty list."""
    missing_cell = np.nan
    found = _get_alma_mater_or_workplaces(missing_cell)
    assert found == []
    

def test_build_features_citizenship_citizenship(
        expected_citizenship_codes_citizenship):
    """Citizenship features are checked when only ``citizenship`` has data."""
    rows = [
        ['American', np.nan, np.nan],
        ['United States', np.nan, np.nan],
        ['British nationality law|Citizenship of United States|Nazi Germany',
         np.nan, np.nan],
    ]
    physicists = pd.DataFrame(
        rows, columns=['citizenship', 'nationality', 'description'])

    # The helper fills this initially empty frame in place.
    features = pd.DataFrame()
    _build_citizenship_features(features, physicists, nationalities)
    assert features.equals(expected_citizenship_codes_citizenship)
    

def test_build_features_citizenship_nationality(
        expected_citizenship_codes_nationality):
    """Citizenship features are checked when only ``nationality`` has data."""
    rows = [
        [np.nan, 'Indian people', np.nan],
        [np.nan, 'Soviet Union', np.nan],
        [np.nan, 'Iranian Americans|No chance in hell', np.nan],
    ]
    physicists = pd.DataFrame(
        rows, columns=['citizenship', 'nationality', 'description'])

    # The helper fills this initially empty frame in place.
    features = pd.DataFrame()
    _build_citizenship_features(features, physicists, nationalities)
    assert features.equals(expected_citizenship_codes_nationality)
    
    
def test_build_features_citizenship_description(
        expected_citizenship_codes_description):
    """Citizenship features are checked when only ``description`` has data."""
    rows = [
        [np.nan, np.nan, 'Scottish physicist'],
        [np.nan, np.nan, 'British physicist and discovered the atom.'],
        [np.nan, np.nan,
         'German-American physicist and founder of relativity theory.'],
    ]
    physicists = pd.DataFrame(
        rows, columns=['citizenship', 'nationality', 'description'])

    # The helper fills this initially empty frame in place.
    features = pd.DataFrame()
    _build_citizenship_features(features, physicists, nationalities)
    assert features.equals(expected_citizenship_codes_description)
    
    
def test_build_features_citizenship_multiple(
        expected_citizenship_codes_multiple):
    """Citizenship features are checked when several source columns have data."""
    rows = [
        ['Australia', 'Australian-British theoretical physicist', np.nan],
        ['British', np.nan, 'French born mathematician'],
        [np.nan, 'Soviet Union', 'Soviet-Ukrainian optical physicist'],
        ['Colombian',
         'Colombian-Spanish',
         'Colombian born physicist living in the United States of America'],
    ]
    physicists = pd.DataFrame(
        rows, columns=['citizenship', 'nationality', 'description'])

    # The helper fills this initially empty frame in place.
    features = pd.DataFrame()
    _build_citizenship_features(features, physicists, nationalities)
    assert features.equals(expected_citizenship_codes_multiple)
    
    
def test_build_features_citizenship_not_found(
        expected_citizenship_codes_not_found):
    """Citizenship features are checked for a row expected to match nothing."""
    rows = [
        [np.nan, 'Wakanda', 'physicist'],
    ]
    physicists = pd.DataFrame(
        rows, columns=['citizenship', 'nationality', 'description'])

    # The helper fills this initially empty frame in place.
    features = pd.DataFrame()
    _build_citizenship_features(features, physicists, nationalities)
    assert features.equals(expected_citizenship_codes_not_found)
    
    
def test_binarize_list_features_train_no_threshold(
        expected_binary_features_train):
    """Binarizing the train rows with ``presence_threshold=0.0`` is checked."""
    list_columns = [
        'birth_country_alpha_3_codes', 'birth_continent_codes',
        'residence_country_alpha_3_codes', 'residence_continent_codes',
        'alma_mater', 'alma_mater_country_alpha_3_codes',
        'alma_mater_continent_codes',
        'workplaces', 'workplaces_country_alpha_3_codes',
        'workplaces_continent_codes',
        'citizenship_country_alpha_3_codes', 'citizenship_continent_codes',
    ]
    rows = [
        [['GBR'], ['EU'],                                                   # birth
         ['GBR', 'USA'], ['EU', 'NA'],                                      # residence
         ['Columbia University', 'Harvard University'], ['USA'], ['NA'],    # alma mater
         ['Columbia University', 'Stanford University'], ['USA'], ['NA'],   # workplaces
         ['GBR', 'NZL', 'USA'], ['AU', 'EU', 'NA']],                        # citizenship
        [['JPN'], ['AS'],
         ['JPN'], ['AS'],
         ['Columbia University', 'Harvard University'], [], [],
         ['RIKEN'], ['JPN'], ['AS'],
         ['JPN'], ['AS']],
    ]
    features = pd.DataFrame(rows, columns=list_columns, index=[1, 2])

    binarized = binarize_list_features(features, presence_threshold=0.0)
    assert binarized.equals(expected_binary_features_train)

    
def test_binarize_list_features_train_threshold(
        expected_binary_features_train_threshold):
    """Binarizing the train rows with ``presence_threshold=1.0`` is checked."""
    list_columns = [
        'birth_country_alpha_3_codes', 'birth_continent_codes',
        'residence_country_alpha_3_codes', 'residence_continent_codes',
        'alma_mater', 'alma_mater_country_alpha_3_codes',
        'alma_mater_continent_codes',
        'workplaces', 'workplaces_country_alpha_3_codes',
        'workplaces_continent_codes',
        'citizenship_country_alpha_3_codes', 'citizenship_continent_codes',
    ]
    rows = [
        [['GBR'], ['EU'],                                                   # birth
         ['GBR', 'USA'], ['EU', 'NA'],                                      # residence
         ['Columbia University', 'Harvard University'], ['USA'], ['NA'],    # alma mater
         ['Columbia University', 'Stanford University'], ['USA'], ['NA'],   # workplaces
         ['GBR', 'NZL', 'USA'], ['AU', 'EU', 'NA']],                        # citizenship
        [['JPN'], ['AS'],
         ['JPN'], ['AS'],
         ['Columbia University', 'Harvard University'], [], [],
         ['RIKEN'], ['JPN'], ['AS'],
         ['JPN'], ['AS']],
    ]
    features = pd.DataFrame(rows, columns=list_columns, index=[1, 2])

    binarized = binarize_list_features(features, presence_threshold=1.0)
    assert binarized.equals(expected_binary_features_train_threshold)
    
    
def test_binarize_list_features_test(
        expected_binary_features_test, expected_binary_features_train):
    """Binarizing test rows against the train frame is checked.

    The rows contain entries ('XXX', 'XXX University', 'YYY University',
    'XX') that have no column in the train fixture. A second call adds
    ``presence_threshold=0.6`` and must give the same frame as the first.
    """
    list_columns = [
        'birth_country_alpha_3_codes', 'birth_continent_codes',
        'residence_country_alpha_3_codes', 'residence_continent_codes',
        'alma_mater', 'alma_mater_country_alpha_3_codes',
        'alma_mater_continent_codes',
        'workplaces', 'workplaces_country_alpha_3_codes',
        'workplaces_continent_codes',
        'citizenship_country_alpha_3_codes', 'citizenship_continent_codes',
    ]
    rows = [
        [['GBR', 'XXX'], ['EU'],                                            # birth
         ['GBR', 'USA'], ['EU', 'NA'],                                      # residence
         ['Columbia University', 'Harvard University', 'XXX University'],   # alma mater
         ['USA'], ['NA'],
         ['Columbia University', 'Stanford University'], ['USA'], ['NA'],   # workplaces
         ['GBR', 'NZL', 'USA'], ['AU', 'EU', 'NA']],                        # citizenship
        [['JPN'], ['AS'],
         ['JPN'], ['AS'],
         ['Columbia University', 'Harvard University', 'YYY University'],
         [], [],
         ['RIKEN'], ['JPN'], ['AS'],
         ['JPN'], ['AS', 'XX']],
    ]
    features = pd.DataFrame(rows, columns=list_columns, index=[1, 2])

    binarized = binarize_list_features(
        features, train_features=expected_binary_features_train)
    assert binarized.equals(expected_binary_features_test)

    binarized_with_threshold = binarize_list_features(
        features,
        train_features=expected_binary_features_train,
        presence_threshold=0.6)
    assert binarized_with_threshold.equals(binarized)

    
def test_binarize_list_features_pad_features(
        expected_binary_features_train,
        expected_binary_features_test_less_categories_than_train):
    """Binarizing with ``pad_features=True`` is checked on rows lacking GBR births."""
    list_columns = [
        'birth_country_alpha_3_codes', 'birth_continent_codes',
        'residence_country_alpha_3_codes', 'residence_continent_codes',
        'alma_mater', 'alma_mater_country_alpha_3_codes',
        'alma_mater_continent_codes',
        'workplaces', 'workplaces_country_alpha_3_codes',
        'workplaces_continent_codes',
        'citizenship_country_alpha_3_codes', 'citizenship_continent_codes',
    ]
    rows = [
        # GBR removed as birth country
        [[], ['EU'],                                                        # birth
         ['GBR', 'USA'], ['EU', 'NA'],                                      # residence
         ['Columbia University', 'Harvard University'], ['USA'], ['NA'],    # alma mater
         ['Columbia University', 'Stanford University'], ['USA'], ['NA'],   # workplaces
         ['GBR', 'NZL', 'USA'], ['AU', 'EU', 'NA']],                        # citizenship
        [['JPN'], ['AS'],
         ['JPN'], ['AS'],
         ['Columbia University', 'Harvard University'], [], [],
         ['RIKEN'], ['JPN'], ['AS'],
         ['JPN'], ['AS']],
    ]
    features = pd.DataFrame(rows, columns=list_columns, index=[1, 2])

    binarized = binarize_list_features(
        features,
        train_features=expected_binary_features_train,
        pad_features=True)
    assert binarized.equals(
        expected_binary_features_test_less_categories_than_train)

    
def test_binarize_list_features_train_test_cols_equal(
        expected_binary_features_train):
    """Binarizing the train rows against the train frame must reproduce it."""
    list_columns = [
        'birth_country_alpha_3_codes', 'birth_continent_codes',
        'residence_country_alpha_3_codes', 'residence_continent_codes',
        'alma_mater', 'alma_mater_country_alpha_3_codes',
        'alma_mater_continent_codes',
        'workplaces', 'workplaces_country_alpha_3_codes',
        'workplaces_continent_codes',
        'citizenship_country_alpha_3_codes', 'citizenship_continent_codes',
    ]
    rows = [
        [['GBR'], ['EU'],                                                   # birth
         ['GBR', 'USA'], ['EU', 'NA'],                                      # residence
         ['Columbia University', 'Harvard University'], ['USA'], ['NA'],    # alma mater
         ['Columbia University', 'Stanford University'], ['USA'], ['NA'],   # workplaces
         ['GBR', 'NZL', 'USA'], ['AU', 'EU', 'NA']],                        # citizenship
        [['JPN'], ['AS'],
         ['JPN'], ['AS'],
         ['Columbia University', 'Harvard University'], [], [],
         ['RIKEN'], ['JPN'], ['AS'],
         ['JPN'], ['AS']],
    ]
    features = pd.DataFrame(rows, columns=list_columns, index=[1, 2])

    binarized = binarize_list_features(
        features, train_features=expected_binary_features_train)
    assert binarized.equals(expected_binary_features_train)

    
def test_convert_counts_to_ratios_train(train_counts,
                                        expected_count_ratios_train):
    """Converting the training counts with no train frame supplied is checked."""
    ratios = convert_counts_to_ratios(train_counts, train_features=None)
    assert ratios.equals(expected_count_ratios_train)
    


def test_convert_counts_to_ratios_test(train_counts,
                                       expected_count_ratios_test):
    """Converting test counts against the training counts is checked.

    The input frame is built with inferred dtypes and only the count columns
    are then pinned to ``int64``. Passing ``dtype='int64'`` to the
    constructor cannot be honoured for the ``name`` and ``gender`` columns;
    older pandas silently ignored it for those columns, newer releases raise
    instead.
    """
    features = pd.DataFrame(
        [
            [3, 'Albert Einstein', 5, 'male', 1],
            [2, 'Emmy Noether', 0, 'female', 1],
            [1, 'Michael Green', 1, 'male', 0],
        ],
        columns=['count1', 'name', 'count2', 'gender', 'count3'])
    # Cast just the numeric columns; the text columns keep their inferred dtype.
    features = features.astype(
        {'count1': 'int64', 'count2': 'int64', 'count3': 'int64'})

    convert_features = convert_counts_to_ratios(
        features, train_features=train_counts)
    assert convert_features.equals(expected_count_ratios_test)
    


# Execute the tests defined above through ipytest, using the shared options.
ipytest.run_pytest(
    pytest_options=PYTEST_OPTIONS,
    filename='../../tests/test_3.0-build-features.ipynb',
)