# Build Features

## Setting up the Environment

An initialization step is needed to set up the environment:
- The locale needs to be set for all categories to the user’s default setting (typically specified in the LANG environment variable) to enable correct sorting of words with accents.

In [1]:
import locale
    
# Set every locale category to the user's default setting (the empty string
# selects it) so that locale.strxfrm can sort accented words correctly later.
locale.setlocale(locale.LC_ALL, '')

'English_United States.1252'

In [2]:
from datetime import datetime

import numpy as np
import pandas as pd

In [3]:
# Load the processed training set of physicists and preview its first rows.
train_physicists = pd.read_csv(
    '../data/processed/train-physicists-from-1901.csv')
train_physicists.head()

Unnamed: 0,abstract,academicAdvisor,almaMater,award,birthDate,birthName,birthPlace,categories,child,citizenship,...,surname,theorized,thesisTitle,thesisUrl,thesisYear,thumbnail,wasDerivedFrom,wikiPageID,wikiPageRevisionID,workplaces
0,Aage Niels Bohr (Danish: [ˈɔːʊ̯ə ˌnels ˈboɐ̯ˀ]...,,University of Copenhagen,Atoms for Peace Award|Dannie Heineman Prize fo...,1922-06-19,,Copenhagen,1922 births|2009 deaths|Atoms for Peace Award ...,,,...,,,Rotational States of Atomic Nuclei,,1954.0,http://commons.wikimedia.org/wiki/Special:File...,http://en.wikipedia.org/wiki/Aage_Bohr?oldid=7...,2201,727860526,Columbia University|Institute for Advanced Stu...
1,"Aaldert Hendrik Wapstra (24 April 1922, Utrech...",,,International Union of Pure and Applied Physics,1922-04-24,,,1922 births|2006 deaths|Delft University of Te...,,,...,Wapstra,,,,,http://commons.wikimedia.org/wiki/Special:File...,http://en.wikipedia.org/wiki/Aaldert_Wapstra?o...,11543116,717669775,
2,"Mohammad Abdus Salam NI, SPk, KBE (Punjabi, Ur...",Paul Taunton Matthews,Government College University (Lahore)|St John...,Adams Prize|Atoms for Peace Award|Copley Medal...,1926-01-29,,Jhang|Presidencies and provinces of British In...,1926 births|1996 deaths|Academics of Imperial ...,Anisa Bushra Salam Bajwa|Aziza Rahman,,...,,W and Z bosons,Renormalisation of Quantum Field Theory,,1951.0,http://commons.wikimedia.org/wiki/Special:File...,http://en.wikipedia.org/wiki/Abdus_Salam?oldid...,304427,742242509,PAECSUPARCOPINSTECHPunjab UniversityImperial C...
3,"Abraham Isahakovich Alikhanov (Alikhanian, Arm...",,Peter the Great St. Petersburg Polytechnic Uni...,Hero of Socialist Labour|USSR State Prize,1904-02-20,,"Elisabethpol Governorate|Ganja, Azerbaijan|Rus...",1904 births|1970 deaths|20th-century physicist...,,,...,Alikhanov,,,,,,http://en.wikipedia.org/wiki/Abraham_Alikhanov...,18594585,736856447,Institute for Theoretical and Experimental Phy...
4,"Abraham Haskel Taub (February 1, 1911 – August...",,Princeton University,,1911-02-01,,Chicago,1911 births|1999 deaths|20th-century American ...,,,...,,,,,,http://commons.wikimedia.org/wiki/Special:File...,http://en.wikipedia.org/wiki/Abraham_H._Taub?o...,2402789,742301006,


In [4]:
# Load the raw list of Nobel Prize in Physics laureates and preview it.
nobel_physicists = pd.read_csv(
    '../data/raw/nobel-physics-prize-laureates.csv')
nobel_physicists.head()

Unnamed: 0,Year,Laureate,Country,Rationale
0,1901,Wilhelm Röntgen,German Empire,"""in recognition of the extraordinary services ..."
1,1902,Hendrik Lorentz,Netherlands,"""in recognition of the extraordinary service t..."
2,1902,Pieter Zeeman,Netherlands,"""in recognition of the extraordinary service t..."
3,1903,Henri Becquerel,France,"""for his discovery of spontaneous radioactivity"""
4,1903,Pierre Curie,France,"""for their joint researches on the radiation p..."


In [5]:
# Columns kept for the laureate tables after merging in the physicists data.
nobel_columns = ['Year', 'Laureate', 'name', 'Country', 'Rationale']
# Left join on Laureate == fullName so that every laureate row is kept; this
# attaches the '|'-separated alternative names held in the `name` column.
# Laureates without a matching physicist get NaN in `name`.
nobel_physicists = pd.merge(nobel_physicists, train_physicists, how = 'left', left_on = 'Laureate',
                            right_on = 'fullName')[nobel_columns]
nobel_physicists.head()

Unnamed: 0,Year,Laureate,name,Country,Rationale
0,1901,Wilhelm Röntgen,Wilhelm Röntgen,German Empire,"""in recognition of the extraordinary services ..."
1,1902,Hendrik Lorentz,Hendrik Antoon Lorentz|Hendrik Lorentz,Netherlands,"""in recognition of the extraordinary service t..."
2,1902,Pieter Zeeman,Pieter Zeeman,Netherlands,"""in recognition of the extraordinary service t..."
3,1903,Henri Becquerel,Antoine Henri Becquerel|Henri Becquerel,France,"""for his discovery of spontaneous radioactivity"""
4,1903,Pierre Curie,Pierre Curie,France,"""for their joint researches on the radiation p..."


In [6]:
# Load the raw list of Nobel Prize in Chemistry laureates and preview it.
nobel_chemists = pd.read_csv(
    '../data/raw/nobel-chemistry-prize-laureates.csv')
nobel_chemists.head()

Unnamed: 0,Year,Laureate,Country,Rationale
0,1901,Jacobus Henricus van 't Hoff,Netherlands,"""[for his] discovery of the laws of chemical d..."
1,1902,Emil Fischer,German Empire,"""[for] his work on sugar and purine syntheses"""
2,1903,Svante Arrhenius,Sweden,"""[for] his electrolytic theory of dissociation"""
3,1904,William Ramsay,United Kingdom,"""[for his] discovery of the inert gaseous elem..."
4,1905,Adolf von Baeyer,German Empire,"""[for] the advancement of organic chemistry an..."


In [7]:
# Same left join as for the physics laureates: keep every chemistry laureate
# and attach the alternative names from the physicists data where available.
# NOTE(review): the join is against the *physicists* table, so chemists who
# are not also listed there end up with NaN in `name` - confirm this is
# intended.
nobel_chemists = pd.merge(nobel_chemists, train_physicists, how = 'left', left_on = 'Laureate',
                          right_on = 'fullName')[nobel_columns]
nobel_chemists.head()

Unnamed: 0,Year,Laureate,name,Country,Rationale
0,1901,Jacobus Henricus van 't Hoff,,Netherlands,"""[for his] discovery of the laws of chemical d..."
1,1902,Emil Fischer,,German Empire,"""[for] his work on sugar and purine syntheses"""
2,1903,Svante Arrhenius,,Sweden,"""[for] his electrolytic theory of dissociation"""
3,1904,William Ramsay,,United Kingdom,"""[for his] discovery of the inert gaseous elem..."
4,1905,Adolf von Baeyer,,German Empire,"""[for] the advancement of organic chemistry an..."


In [8]:
# Load the processed places table (country / continent codes per place) and
# preview its first rows.
places = pd.read_csv('../data/processed/places.csv')
places.head()

Unnamed: 0,abstract,categories,city,comment,continentCode,continentName,country,countryAlpha2Code,countryAlpha3Code,countryName,...,lat,long,name,resource,source,thumbnail,type,wasDerivedFrom,wikiPageID,wikiPageRevisionID
0,The 13th arrondissement of Paris (also known a...,13th arrondissement of Paris|Chinatowns in Eur...,,The 13th arrondissement of Paris (also known a...,EU,Europe,France,FR,FRA,France,...,48.832222,2.355556,13th arrondissement ofParis,http://dbpedia.org/resource/13th_arrondissemen...,http://dbpedia.org/data/13th_arrondissement_of...,http://commons.wikimedia.org/wiki/Special:File...,Municipal arrondissements of France,http://en.wikipedia.org/wiki/13th_arrondisseme...,295543.0,741918596.0
1,"The 3M Company, formerly known as the Minnesot...",1902 establishments in Minnesota|3M|Companies ...,,"The 3M Company, formerly known as the Minnesot...",,,,,,,...,,,3M Company,http://dbpedia.org/resource/3M,http://dbpedia.org/data/3M.json,http://commons.wikimedia.org/wiki/Special:File...,Public company,http://en.wikipedia.org/wiki/3M?oldid=744057099,7664801.0,744057099.0
2,The 5th arrondissement of Paris (also known as...,5th arrondissement of Paris,,The 5th arrondissement of Paris (also known as...,EU,Europe,France,FR,FRA,France,...,48.847221,2.344445,5th arrondissement ofParis,http://dbpedia.org/resource/5th_arrondissement...,http://dbpedia.org/data/5th_arrondissement_of_...,http://commons.wikimedia.org/wiki/Special:File...,Municipal arrondissements of France,http://en.wikipedia.org/wiki/5th_arrondissemen...,89841.0,738284348.0
3,Aachen (German pronunciation: [ˈʔaːxn̩] ) or B...,Aachen|Aachen (district)|Articles including re...,,Aachen (German pronunciation: [ˈʔaːxn̩] ) or B...,EU,Europe,Germany,DE,DEU,Germany,...,50.783333,6.083333,Aachen,http://dbpedia.org/resource/Aachen,http://dbpedia.org/data/Aachen.json,http://commons.wikimedia.org/wiki/Special:File...,,http://en.wikipedia.org/wiki/Aachen?oldid=7449...,1520.0,744991843.0
4,"Aarhus University (Danish: Aarhus Universitet,...",1928 establishments in Denmark|Aarhus Universi...,Aarhus,"Aarhus University (Danish: Aarhus Universitet,...",EU,Europe,Denmark,DK,DNK,Denmark,...,56.166668,10.2,Aarhus Universitet|Aarhus University,http://dbpedia.org/resource/Aarhus_University,http://dbpedia.org/data/Aarhus_University.json,http://commons.wikimedia.org/wiki/Special:File...,Public university,http://en.wikipedia.org/wiki/Aarhus_University...,401280.0,743897453.0


In [11]:
def build_features(physicists, nobel_physicists, nobel_chemists, places):
    """Build the features dataframe for a set of physicists.

    Args:
        physicists: Dataframe of physicists. Besides `fullName` and `name`
            it must provide the columns read by the `_build_*` helpers
            (`gender`, `birthDate`, `deathDate`, `categories`, ...).
        nobel_physicists: Dataframe of physics laureates with `Laureate`
            and `name` columns.
        nobel_chemists: Dataframe of chemistry laureates with `Laureate`
            and `name` columns.
        places: Dataframe of places with `fullName`, `countryAlpha2Code`
            and `continentCode` columns.

    Returns:
        A new dataframe holding `fullName`, `name` and one column per
        generated feature.
    """
    # Select the two identifying columns first and copy only those: this
    # avoids duplicating the whole (wide) input frame and guarantees that the
    # column assignments below write to an independent dataframe.
    features = physicists[['fullName', 'name']].copy()
    features['gender'] = _build_gender(physicists.gender)
    features['years_lived'] = _build_years_lived(physicists.birthDate,
                                                 physicists.deathDate)

    # The helpers below add their columns to `features` in place.
    _build_physics_subfield_features(features, physicists)
    _build_num_laureates_features(features, physicists, nobel_physicists,
                                  nobel_chemists)

    _build_places_features(features, physicists, places)

    return features


def _build_physics_subfield_features(features, physicists):
    """Add one physics-subfield indicator column per subfield to `features`.

    Each column is produced by `_build_physics_subfield` from the
    `categories`, `field`, `description` and `comment` columns of
    `physicists`. `features` is modified in place.
    """
    # (feature column, search terms) in the order the columns are added.
    subfields = (
        ('theoretical_physicist', {'categories': 'Theoretical physicists',
                                   'others': 'theoretical physic'}),
        ('experimental_physicist', {'categories': 'Experimental physicists',
                                    'others': 'experimental physic'}),
        ('astronomer', {'categories': 'astronomers',
                        'others': 'astronom'}),
    )
    for feature_name, terms in subfields:
        features[feature_name] = _build_physics_subfield(
            physicists.categories, physicists.field,
            physicists.description, physicists.comment,
            search_terms=terms)


def _build_num_laureates_features(features, physicists, nobel_physicists,
                                  nobel_chemists):
    """Add the laureate-count columns to `features` (modified in place).

    For each relationship column of `physicists` (advisors, students,
    children, spouses, influences) one count is added for physics laureates
    and one for chemistry laureates, using `_build_num_laureates`.
    """
    # (feature column, physicists column, laureates frame), listed in the
    # order in which the columns are added to `features`.
    feature_specs = (
        ('num_physics_laureate_academic_advisors', 'academicAdvisor',
         nobel_physicists),
        ('num_chemistry_laureate_academic_advisors', 'academicAdvisor',
         nobel_chemists),
        ('num_physics_laureate_doctoral_advisors', 'doctoralAdvisor',
         nobel_physicists),
        ('num_chemistry_laureate_doctoral_advisors', 'doctoralAdvisor',
         nobel_chemists),
        ('num_physics_laureate_doctoral_students', 'doctoralStudent',
         nobel_physicists),
        ('num_chemistry_laureate_doctoral_students', 'doctoralStudent',
         nobel_chemists),
        ('num_physics_laureate_notable_students', 'notableStudent',
         nobel_physicists),
        ('num_chemistry_laureate_notable_students', 'notableStudent',
         nobel_chemists),
        ('num_physics_laureate_children', 'child', nobel_physicists),
        ('num_chemistry_laureate_children', 'child', nobel_chemists),
        ('num_physics_laureate_spouses', 'spouse', nobel_physicists),
        ('num_chemistry_laureate_spouses', 'spouse', nobel_chemists),
        ('num_physics_laureate_influenced', 'influenced', nobel_physicists),
        ('num_chemistry_laureate_influenced', 'influenced', nobel_chemists),
        ('num_physics_laureate_influenced_by', 'influencedBy',
         nobel_physicists),
        ('num_chemistry_laureate_influenced_by', 'influencedBy',
         nobel_chemists),
    )
    for feature_name, source_column, laureates in feature_specs:
        features[feature_name] = _build_num_laureates(
            getattr(physicists, source_column),
            laureates.Laureate, laureates.name)
        if feature_name == 'num_chemistry_laureate_children':
            # Impute missing values right after both children columns exist;
            # this also adds the two *_parents columns at this position.
            _impute_num_laureate_children_parents(features)

    
def _build_places_features(features, physicists, places):
    """Add birth/death country and continent code columns to `features`.

    For each combination of place column (birth, death) and code column
    (country alpha-2, continent) two columns are added in place: the codes
    themselves and, prefixed with `num_`, how many codes there are.
    """
    # (codes feature column, physicists column, places code column), in the
    # order in which the columns are added to `features`.
    place_specs = (
        ('birth_country_alpha_2_codes', 'birthPlace', 'countryAlpha2Code'),
        ('birth_continent_codes', 'birthPlace', 'continentCode'),
        ('death_country_alpha_2_codes', 'deathPlace', 'countryAlpha2Code'),
        ('death_continent_codes', 'deathPlace', 'continentCode'),
    )
    for codes_column, place_column, code_column in place_specs:
        features[codes_column] = _build_places_codes(
            getattr(physicists, place_column), places.fullName,
            getattr(places, code_column))
        features['num_' + codes_column] = _build_num_places_codes(
            features[codes_column])



def _build_gender(gender):
    """Encode gender labels as numbers: 'male' -> 1, 'female' -> 0.

    Labels outside the mapping become NaN (the behaviour of `Series.map`).
    """
    encoding = {'male': 1, 'female': 0}
    return gender.map(encoding)


def _build_years_lived(birth_date, death_date):
    """Compute the number of whole years between birth and death.

    Args:
        birth_date: Series of 'YYYY-MM-DD' strings; missing values are
            replaced by today's date.
        death_date: Series of 'YYYY-MM-DD' strings; missing values are
            replaced by today's date, which yields the age reached so far.

    Returns:
        An int64 series with the elapsed years, truncated towards zero.
    """
    today = pd.Timestamp.now().normalize()
    born = pd.to_datetime(birth_date, format='%Y-%m-%d').fillna(today)
    died = pd.to_datetime(death_date, format='%Y-%m-%d').fillna(today)
    # The ambiguous 'Y' unit of `pd.to_timedelta` is no longer accepted by
    # pandas; spell out the mean Gregorian year (365.2425 days) that it
    # stood for.
    one_year = pd.Timedelta(days=365.2425)
    return ((died - born) / one_year).astype('int64')


def _build_physics_subfield(categories, field, description, comment, search_terms):
    """Flag the physicists whose record mentions a given physics subfield.

    Args:
        categories: Series of category strings, searched case-sensitively
            for `search_terms['categories']`.
        field: Series of field strings.
        description: Series of description strings.
        comment: Series of comment strings.
        search_terms: Dict with the keys 'categories' and 'others'. The
            'others' term is searched in the lower-cased `field`,
            `description` and `comment` texts.

    Returns:
        An int64 series holding 1 where any of the four columns contains its
        search term and 0 otherwise.
    """
    category_term = search_terms['categories']
    other_term = search_terms['others']

    def mentions(series, term, ignore_case):
        def has_term(text):
            # Missing values are not strings (e.g. float NaN): no match.
            if not isinstance(text, str):
                return False
            return term in (text.lower() if ignore_case else text)
        return series.apply(has_term).astype(bool)

    in_categories = mentions(categories, category_term, ignore_case=False)
    in_field = mentions(field, other_term, ignore_case=True)
    in_description = mentions(description, other_term, ignore_case=True)
    # Search the comment column itself, not the description a second time.
    in_comment = mentions(comment, other_term, ignore_case=True)

    return (in_categories | in_field | in_description |
            in_comment).astype('int64')



def _build_num_laureates(series, laureates, names):
    """Count, for each cell of `series`, the Nobel laureates it names.

    The laureates found in a cell are determined by `_get_nobel_laureates`
    from `laureates` and the alternative `names`.
    """
    def count_laureates(cell):
        return len(_get_nobel_laureates(cell, laureates, names))

    return series.apply(count_laureates)


def _build_places_codes(places_in_physicists, full_name_in_places, places_codes):
    """Map each cell of places to the codes found by `_get_places_codes`."""
    def lookup_codes(cell):
        return _get_places_codes(cell, full_name_in_places, places_codes)

    return places_in_physicists.apply(lookup_codes)


def _build_num_places_codes(places_codes):
    """Count the codes held in each cell of a place-codes series."""
    def count_codes(cell):
        # A float cell is the NaN placeholder meaning "no codes".
        if isinstance(cell, float):
            return 0
        return len(cell)

    return places_codes.apply(count_codes)


def _impute_num_laureate_children_parents(features):
    """Impute the parent/child laureate counts of `features` in place.

    Adds the `num_physics_laureate_parents` and
    `num_chemistry_laureate_parents` columns (0 by default) and sets the
    physics parent/child counts to 1 for the physicists listed below.

    Source:
    https://www.nobelprize.org/prizes/facts/facts-on-the-nobel-prize-in-physics/
    """
    features['num_physics_laureate_parents'] = 0
    features['num_chemistry_laureate_parents'] = 0  # none for chemistry

    # Laureates whose child is also a physics laureate.
    laureates_with_laureate_child = [
        'William Henry Bragg',  # father of Lawrence Bragg (both 1915)
        'J. J. Thomson',  # 1906, father of George Paget Thomson (1937)
    ]
    # Laureates whose parent is also a physics laureate.
    laureates_with_laureate_parent = [
        'William Lawrence Bragg',  # son of William Bragg (both 1915)
        'Aage Bohr',  # 1975, son of Niels Bohr (1922)
        'Kai Siegbahn',  # 1981, son of Manne Siegbahn (1924)
        'George Paget Thomson',  # 1937, son of J. J. Thomson (1906)
    ]

    has_laureate_child = features.fullName.isin(laureates_with_laureate_child)
    features.loc[has_laureate_child, 'num_physics_laureate_children'] = 1
    has_laureate_parent = features.fullName.isin(
        laureates_with_laureate_parent)
    features.loc[has_laureate_parent, 'num_physics_laureate_parents'] = 1



def _get_nobel_laureates(cell, laureates, names):
    """Return the Nobel laureates named in a '|'-separated cell.

    Two names are considered the same when they differ only by hyphens
    (treated as spaces) or by whitespace at the front or end. This
    normalisation is applied to the cell values as well as to the laureate
    names, so that hyphenated laureate names can be matched too.

    Args:
        cell: '|'-separated names, or a missing value (any non-string).
        laureates: Iterable of laureate names.
        names: Iterable of '|'-separated alternative laureate names; missing
            (non-string) entries are ignored.

    Returns:
        A sorted list of the distinct normalised names from `cell` that
        match a laureate name or an alternative name; an empty list when
        `cell` is missing.
    """
    if not isinstance(cell, str):
        return []

    def normalize(name):
        return name.replace('-', ' ').strip()

    known_names = {normalize(laureate) for laureate in laureates
                   if isinstance(laureate, str)}
    # Flatten the alternative laureate names; skipping non-strings keeps a
    # missing value from turning into the matchable string 'nan'.
    for alternatives in names:
        if isinstance(alternatives, str):
            known_names.update(normalize(alternative)
                               for alternative in alternatives.split('|'))
    known_names.discard('')

    cell_names = {normalize(value) for value in cell.split('|')}
    return sorted(cell_names & known_names)

    
def _get_places_codes(cell, full_name_in_places, places_codes):
    """Look up the codes of the places named in a '|'-separated cell.

    Args:
        cell: '|'-separated place names, or a missing value (any
            non-string).
        full_name_in_places: Series of place names.
        places_codes: Series of codes sharing the index of
            `full_name_in_places`.

    Returns:
        The distinct codes of the matched places as a list sorted with
        `locale.strxfrm` (the current locale's collation), or NaN when the
        cell is missing or none of its places has a code.

    Raises:
        AssertionError: If a place name occurs more than once in
            `full_name_in_places`.
    """
    if not isinstance(cell, str):
        return np.nan

    codes = set()
    for place in cell.split('|'):
        matches = full_name_in_places[full_name_in_places == place].index
        if len(matches) > 1:
            # Raised explicitly so that the check survives `python -O`.
            raise AssertionError(
                'place name is not unique: {!r}'.format(place))
        if len(matches) == 1:
            code = places_codes.loc[matches[0]]
            # Places without a code hold NaN, which is not a string.
            if isinstance(code, str):
                codes.add(code)

    if not codes:
        return np.nan
    return sorted(codes, key=locale.strxfrm)
    

def _date_no_nan(date):
    """Parse a 'YYYY-MM-DD' string into a date, defaulting to today.

    Any value that is not a string (e.g. a float NaN) is treated as missing
    and replaced by the current date.
    """
    if not isinstance(date, str):
        return datetime.now().date()
    parsed = datetime.strptime(date, '%Y-%m-%d')
    return parsed.date()

In [12]:
# Build the features for the training physicists from the laureate and
# places tables prepared above, then preview the first rows.
train_physicists_features = build_features(train_physicists, nobel_physicists,
                                           nobel_chemists, places)
train_physicists_features.head()

Unnamed: 0,fullName,name,gender,years_lived,theoretical_physicist,experimental_physicist,astronomer,num_physics_laureate_academic_advisors,num_chemistry_laureate_academic_advisors,num_physics_laureate_doctoral_advisors,...,num_physics_laureate_influenced_by,num_chemistry_laureate_influenced_by,birth_country_alpha_2_codes,num_birth_country_alpha_2_codes,birth_continent_codes,num_birth_continent_codes,death_country_alpha_2_codes,num_death_country_alpha_2_codes,death_continent_codes,num_death_continent_codes
0,Aage Bohr,Aage Niels Bohr,1,87,0,0,0,0,0,0,...,0,0,[DK],1,[EU],1,[DK],1,[EU],1
1,Aaldert Wapstra,Aaldert Hendrik Wapstra|Aaldert Wapstra,1,84,0,0,0,0,0,0,...,0,0,,0,,0,[NL],1,[EU],1
2,Abdus Salam,Abdus Salam|Mohammad Abdus Salam,1,70,1,0,0,0,0,0,...,0,0,[PK],1,[AS],1,[GB],1,[EU],1
3,Abraham Alikhanov,Abraham Alikhanov,1,66,0,0,0,0,0,0,...,0,0,"[AZ, GE, RU]",3,"[AS, EU]",2,[RU],1,[EU],1
4,Abraham H. Taub,Abraham H. Taub|Abraham Taub,1,88,0,0,0,0,0,0,...,0,0,[US],1,,0,,0,,0
