In [1]:
import pandas as pd
import requests
import pickle

In [2]:
URL_DATA_THESES = 'https://www.data.gouv.fr/fr/datasets/r/eb06a4f5-a9f1-4775-8226-33425c933272'
df = pd.read_csv(URL_DATA_THESES)
df_theses = df[['nnt', 'auteurs.0.idref', 'auteurs.0.nom', 'auteurs.0.prenom', 
    'date_soutenance', 'ecoles_doctorales.0.nom', 'etablissements_soutenance.0.nom', 
    'sujets.en', 'sujets.fr', 'titres.fr', 'titres.en']]
df_theses.columns = ['nnt', 'auteur_idref', 'auteur_nom', 'auteur_prenom',
                    'date_soutenance', 'ed_nom', 'etab_nom',
                    'sujets_en', 'sujets_fr', 'titre_fr', 'titre_en']
pickle.dump(df_theses, open('df_theses.pkl', 'wb'))

In [2]:
df_theses = pickle.load(open('df_theses.pkl', 'rb'))

In [3]:
def get_authors_openalex(full_name):
    r = requests.get(f'https://api.openalex.org/autocomplete/authors?q={full_name}&author_hint=institution')
    results = r.json()['results']
    french, foreign, unknown = [], [], []
    for e in results:
        if e.get('display_name') != full_name:
            continue
        if e.get('hint') and ', France' in e.get('hint'):
            french.append(e)
        elif e.get('hint') is None:
            unknown.append(e)
        else:
            foreign.append(e)
    return {'french': french, 'foreign': foreign, 'unknown': unknown}

In [4]:
def get_author_works(author_id):
    r = requests.get(f'https://api.openalex.org/works?filter=author.id:{author_id}')
    results = r.json()['results']
    data = []
    for e in results:
        elt = {}
        for f in ['id', 'doi', 'authorships', 'publication_year', 'title']:
            elt[f] = e[f]
        data.append(elt)
    return data
    

In [5]:
ed="École doctorale Mathématiques, Sciences et Technologies de l'Information et de la Communication"
df_cermics = df_theses[df_theses.ed_nom.apply(lambda x:ed in str(x))]

In [6]:
EXCLUDE_NAME = ['Nguyen', 'Wang', 'Zhang', 'Li', 'Liu', 'Chen', 'Luo',
                'Kim', 'Tran', 'Lee', 'Yang', 'Wu', 'Zhao', 'Sun', 'Peng']

In [91]:
def get_potential(full_name, input_data, min_publication_year_credible):
    potentials = []
    # noms trop communs
    for e in EXCLUDE_NAME:
        if e.lower() in full_name.lower().split(' '):
            return potentials
    # auteurs avec ce full name dans OpenAlex    
    data_openalex = get_authors_openalex(full_name)
    
    # pour chacun
    for d in data_openalex['foreign']:
        
        if d['works_count'] < 2:
            continue
        
        elt = input_data.copy()
        elt.update(d)
        elt['country'] = d['hint'].split(',')[-1].strip()
        country = d['hint'].split(',')[-1].lower()
        openalex_id = d['id']
        works = get_author_works(openalex_id.split('/')[-1])
        if works:
            first_publication_year = min([w['publication_year'] for w in works])
            elt['first_publication_year'] = first_publication_year
            # on vérifie si pas de publi avant une date crédible (par ex 4 ans avant la soutenance)
            if first_publication_year < min_publication_year_credible:
                continue
                
        #recent_works = [w for w in works if w['publication_year'] > 2020]
        
        skip = True
        affiliations, recent_affiliations = [], []
        countries, recent_countries = [], []
        all_works, recent_works = [], []
        
        for w in works:
            current_work = {
                'id': w['id'],
                'doi': w.get('doi'),
                'publication_year': w.get('publication_year'),
                'title': w.get('title')
            }
            for aut in w['authorships']:
                if 'display_name' not in aut['author']:
                    continue
                if isinstance(aut.get('raw_affiliation_string'), str) and len(aut['raw_affiliation_string']) > 2:
                    if aut['author']['display_name'] == d['display_name']:
                        current_work['country_code'] = []
                        current_work['raw_affiliation_string'] = aut['raw_affiliation_string']
                        affiliations.append(aut['raw_affiliation_string'])

                        if w['publication_year'] > 2020:
                            recent_affiliations.append(aut['raw_affiliation_string'])

                        for i in aut.get('institutions'):
                            if isinstance(i.get('country_code'), str) and len(i['country_code']) == 2:
                                countries.append(i['country_code'])
                                current_work['country_code'].append(i['country_code'])

                                if i['country_code'] != 'FR' and w['publication_year'] > 2020:
                                    recent_countries.append(i['country_code'])
            
            if current_work.get('raw_affiliation_string'):
                all_works.append(current_work)
                if w['publication_year'] > 2020:
                    recent_works.append(current_work)

        if recent_countries and 'FR' in countries:
            elt['affiliations'] = affiliations
            elt['recent_affiliations'] = recent_affiliations
            elt['countries'] = list(set(countries))
            elt['recent_countries'] = list(set(recent_countries))
            elt['recent_works'] = recent_works
            elt['works'] = all_works
            potentials.append(elt)
    
    return potentials
    
    
    

In [105]:
ix = 0
potential = []
for row in df_theses.sample(100, random_state=6).itertuples():
    year_these = int(row.nnt[0:4])
    
    full_name = f'{row.auteur_prenom} {row.auteur_nom}'
    
    if len(row.auteur_nom)<3:
        continue
    
    ix += 1
    if ix%100 == 0:
        print(ix, end=',')
    input_data = {'name': row.auteur_nom, 'first_name': row.auteur_prenom, 'nnt': row.nnt}    
    potential += get_potential(full_name = full_name, input_data = input_data,
                              min_publication_year_credible = year_these - 4)
    
    
        

In [106]:
len(df_theses)

429404

In [107]:
x = pd.DataFrame(potential)

In [108]:
x

Unnamed: 0,name,first_name,nnt,id,display_name,hint,cited_by_count,works_count,entity_type,external_id,country,first_publication_year,affiliations,recent_affiliations,countries,recent_countries,recent_works,works
0,Groussin,Mathieu,2013LYO10201,https://openalex.org/A2097997838,Mathieu Groussin,"Kiel University, Germany",2389,57,author,https://orcid.org/0000-0002-0942-7217,Germany,2011,[Department of Civil and Environmental Enginee...,"[Department of Biological Engineering, Massach...","[US, GB, FR]",[US],"[{'id': 'https://openalex.org/W3187959711', 'd...","[{'id': 'https://openalex.org/W3044459694', 'd..."
1,Salles,Nicolas,2013PA112164,https://openalex.org/A1988283114,Nicolas Salles,"National Center for Simulation, USA",35,18,author,https://orcid.org/0000-0003-3600-5754,USA,2013,"[CNR-IOM, Democritos National Simulation Cente...","[CNR, Istituto Officina dei Materiali, c/o SIS...","[US, FR]",[US],"[{'id': 'https://openalex.org/W4206307907', 'd...","[{'id': 'https://openalex.org/W3049616605', 'd..."
2,Agha,Mujtaba Hassan,2009INPT050G,https://openalex.org/A2746802932,Mujtaba Hassan Agha,National University of Sciences and Technology...,378,28,author,https://orcid.org/0000-0002-8384-6743,Pakistan,2009,"[Department of Mechanical Engineering, Capital...","[Department of Operations and Supply Chain, NU...","[FR, PK]",[PK],"[{'id': 'https://openalex.org/W4213437522', 'd...","[{'id': 'https://openalex.org/W2791336582', 'd..."
3,Trautmann,Lydie,2003PA05N089,https://openalex.org/A2101639231,Lydie Trautmann,"Oregon Health & Science University, USA",4714,84,author,https://orcid.org/0000-0002-3012-0009,USA,2000,"[University of Montreal, Laboratoire d'Immunol...","[Vaccine and Gene Therapy Institute, Oregon He...","[FR, US, CA]",[US],"[{'id': 'https://openalex.org/W4200622449', 'd...","[{'id': 'https://openalex.org/W2124114936', 'd..."


In [87]:
# nécessaire de vérifier avec le country matcher que le country (cible) est bien détecté à partir des affiliations récentes