# Introduction

Some very rudimentary snippets of code illustrating how to load data from the MENTORSHIP dataset. The data are described in Ke, Q., Liang, L., Ding, Y. et al. A dataset of mentorship in bioscience with semantic and demographic estimations. Sci Data 9, 467 (2022). https://doi.org/10.1038/s41597-022-01578-x

In [2]:
# load required libraries
import pickle
import pathlib
from os import path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance

In [6]:
# Root folder that the data paths in this notebook are built from. The path is
# relative, so it resolves against the kernel's current working directory.
home_dir = pathlib.Path('../')

# Load information about researchers and their mentoring relationships

In [7]:
# Load the two MENTORSHIP tables: `connect` is the mentorship table
# (MenteeID/MentorID pairs) and `people` is the researcher table (keyed by PID).
connect = pd.read_csv(home_dir.joinpath('data', 'MENTORSHIP', 'mentorship.csv'))
people = pd.read_csv(home_dir.joinpath('data', 'MENTORSHIP', 'researcher.csv'))

In [8]:
# Show which fields the mentorship table provides.
connect.columns

Index(['CID', 'MenteeID', 'MentorID', 'MentorshipType', 'Institution',
       'InstitutionMAGID', 'StartYear', 'StopYear'],
      dtype='object')

In [9]:
# Show which fields the researcher table provides.
people.columns

Index(['PID', 'FirstName', 'MiddleName', 'LastName', 'Institution',
       'InstitutionMAGID', 'ResearchArea', 'ORCID', 'MAGAuthorID'],
      dtype='object')

In [10]:
# Keep only a subset of columns in each table. This drops InstitutionMAGID and
# StartYear from `connect`, and InstitutionMAGID, ORCID and MAGAuthorID from `people`.
connect = connect.loc[
    :,
    ['CID', 'MenteeID', 'MentorID', 'MentorshipType', 'Institution', 'StopYear'],
]
people = people.loc[
    :,
    ['PID', 'FirstName', 'MiddleName', 'LastName', 'Institution', 'ResearchArea'],
]

Link names to connections

In [11]:
# Attach researcher details to every mentorship row, once for the mentee and
# once for the mentor. The first merge adds the mentee's columns without a
# suffix; in the second merge those names collide with the mentor's columns,
# so pandas appends `_t` to the left (mentee/trainee) side and `_m` to the
# right (mentor) side. Inner joins drop rows whose mentee or mentor ID has no
# match in `people`.
connect_names = (
    connect
    .merge(
        people[['PID', 'FirstName', 'MiddleName', 'LastName', 'ResearchArea']],
        how='inner',
        left_on='MenteeID',
        right_on='PID',
    )
    .merge(
        people[['PID', 'FirstName', 'MiddleName', 'LastName', 'ResearchArea']],
        how='inner',
        left_on='MentorID',
        right_on='PID',
        suffixes=['_t', '_m'],
    )
)

connect_names.head()

Unnamed: 0,CID,MenteeID,MentorID,MentorshipType,Institution,StopYear,PID_t,FirstName_t,MiddleName_t,LastName_t,ResearchArea_t,PID_m,FirstName_m,MiddleName_m,LastName_m,ResearchArea_m
0,2,2,3,1,"University of California, Berkeley",2005,2,BENJAMIN,Y,HAYDEN,neuro,3,JACK,L,GALLANT,"neuro,psych"
1,3,4,3,2,"University of California, Berkeley",2006,4,BENJAMIN,,WILLMORE,neuro,3,JACK,L,GALLANT,"neuro,psych"
2,5,6,3,1,"University of California, Berkeley",2008,6,RYAN,,PRENGER,neuro,3,JACK,L,GALLANT,"neuro,psych"
3,17,27,3,1,"University of California, Berkeley",-1,27,JOSEPH,P,ROGERS,neuro,3,JACK,L,GALLANT,"neuro,psych"
4,18,28,3,2,"University of California, Berkeley",-1,28,RACHEL,,SHOUP,neuro,3,JACK,L,GALLANT,"neuro,psych"


# Find a researcher and their trainees

Simple exact string match on first and last name (the names shown above are stored in upper case).

In [12]:
# Exact, case-sensitive match on both first and last name.
row = people.loc[(people.FirstName=='ERIC') & (people.LastName=='KANDEL')]

# Without this guard an unmatched name surfaces as an opaque
# "index 0 is out of bounds" error from `.values[0]`. The exception type is
# kept as IndexError, but the message now says what actually went wrong.
if row.empty:
    raise IndexError("no researcher found with FirstName='ERIC' and LastName='KANDEL'")

# If several researchers share this name, the first matching row is used.
PID = row.PID.values[0]
print(f"matching PID={PID}")

matching PID=331


The `connect_names` table links `PID_m` (mentor PID) to `PID_t` (trainee PID)

In [13]:
# Every mentorship row whose mentor is `PID`, reduced to the trainee-side columns.
connect_names.loc[
    connect_names['PID_m'].eq(PID),
    ['PID_t', 'FirstName_t', 'MiddleName_t', 'LastName_t', 'ResearchArea_t'],
]

Unnamed: 0,PID_t,FirstName_t,MiddleName_t,LastName_t,ResearchArea_t
7368,334,J,DAVID,SWEATT,neuro
7369,355,IRVING,,KUPFERMANN,neuro
7370,356,TOM,,CAREW,neuro
7371,357,HAROLD,,PINSKER,neuro
7372,354,JAMES,H,SCHWARTZ,neuro
...,...,...,...,...,...
7460,767246,EUGENE,P,BRANDON,neuro
7461,767647,JUAN,MARCOS,ALARCON,neuro
7462,76072,BENNY,,HOCHNER,neuro
7463,800683,PO-TAO,,CHEN,neuro


In [14]:
# Collect the trainee PIDs of mentor `PID` as a NumPy array.
trainee_PID = connect_names.loc[connect_names['PID_m'].eq(PID), 'PID_t'].to_numpy()
trainee_PID

array([   334,    355,    356,    357,    354,    899,    953,    954,
          956,   1346,   1481,   1906,   2193,   2215,   2221,   2223,
         2333,   2603,   3070,   3373,   3441,   3442,   3528,   3439,
         4086,   4995,   5657,   6367,  58611,   6674,   6202,   8933,
         9843,  10030,  11892,  12008,  13423,  13436,  13480,  13552,
         7299,  15424,  16069,  16727,   2248,  11427,  18132,  17179,
        20649,   7078,  22763,  23049,  23356,  23363,  23785,  25998,
        11043,  26587,  26644,   2220,  29321,  29368,  19353,  32885,
        24722,  38092,  30421,  47779,  48781,  50175,  50444,  50835,
        50954,  56551,  58317,  60353,  61614,  65230,  34641,  65991,
        66614,  74996,  83578,  91626, 173942,  51431,  10680,  82051,
       737509, 750864, 751257,  20002, 767246, 767647,  76072, 800683,
        48450])

# Find publications associated with a researcher

MAGPaperID is the unique identifier for papers in the Microsoft Academic Graph (MAG) database. Note that these IDs match to Papers.csv, which is only a subset of the entire MAG corpus!

In [15]:
# Authorship table: pairs a researcher PID with a MAGPaperID.
pid_magid = pd.read_csv(home_dir.joinpath('data', 'MENTORSHIP', 'authorship.csv'))

In [16]:
# All authorship rows for the researcher selected as `PID`.
pid_magid[pid_magid['PID'].eq(PID)]

Unnamed: 0,PID,MAGPaperID
41025,331,111832100
41026,331,1216687283
41027,331,132816808
41028,331,1427063205
41029,331,144537577
...,...,...
41632,331,67755273
41633,331,6776920
41634,331,79844685
41635,331,84539758
