In [1]:
project_id = 'elife-data-pipeline'
source_dataset = 'de_dev'
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
manuscript_min_tf = 10
manuscript_max_tf = 0.9
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [2]:
from functools import partial

import re
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.io import serialize_object_to
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
model_output_path = os.path.join(state_path, 'reviewing_editor_model.joblib')

In [4]:
read_big_query = partial(_read_big_query, project_id=project_id)

In [5]:
default_query_props = dict(project=project_id, dataset=source_dataset)

In [6]:
manuscript_editor_assignment_df = read_big_query(
    get_sql('reviewing-editor-assignments.sql').format(
        **default_query_props
    )
)
manuscript_editor_assignment_df.head()

> ```sql
> WITH t_related_person_id AS (
>   SELECT DISTINCT Version_ID AS version_id, 'Reviewing Editor' AS relationship_type, Person.Person_ID
>   FROM `elife-data-pipeline.de_dev.mv_Editorial_All_Manuscript_Version` AS Version
>   JOIN UNNEST(Version.Reviewing_Editors) AS Person
> )
> 
> SELECT
>   Version.Manuscript_ID AS manuscript_id,
>   Version.Version_ID AS version_id,
>   Version.Overall_Stage AS overall_stage,
>   Version.QC_Complete_Timestamp AS qc_complete_timestamp,
>   Version.Position_In_Overall_Stage AS position_in_overall_stage,
>   related_person_id.relationship_type,
>   related_person_id.person_id AS person_id,
>   Related_Person.Name AS name,
>   ARRAY_TO_STRING(ARRAY(SELECT Role_Name FROM UNNEST(Related_Person.Roles)), '|') AS person_roles,
>   ARRAY_TO_STRING(ARRAY(SELECT Keyword FROM UNNEST(Related_Person.Keywords)), '|') AS person_keywords,
>   ARRAY_TO_STRING(ARRAY(SELECT Subject_Area_Name FROM UNNEST(Related_Person.Subject_Areas)), '|') AS person_subject_areas
> 
> FROM `elife-data-pipeline.de_dev.mv_Editorial_All_Manuscript_Version` AS Version
> JOIN t_related_person_id AS related_person_id
>   ON related_person_id.version_Id = Version.Version_ID
> JOIN `elife-data-pipeline.prod.mv_Editorial_Person` AS Related_Person
>   ON Related_Person.Person_ID = related_person_id.person_id
> ORDER BY Manuscript_ID, Version_ID, relationship_type, person_id
> ```

Downloading: 100%|██████████| 25938/25938 [00:04<00:00, 6279.12rows/s]


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas
0,1,00001/2012-10-22T03:36:46Z,Full Submission,2012-10-26 05:37:50+00:00,1,Reviewing Editor,1141,Gil McVean,Guest Editor,,Computational and Systems Biology|Genetics and...
1,2,00002/2012-05-24T16:00:00Z,Full Submission,2012-05-24 12:00:00+00:00,1,Reviewing Editor,1151,Jodi Nunnari,,,Biochemistry and Chemical Biology|Cell Biology
2,3,00003/2012-06-21T18:34:14Z,Full Submission,2012-06-27 05:06:17+00:00,1,Reviewing Editor,1123,Roberto Kolter,Guest Editor,,Microbiology and Infectious Disease|Plant Biology
3,3,00003/2012-09-05T04:24:15Z,Full Submission,2012-09-05 13:06:43+00:00,2,Reviewing Editor,1123,Roberto Kolter,Guest Editor,,Microbiology and Infectious Disease|Plant Biology
4,5,00005/2012-06-25T19:03:52Z,Full Submission,2012-06-28 16:34:47+00:00,1,Reviewing Editor,1189,Kevin Struhl,Senior Editor,chromatin biology|cancer biology|transcription...,Cell Biology|Chromosomes and Gene Expression


In [7]:
manuscript_version_extracted_keywords_df = read_big_query(
    get_sql('manuscript-version-extracted-keywords.sql').format(
        **default_query_props
    )
)
print(len(manuscript_version_extracted_keywords_df))
manuscript_version_extracted_keywords_df.head()

> ```sql
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     manuscript_abstract_keywords.manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> )
> 
> SELECT
>   * EXCEPT(version_id_row_number)
> FROM t_manuscript_version_abstract_keywords
> WHERE version_id_row_number = 1
> ORDER BY version_id
> ```

Downloading: 100%|██████████| 77127/77127 [00:57<00:00, 1344.04rows/s]

77127





Unnamed: 0,manuscript_id,version_id,extracted_keywords
0,1,00001/2012-10-22T03:36:46Z,"[1,129,022 snp, achievement, action, analysis,..."
1,2,00002/2012-05-24T16:00:00Z,"[assembly, assembly factor, associate, associa..."
2,3,00003/2012-06-21T18:34:14Z,"[advantage, antibacterial, antibacterial defen..."
3,3,00003/2012-09-05T04:24:15Z,"[adult, adult fly, advantage, antibacterial, a..."
4,5,00005/2012-06-25T19:03:52Z,"[activity, aebp2, allosteric, allosteric role,..."


In [8]:
reviewing_editors_df = read_big_query(
    get_sql('reviewing-editors.sql').format(
        **default_query_props
    )
)
print(len(reviewing_editors_df))
reviewing_editors_df.head()

> ```sql
> SELECT
>   Person_ID AS person_id,
>   Name AS name
> FROM `elife-data-pipeline.de_dev.mv_Editorial_Editor_Profile`
> WHERE Role_Name = 'Reviewing Editor'
> ```

Downloading: 100%|██████████| 558/558 [00:00<00:00, 698.77rows/s]

558





Unnamed: 0,person_id,name
0,126873,Yuuki Watanabe
1,178962,Chima Nwaogu
2,70207,Bernhard Schmid
3,14193,Merijn Kant
4,131589,Ammie Kalan


In [9]:
keyword_exclusion_df = read_big_query(
    get_sql('keyword-exclusion.sql').format(
        **default_query_props
    )
)
print(len(keyword_exclusion_df))
keyword_exclusion_df.head()

> ```sql
> SELECT excluded_keyword
> FROM `elife-data-pipeline.de_dev.keyword_exclusion`
> ORDER BY excluded_keyword
> ```

Downloading: 100%|██████████| 100/100 [00:01<00:00, 98.75rows/s]

100





Unnamed: 0,excluded_keyword
0,a
1,about
2,across
3,additional
4,all


In [10]:
reviewing_editor_names = set(reviewing_editors_df['name'])
len(reviewing_editor_names)

558

In [11]:
sorted(reviewing_editor_names)

['Abby Dernburg',
 'Adam Aron',
 'Adam Frost',
 'Adam Linstedt',
 'Adèle Marston',
 'Agnese Seminara',
 'Agnieszka Chacinska',
 'Ahmet Yildiz',
 'Alan Hinnebusch',
 'Alan Moses',
 'Albert Cardona',
 'Alejandro Sánchez Alvarado',
 'Alex Cook',
 'Alex Fornito',
 'Alexander Borst',
 'Alexander Shackman',
 'Alexander Westermann',
 'Alfonso Valencia',
 'Alicia Izquierdo',
 'Alison Goate',
 'Allan Basbaum',
 'Alphee Michelot',
 'Ambra Pozzi',
 'Amita Sehgal',
 'Ammie Kalan',
 'Amy Gladfelter',
 'Amy Wesolowski',
 'Ana Domingos',
 'Andrea Martin',
 'Andreas Martin',
 'Andreas Schaefer',
 'Andrei Lupas',
 'Andrew Brack',
 'Andrew Carter',
 'Andrew Kruse',
 'Andrew MacPherson',
 'Andrew Morris',
 'Andrew West',
 'Andrés Aguilera',
 'Anita Bhattacharyya',
 'Anna Diehl',
 'Anna Pyle',
 'Anna Roe',
 'Anna Schapiro',
 'Anne West',
 'Annelien Bredenoord',
 'Antonis Rokas',
 'Antony Rosen',
 'Anurag Agrawal',
 'Apurva Sarin',
 'Arduino Mangoni',
 'Armita Nourmohammad',
 'Arun Radhakrishnan',
 'Aryn G

In [12]:
manuscript_editor_assignment_with_extracted_keywords_df = (
    manuscript_editor_assignment_df
    .merge(
        manuscript_version_extracted_keywords_df.drop(columns=['manuscript_id']),
        on='version_id',
        suffixes=('', '_extracted')
    )
)
print(manuscript_editor_assignment_with_extracted_keywords_df.columns)
manuscript_editor_assignment_with_extracted_keywords_df.head()

Index(['manuscript_id', 'version_id', 'overall_stage', 'qc_complete_timestamp',
       'position_in_overall_stage', 'relationship_type', 'person_id', 'name',
       'person_roles', 'person_keywords', 'person_subject_areas',
       'extracted_keywords'],
      dtype='object')


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords
0,1,00001/2012-10-22T03:36:46Z,Full Submission,2012-10-26 05:37:50+00:00,1,Reviewing Editor,1141,Gil McVean,Guest Editor,,Computational and Systems Biology|Genetics and...,"[1,129,022 snp, achievement, action, analysis,..."
1,2,00002/2012-05-24T16:00:00Z,Full Submission,2012-05-24 12:00:00+00:00,1,Reviewing Editor,1151,Jodi Nunnari,,,Biochemistry and Chemical Biology|Cell Biology,"[assembly, assembly factor, associate, associa..."
2,3,00003/2012-06-21T18:34:14Z,Full Submission,2012-06-27 05:06:17+00:00,1,Reviewing Editor,1123,Roberto Kolter,Guest Editor,,Microbiology and Infectious Disease|Plant Biology,"[advantage, antibacterial, antibacterial defen..."
3,3,00003/2012-09-05T04:24:15Z,Full Submission,2012-09-05 13:06:43+00:00,2,Reviewing Editor,1123,Roberto Kolter,Guest Editor,,Microbiology and Infectious Disease|Plant Biology,"[adult, adult fly, advantage, antibacterial, a..."
4,5,00005/2012-06-25T19:03:52Z,Full Submission,2012-06-28 16:34:47+00:00,1,Reviewing Editor,1189,Kevin Struhl,Senior Editor,chromatin biology|cancer biology|transcription...,Cell Biology|Chromosomes and Gene Expression,"[activity, aebp2, allosteric, allosteric role,..."


In [13]:
full_submission_reviewing_editor_assignment_df = manuscript_editor_assignment_with_extracted_keywords_df[
    (manuscript_editor_assignment_with_extracted_keywords_df['overall_stage'] == 'Full Submission')
    & (manuscript_editor_assignment_with_extracted_keywords_df['position_in_overall_stage'] == 1)
    & (manuscript_editor_assignment_with_extracted_keywords_df['relationship_type'] == 'Reviewing Editor')
    & (
        manuscript_editor_assignment_with_extracted_keywords_df['name']
        .fillna('').isin(reviewing_editor_names)
    )
]
print(len(full_submission_reviewing_editor_assignment_df))
full_submission_reviewing_editor_assignment_df.head()

7849


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords
12,13,00013/2012-05-30T16:00:00Z,Full Submission,2012-05-30 12:00:00+00:00,1,Reviewing Editor,1095,E Peter Greenberg,Reviewing Editor|Editorial Board Member,microbe-microbe communication,Microbiology and Infectious Disease,"[- nine order, 10-15 m, animal, animal health,..."
23,39,00039/2012-07-06T22:13:18Z,Full Submission,2012-07-10 11:58:11+00:00,1,Reviewing Editor,22520,Timothy Nilsen,Reviewing Editor|Editorial Board Member,drosophila|mrna processing|mechanism of mirna ...,Biochemistry and Chemical Biology|Chromosomes ...,"[activity, alternative, alternative mirna path..."
36,65,00065/2012-07-17T18:32:20Z,Full Submission,2012-07-18 05:55:45+00:00,1,Reviewing Editor,1067,Michael Czech,Reviewing Editor|Editorial Board Member,obesity|insulin resistance|diabetes|signal tra...,Cell Biology|Human Biology and Medicine,"[acid, acid oxidation, adaptive, adaptive star..."
47,90,00090/2012-07-25T15:45:08Z,Full Submission,2012-07-26 09:14:51+00:00,1,Reviewing Editor,1072,Roger Davis,Reviewing Editor|Editorial Board Member,signal transduction,Cell Biology|Chromosomes and Gene Expression,"[biomarker, breast, breast cancer, cancer, can..."
50,93,00093/2012-07-30T17:59:17Z,Full Submission,2012-07-31 07:06:51+00:00,1,Reviewing Editor,1046,Carl Bergstrom,Reviewing Editor|Editorial Board Member,metaresearch|science of science|science studie...,Computational and Systems Biology,"[agent-base, agent-based model, antigenic, ant..."


In [14]:
print('unique manuscript ids:', full_submission_reviewing_editor_assignment_df['manuscript_id'].nunique())
print('duplicate manuscript ids (if any):')
full_submission_reviewing_editor_assignment_df[
    full_submission_reviewing_editor_assignment_df['manuscript_id'].isin(
        full_submission_reviewing_editor_assignment_df
        .groupby('manuscript_id')
        .size()
        .pipe(lambda s: s[s > 1])
        .index
    )
]

unique manuscript ids: 7849
duplicate manuscript ids (if any):


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords


In [15]:
tf_idf_vectorizer = TfidfVectorizer(
    tokenizer=identity_fn,
    token_pattern=None,
    lowercase=False,
    min_df=manuscript_min_tf,
    max_df=manuscript_max_tf
)
print(tf_idf_vectorizer)
tf_idf_vectorizer.fit(
    full_submission_reviewing_editor_assignment_df['extracted_keywords']
)
all_keywords_set = set(tf_idf_vectorizer.get_feature_names())
len(all_keywords_set)

TfidfVectorizer(lowercase=False, max_df=0.9, min_df=10, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f81ba491ef0>)


5777

In [16]:
print(sorted(all_keywords_set)[:10])

['- mouse', '-dependent', '-end', '100-fold', '2 diabetes', '2d', "3'", '3d', '40s', "5'"]


In [17]:
all_keywords_set = {
    keyword
    for keyword in all_keywords_set
    if re.match(r'^[a-zA-Z]', keyword)
}
print('all_keywords_set len (after filter):', len(all_keywords_set))
print(sorted(all_keywords_set)[:10])

all_keywords_set len (after filter): 5758
['aaa', 'aaa+', 'ab', 'aba', 'abc', 'aberrant', 'aberrant expression', 'aberration', 'ability', 'ablation']


In [18]:
all_keywords_set = all_keywords_set - set(keyword_exclusion_df['excluded_keyword'])
print('all_keywords_set len (after exclusion):', len(all_keywords_set))

all_keywords_set len (after exclusion): 5742


In [19]:
editor_extracted_keywords_df = (
    full_submission_reviewing_editor_assignment_df
    [['name', 'extracted_keywords']]
    .groupby('name')
    .agg(
        lambda keywords_list: [
            keyword
            for keywords in keywords_list
            for keyword in keywords
            if keyword in all_keywords_set
        ]
    )
    .reset_index()
    .sort_values('name')
)
editor_extracted_keywords_df.head()

Unnamed: 0,name,extracted_keywords
0,Abby Dernburg,"[biological, centromere, chaperone, complex, c..."
1,Adam Aron,"[activation, amygdala, analyzes, approximately..."
2,Adam Frost,"[alternative, bacteria, cell, center, conforma..."
3,Adam Linstedt,"[adrenergic, basis, biological, biologically, ..."
4,Adèle Marston,"[ablation, cell, cell cycle, chromatid, chromo..."


In [20]:
editor_person_id_df = (
    full_submission_reviewing_editor_assignment_df
    [['name', 'person_id']]
    .dropna()
    .groupby('name')
    .last()
    .loc[editor_extracted_keywords_df['name']]
    .reset_index()
)
editor_person_id_df.head()

Unnamed: 0,name,person_id
0,Abby Dernburg,3357
1,Adam Aron,33041
2,Adam Frost,86941
3,Adam Linstedt,4372
4,Adèle Marston,7131


In [21]:
editor_tf_idf_vectorizer = TfidfVectorizer(
    tokenizer=identity_fn,
    token_pattern=None,
    lowercase=False,
    norm='l2',
    smooth_idf=False,
    sublinear_tf=False,
    min_df=1,
    max_df=1.0
)
print(editor_tf_idf_vectorizer)
editor_tf_idf = editor_tf_idf_vectorizer.fit_transform(
    editor_extracted_keywords_df['extracted_keywords']
)
editor_tf_idf

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f81ba491ef0>)


<514x5742 sparse matrix of type '<class 'numpy.float64'>'
	with 193040 stored elements in Compressed Sparse Row format>

In [22]:
cosine_similarity(
    editor_tf_idf_vectorizer.transform(
        editor_extracted_keywords_df
        ['extracted_keywords'][:1]
    ),
    editor_tf_idf
)

array([[1.        , 0.02149986, 0.08348098, 0.09563401, 0.22788936,
        0.04830816, 0.17276296, 0.07799492, 0.19825057, 0.07978822,
        0.18114558, 0.01077698, 0.05128793, 0.0999316 , 0.0418764 ,
        0.16069694, 0.06969839, 0.06967869, 0.07106076, 0.0890073 ,
        0.06685728, 0.06352023, 0.0590869 , 0.02896746, 0.1255277 ,
        0.04807207, 0.11050465, 0.11086018, 0.17236821, 0.0236406 ,
        0.08895306, 0.09677909, 0.1077891 , 0.24346063, 0.12577905,
        0.05386862, 0.11486388, 0.05060387, 0.02550576, 0.18459895,
        0.15821001, 0.02793841, 0.05442309, 0.06389973, 0.07460132,
        0.08368066, 0.0836749 , 0.03767049, 0.12972618, 0.06776979,
        0.0989705 , 0.1612831 , 0.10942976, 0.08961884, 0.10034212,
        0.0272556 , 0.06729406, 0.14007383, 0.02738536, 0.10111323,
        0.07449252, 0.28787198, 0.05450988, 0.1334282 , 0.05179607,
        0.08541232, 0.04378151, 0.08158469, 0.2105784 , 0.1635173 ,
        0.11526231, 0.08239549, 0.13825637, 0.08

In [23]:
print('saving to:', model_output_path)
serialize_object_to({
    'editor_tf_idf_vectorizer': editor_tf_idf_vectorizer,
    'editor_tf_idf': editor_tf_idf,
    'editor_names': editor_extracted_keywords_df['name'],
    'editor_person_ids': editor_person_id_df['person_id']
}, model_output_path)
print('done')

saving to: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/reviewing_editor_model.joblib
done
