In [1]:
project_id = 'elife-data-pipeline'
source_dataset = 'de_dev'
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
manuscript_min_tf = 10
manuscript_max_tf = 0.9
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [2]:
from functools import partial

import re
import os

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import
import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.io import serialize_object_to
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
model_output_path = os.path.join(state_path, 'reviewing_editor_model.joblib')

In [4]:
read_big_query = partial(_read_big_query, project_id=project_id)

In [5]:
default_query_props = dict(project=project_id, dataset=source_dataset)

In [6]:
manuscript_editor_assignment_df = read_big_query(
    get_sql('reviewing-editor-assignments.sql').format(
        **default_query_props
    )
)
manuscript_editor_assignment_df.head()

> ```sql
> WITH t_related_person_id AS (
>   SELECT DISTINCT Version_ID AS version_id, 'Reviewing Editor' AS relationship_type, Person.Person_ID
>   FROM `elife-data-pipeline.de_dev.mv_Editorial_All_Manuscript_Version` AS Version
>   JOIN UNNEST(Version.Reviewing_Editors) AS Person
> )
> 
> SELECT
>   Version.Manuscript_ID AS manuscript_id,
>   Version.Version_ID AS version_id,
>   Version.Overall_Stage AS overall_stage,
>   Version.QC_Complete_Timestamp AS qc_complete_timestamp,
>   Version.Position_In_Overall_Stage AS position_in_overall_stage,
>   related_person_id.relationship_type,
>   related_person_id.person_id AS person_id,
>   Related_Person.Name AS name,
>   ARRAY_TO_STRING(ARRAY(SELECT Role_Name FROM UNNEST(Related_Person.Roles)), '|') AS person_roles,
>   ARRAY_TO_STRING(ARRAY(SELECT Keyword FROM UNNEST(Related_Person.Keywords)), '|') AS person_keywords,
>   ARRAY_TO_STRING(ARRAY(SELECT Subject_Area_Name FROM UNNEST(Related_Person.Subject_Areas)), '|') AS person_subject_areas
> 
> FROM `elife-data-pipeline.de_dev.mv_Editorial_All_Manuscript_Version` AS Version
> JOIN t_related_person_id AS related_person_id
>   ON related_person_id.version_Id = Version.Version_ID
> JOIN `elife-data-pipeline.prod.mv_Editorial_Person` AS Related_Person
>   ON Related_Person.Person_ID = related_person_id.person_id
> ORDER BY Manuscript_ID, Version_ID, relationship_type, person_id
> ```

Downloading: 100%|██████████| 26070/26070 [00:03<00:00, 6934.94rows/s]


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas
0,1,00001/2012-10-22T03:36:46Z,Full Submission,2012-10-26 05:37:50+00:00,1,Reviewing Editor,1141,Gil McVean,Guest Editor,,Computational and Systems Biology|Genetics and...
1,2,00002/2012-05-24T16:00:00Z,Full Submission,2012-05-24 12:00:00+00:00,1,Reviewing Editor,1151,Jodi Nunnari,,,Biochemistry and Chemical Biology|Cell Biology
2,3,00003/2012-06-21T18:34:14Z,Full Submission,2012-06-27 05:06:17+00:00,1,Reviewing Editor,1123,Roberto Kolter,Guest Editor,,Microbiology and Infectious Disease|Plant Biology
3,3,00003/2012-09-05T04:24:15Z,Full Submission,2012-09-05 13:06:43+00:00,2,Reviewing Editor,1123,Roberto Kolter,Guest Editor,,Microbiology and Infectious Disease|Plant Biology
4,5,00005/2012-06-25T19:03:52Z,Full Submission,2012-06-28 16:34:47+00:00,1,Reviewing Editor,1189,Kevin Struhl,Senior Editor,chromatin biology|cancer biology|transcription...,Cell Biology|Chromosomes and Gene Expression


In [7]:
manuscript_version_extracted_keywords_df = read_big_query(
    get_sql('manuscript-version-extracted-keywords.sql').format(
        **default_query_props
    )
)
print(len(manuscript_version_extracted_keywords_df))
manuscript_version_extracted_keywords_df.head()

> ```sql
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     manuscript_abstract_keywords.manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> )
> 
> SELECT
>   * EXCEPT(version_id_row_number)
> FROM t_manuscript_version_abstract_keywords
> WHERE version_id_row_number = 1
> ORDER BY version_id
> ```

Downloading: 100%|██████████| 77539/77539 [00:56<00:00, 1377.78rows/s]

77539





Unnamed: 0,manuscript_id,version_id,extracted_keywords
0,1,00001/2012-10-22T03:36:46Z,"[1,129,022 snp, achievement, action, analysis,..."
1,2,00002/2012-05-24T16:00:00Z,"[assembly, assembly factor, associate, associa..."
2,3,00003/2012-06-21T18:34:14Z,"[advantage, antibacterial, antibacterial defen..."
3,3,00003/2012-09-05T04:24:15Z,"[adult, adult fly, advantage, antibacterial, a..."
4,5,00005/2012-06-25T19:03:52Z,"[activity, aebp2, allosteric, allosteric role,..."


In [8]:
publications_extracted_keywords_df = read_big_query(
    get_sql('publications_extracted_keywords.sql').format(
        **default_query_props
    )
).rename(columns={'abstract_keywords': 'extracted_keywords'})
print(len(publications_extracted_keywords_df))
publications_extracted_keywords_df.head()

> ```sql
> WITH t_paper_extracted_keywords AS (
>   SELECT
>     *,
>     ROW_NUMBER() OVER(PARTITION BY pmid ORDER BY data_hub_imported_timestamp DESC) AS pmid_row_number
>   FROM `elife-data-pipeline.de_dev.data_science_disambiguated_editor_papers_abstract_keywords`
> ),
> 
> t_paper_summary AS (
>   SELECT
>     *,
>     ROW_NUMBER() OVER(PARTITION BY pmid ORDER BY provenance.imported_timestamp DESC) AS pmid_row_number
>   FROM `elife-data-pipeline.de_dev.data_science_external_manuscript_summary`
> )
> 
> SELECT
>   CONCAT('pmid:', paper_extracted_keywords.pmid) AS publication_id,
>   paper_extracted_keywords.pmid,
>   paper_summary.doi,
>   REGEXP_EXTRACT(LOWER(paper_summary.doi), r'10.7554/elife.([\d]{5})') AS manuscript_id,
>   paper_summary.firstPublicationDate AS publication_date,
>   paper_extracted_keywords.extracted_keywords AS abstract_keywords
> FROM t_paper_extracted_keywords AS paper_extracted_keywords
> JOIN t_paper_summary AS paper_summary
>   ON paper_summary.pmid = paper_extracted_keywords.pmid
>   AND paper_summary.pmid_row_number = 1
> WHERE paper_extracted_keywords.pmid_row_number = 1
>   AND ARRAY_LENGTH(paper_extracted_keywords.extracted_keywords) > 0
> ```

Downloading: 100%|██████████| 13888/13888 [00:12<00:00, 1098.98rows/s]

13888





Unnamed: 0,publication_id,pmid,doi,manuscript_id,publication_date,extracted_keywords
0,pmid:10235685,10235685,10.1002/(sici)1097-4695(199905)39:2<323::aid-n...,,1999-05-01,"[absence, activation, active, active trk recep..."
1,pmid:10398591,10398591,10.1126/science.285.5425.215,,1999-07-01,"[array, assembly, attention, capacity, cargo, ..."
2,pmid:10419689,10419689,10.1006/dbio.1999.9356,,1999-08-01,"[actin, actin bundle, addition, adult, adult h..."
3,pmid:10606204,10606204,10.1007/s001099900054,,1999-10-01,"[addition, alpha, alpha-subunit, bind, binding..."
4,pmid:10805343,10805343,10.1002/1531-8249(200005)47:5<670::aid-ana20>3...,,2000-05-01,"[13 clinical characteristic, 23 different dise..."


In [9]:
editor_publication_ids_df = read_big_query(
    get_sql('editor_publication_ids.sql').format(
        **default_query_props
    )
)
print(len(editor_publication_ids_df))
editor_publication_ids_df.head()

> ```sql
> WITH t_editor_pubmed_ids AS (
>   SELECT
>     *,
>     ROW_NUMBER() OVER(PARTITION BY person_id ORDER BY provenance.imported_timestamp DESC) AS person_id_row_number
>   FROM `elife-data-pipeline.de_dev.data_science_editor_pubmed_ids`
> ),
> 
> t_paper_summary AS (
>   SELECT
>     *,
>     ROW_NUMBER() OVER(PARTITION BY pmid ORDER BY provenance.imported_timestamp DESC) AS pmid_row_number
>   FROM `elife-data-pipeline.de_dev.data_science_external_manuscript_summary`
> )
> 
> SELECT
>   disambiguated_editor_papers.person_id,
>   CONCAT('pmid:', pmid) AS publication_id,
>   pmid,
>   doi,
>   REGEXP_EXTRACT(LOWER(paper_summary.doi), r'10.7554/elife.([\d]{5})') AS manuscript_id,
>   paper_summary.firstPublicationDate AS publication_date,
>   (pmid IN UNNEST(editor_pubmed_links.relevant_pubmed_ids)) AS is_relevant_pubmed_id,
>   (pmid IN UNNEST(editor_pubmed_ids.pubmed_ids)) AS is_search_pubmed_id
> 
> FROM `elife-data-pipeline.de_dev.data_science_disambiguated_editor_papers` AS disambiguated_editor_papers
> JOIN `elife-data-pipeline.de_dev.data_science_editor_pubmed_links` AS editor_pubmed_links
>   ON editor_pubmed_links.person_id = disambiguated_editor_papers.person_id
> JOIN t_editor_pubmed_ids AS editor_pubmed_ids
>   ON editor_pubmed_ids.person_id = disambiguated_editor_papers.person_id
>   AND editor_pubmed_ids.person_id_row_number = 1 
> JOIN UNNEST(disambiguated_editor_papers.disambiguated_pubmed_ids) AS pmid
> JOIN t_paper_summary AS paper_summary 
>   ON paper_summary.pmid = pmid
>   AND paper_summary.pmid_row_number = 1
> ```

Downloading: 100%|██████████| 15091/15091 [00:02<00:00, 5649.75rows/s]

15091





Unnamed: 0,person_id,publication_id,pmid,doi,manuscript_id,publication_date,is_relevant_pubmed_id,is_search_pubmed_id
0,145921,pmid:1995917,1995917,10.1093/jnci/83.5.336,,1991-03-01,False,True
1,145921,pmid:23677572,23677572,10.1158/1055-9965.epi-13-0105,,2013-05-15,False,True
2,145921,pmid:21070806,21070806,10.1016/j.ypmed.2010.10.014,,2010-11-08,False,True
3,145921,pmid:26970739,26970739,10.1007/s10552-016-0726-5,,2016-03-12,False,True
4,145921,pmid:20843488,20843488,10.1016/j.ejca.2010.07.028,,2010-09-01,False,True


In [10]:
reviewing_editors_df = read_big_query(
    get_sql('reviewing-editors.sql').format(
        **default_query_props
    )
)
print(len(reviewing_editors_df))
reviewing_editors_df.head()

> ```sql
> SELECT
>   Person_ID AS person_id,
>   Name AS name
> FROM `elife-data-pipeline.de_dev.mv_Editorial_Editor_Profile`
> WHERE Role_Name = 'Reviewing Editor'
> ```

Downloading: 100%|██████████| 566/566 [00:00<00:00, 1812.70rows/s]

566





Unnamed: 0,person_id,name
0,126873,Yuuki Watanabe
1,178962,Chima Nwaogu
2,14193,Merijn Kant
3,70207,Bernhard Schmid
4,136296,David Donoso


In [11]:
keyword_exclusion_df = read_big_query(
    get_sql('keyword-exclusion.sql').format(
        **default_query_props
    )
)
print(len(keyword_exclusion_df))
keyword_exclusion_df.head()

> ```sql
> SELECT excluded_keyword
> FROM `elife-data-pipeline.de_dev.keyword_exclusion`
> ORDER BY excluded_keyword
> ```

Downloading: 100%|██████████| 100/100 [00:00<00:00, 315.86rows/s]

100





Unnamed: 0,excluded_keyword
0,a
1,about
2,across
3,additional
4,all


In [12]:
reviewing_editor_names = set(reviewing_editors_df['name'])
len(reviewing_editor_names)

566

In [13]:
sorted(reviewing_editor_names)

['Abby Dernburg',
 'Adam Aron',
 'Adam Frost',
 'Adam Linstedt',
 'Adrien Peyrache',
 'Adèle Marston',
 'Agnese Seminara',
 'Agnieszka Chacinska',
 'Ahmet Yildiz',
 'Alan Hinnebusch',
 'Alan Moses',
 'Albert Cardona',
 'Alejandro Sánchez Alvarado',
 'Alex Cook',
 'Alex Fornito',
 'Alexander Borst',
 'Alexander Shackman',
 'Alexander Westermann',
 'Alfonso Valencia',
 'Alicia Izquierdo',
 'Alison Goate',
 'Allan Basbaum',
 'Alphee Michelot',
 'Ambra Pozzi',
 'Amita Sehgal',
 'Ammie Kalan',
 'Amy Gladfelter',
 'Amy Wesolowski',
 'Ana Domingos',
 'Andrea Martin',
 'Andreas Martin',
 'Andreas Schaefer',
 'Andrei Lupas',
 'Andrew Brack',
 'Andrew Carter',
 'Andrew Kruse',
 'Andrew MacPherson',
 'Andrew Morris',
 'Andrew West',
 'Andrés Aguilera',
 'Anita Bhattacharyya',
 'Anna Diehl',
 'Anna Pyle',
 'Anna Roe',
 'Anna Schapiro',
 'Anne West',
 'Annelien Bredenoord',
 'Antonis Rokas',
 'Antony Rosen',
 'Anurag Agrawal',
 'Apurva Sarin',
 'Arduino Mangoni',
 'Armita Nourmohammad',
 'Arun Radh

In [14]:
manuscript_editor_assignment_with_extracted_keywords_df = (
    manuscript_editor_assignment_df
    .merge(
        manuscript_version_extracted_keywords_df.drop(columns=['manuscript_id']),
        on='version_id',
        suffixes=('', '_extracted')
    )
)
print(manuscript_editor_assignment_with_extracted_keywords_df.columns)
manuscript_editor_assignment_with_extracted_keywords_df.head()

Index(['manuscript_id', 'version_id', 'overall_stage', 'qc_complete_timestamp',
       'position_in_overall_stage', 'relationship_type', 'person_id', 'name',
       'person_roles', 'person_keywords', 'person_subject_areas',
       'extracted_keywords'],
      dtype='object')


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords
0,1,00001/2012-10-22T03:36:46Z,Full Submission,2012-10-26 05:37:50+00:00,1,Reviewing Editor,1141,Gil McVean,Guest Editor,,Computational and Systems Biology|Genetics and...,"[1,129,022 snp, achievement, action, analysis,..."
1,2,00002/2012-05-24T16:00:00Z,Full Submission,2012-05-24 12:00:00+00:00,1,Reviewing Editor,1151,Jodi Nunnari,,,Biochemistry and Chemical Biology|Cell Biology,"[assembly, assembly factor, associate, associa..."
2,3,00003/2012-06-21T18:34:14Z,Full Submission,2012-06-27 05:06:17+00:00,1,Reviewing Editor,1123,Roberto Kolter,Guest Editor,,Microbiology and Infectious Disease|Plant Biology,"[advantage, antibacterial, antibacterial defen..."
3,3,00003/2012-09-05T04:24:15Z,Full Submission,2012-09-05 13:06:43+00:00,2,Reviewing Editor,1123,Roberto Kolter,Guest Editor,,Microbiology and Infectious Disease|Plant Biology,"[adult, adult fly, advantage, antibacterial, a..."
4,5,00005/2012-06-25T19:03:52Z,Full Submission,2012-06-28 16:34:47+00:00,1,Reviewing Editor,1189,Kevin Struhl,Senior Editor,chromatin biology|cancer biology|transcription...,Cell Biology|Chromosomes and Gene Expression,"[activity, aebp2, allosteric, allosteric role,..."


In [15]:
full_submission_reviewing_editor_assignment_df = manuscript_editor_assignment_with_extracted_keywords_df[
    (manuscript_editor_assignment_with_extracted_keywords_df['overall_stage'] == 'Full Submission')
    & (manuscript_editor_assignment_with_extracted_keywords_df['position_in_overall_stage'] == 1)
    & (manuscript_editor_assignment_with_extracted_keywords_df['relationship_type'] == 'Reviewing Editor')
    & (
        manuscript_editor_assignment_with_extracted_keywords_df['name']
        .fillna('').isin(reviewing_editor_names)
    )
]
print(len(full_submission_reviewing_editor_assignment_df))
full_submission_reviewing_editor_assignment_df.head()

7904


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords
12,13,00013/2012-05-30T16:00:00Z,Full Submission,2012-05-30 12:00:00+00:00,1,Reviewing Editor,1095,E Peter Greenberg,Reviewing Editor|Editorial Board Member,microbe-microbe communication,Microbiology and Infectious Disease,"[- nine order, 10-15 m, animal, animal health,..."
23,39,00039/2012-07-06T22:13:18Z,Full Submission,2012-07-10 11:58:11+00:00,1,Reviewing Editor,22520,Timothy Nilsen,Reviewing Editor|Editorial Board Member,drosophila|mrna processing|mechanism of mirna ...,Biochemistry and Chemical Biology|Chromosomes ...,"[activity, alternative, alternative mirna path..."
36,65,00065/2012-07-17T18:32:20Z,Full Submission,2012-07-18 05:55:45+00:00,1,Reviewing Editor,1067,Michael Czech,Reviewing Editor|Editorial Board Member,obesity|insulin resistance|diabetes|signal tra...,Cell Biology|Human Biology and Medicine,"[acid, acid oxidation, adaptive, adaptive star..."
47,90,00090/2012-07-25T15:45:08Z,Full Submission,2012-07-26 09:14:51+00:00,1,Reviewing Editor,1072,Roger Davis,Reviewing Editor|Editorial Board Member,signal transduction,Cell Biology|Chromosomes and Gene Expression,"[biomarker, breast, breast cancer, cancer, can..."
50,93,00093/2012-07-30T17:59:17Z,Full Submission,2012-07-31 07:06:51+00:00,1,Reviewing Editor,1046,Carl Bergstrom,Reviewing Editor|Editorial Board Member,metaresearch|science of science|science studie...,Computational and Systems Biology,"[agent-base, agent-based model, antigenic, ant..."


In [16]:
print('unique manuscript ids:', full_submission_reviewing_editor_assignment_df['manuscript_id'].nunique())
print('duplicate manuscript ids (if any):')
full_submission_reviewing_editor_assignment_df[
    full_submission_reviewing_editor_assignment_df['manuscript_id'].isin(
        full_submission_reviewing_editor_assignment_df
        .groupby('manuscript_id')
        .size()
        .pipe(lambda s: s[s > 1])
        .index
    )
]

unique manuscript ids: 7904
duplicate manuscript ids (if any):


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords


In [17]:
editor_publications_with_extracted_keywords_df = (
    reviewing_editors_df[['person_id', 'name']]
    .merge(
        editor_publication_ids_df
        [['person_id', 'publication_id', 'is_relevant_pubmed_id', 'is_search_pubmed_id']],
        on='person_id'
    )
    .merge(
        publications_extracted_keywords_df,
        on='publication_id',
        suffixes=('', '_extracted')
    )
).sort_values(['publication_id', 'person_id']).copy()
print(len(editor_publications_with_extracted_keywords_df))
editor_publications_with_extracted_keywords_df.head(3)

10940


Unnamed: 0,person_id,name,publication_id,is_relevant_pubmed_id,is_search_pubmed_id,pmid,doi,manuscript_id,publication_date,extracted_keywords
384,1040,Mohan Balasubramanian,pmid:10022828,False,True,10022828,10.1093/emboj/18.4.854,,1999-02-01,"[accumulation, actin-bind, actin-binding domai..."
2660,1054,Axel Brunger,pmid:10024461,False,True,10024461,10.1006/jmbi.1998.2512,,1999-02-01,"[-6.2, -9.9, 97-residue, 97-residue m2 protein..."
2809,1054,Axel Brunger,pmid:10025402,False,True,10025402,10.1016/s0092-8674(00)80549-8,,1999-02-01,"[analysis, area, biochemical, biochemical datu..."


In [18]:
full_submission_reviewing_editor_assignment_full_df = pd.concat([
    full_submission_reviewing_editor_assignment_df,
    editor_publications_with_extracted_keywords_df[
        editor_publications_with_extracted_keywords_df['name'].isin(
            full_submission_reviewing_editor_assignment_df['name']
        )
    ]
])
print(len(full_submission_reviewing_editor_assignment_full_df))
full_submission_reviewing_editor_assignment_full_df.head(3)

18460


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords,publication_id,is_relevant_pubmed_id,is_search_pubmed_id,pmid,doi,publication_date
12,13,00013/2012-05-30T16:00:00Z,Full Submission,2012-05-30 12:00:00+00:00,1.0,Reviewing Editor,1095,E Peter Greenberg,Reviewing Editor|Editorial Board Member,microbe-microbe communication,Microbiology and Infectious Disease,"[- nine order, 10-15 m, animal, animal health,...",,,,,,NaT
23,39,00039/2012-07-06T22:13:18Z,Full Submission,2012-07-10 11:58:11+00:00,1.0,Reviewing Editor,22520,Timothy Nilsen,Reviewing Editor|Editorial Board Member,drosophila|mrna processing|mechanism of mirna ...,Biochemistry and Chemical Biology|Chromosomes ...,"[activity, alternative, alternative mirna path...",,,,,,NaT
36,65,00065/2012-07-17T18:32:20Z,Full Submission,2012-07-18 05:55:45+00:00,1.0,Reviewing Editor,1067,Michael Czech,Reviewing Editor|Editorial Board Member,obesity|insulin resistance|diabetes|signal tra...,Cell Biology|Human Biology and Medicine,"[acid, acid oxidation, adaptive, adaptive star...",,,,,,NaT


In [19]:
tf_idf_vectorizer = TfidfVectorizer(
    tokenizer=identity_fn,
    token_pattern=None,
    lowercase=False,
    min_df=manuscript_min_tf,
    max_df=manuscript_max_tf
)
print(tf_idf_vectorizer)
tf_idf_vectorizer.fit(
    full_submission_reviewing_editor_assignment_full_df['extracted_keywords']
)
all_keywords_set = set(tf_idf_vectorizer.get_feature_names())
len(all_keywords_set)

TfidfVectorizer(lowercase=False, max_df=0.9, min_df=10, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f131d54ff80>)


13061

In [20]:
print(sorted(all_keywords_set)[:10])

[', - mouse', '- cell', '- mouse', '-)', '-) mouse', '-activated', '-dependent', '-end', '-independent', '1 diabetes']


In [21]:
all_keywords_set = {
    keyword
    for keyword in all_keywords_set
    if re.match(r'^[a-zA-Z]', keyword)
    and not keyword.startswith('a ')
}
print('all_keywords_set len (after filter):', len(all_keywords_set))
print(sorted(all_keywords_set)[:10])

all_keywords_set len (after filter): 12983
['a-site', 'a.', 'a1', 'aa', 'aaa', 'aaa+', 'aaa+ atpase', 'aaa+ domain', 'aaa-atpase', 'aacr']


In [22]:
all_keywords_set = all_keywords_set - set(keyword_exclusion_df['excluded_keyword'])
print('all_keywords_set len (after exclusion):', len(all_keywords_set))

all_keywords_set len (after exclusion): 12964


In [23]:
editor_extracted_keywords_df = (
    full_submission_reviewing_editor_assignment_full_df
    [['name', 'extracted_keywords']]
    .groupby('name')
    .agg(
        lambda keywords_list: [
            keyword
            for keywords in keywords_list
            for keyword in keywords
            if keyword in all_keywords_set
        ]
    )
    .reset_index()
    .sort_values('name')
)
editor_extracted_keywords_df.head()

Unnamed: 0,name,extracted_keywords
0,Abby Dernburg,"[biological, biological significance, centrome..."
1,Adam Aron,"[activation, amygdala, analyzes, antidepressan..."
2,Adam Frost,"[alternative, bacteria, cell, center, conforma..."
3,Adam Linstedt,"[adrenergic, adrenergic receptor, basis, biolo..."
4,Adèle Marston,"[ablation, cell, cell cycle, chromatid, chroma..."


In [24]:
editor_person_id_df = (
    full_submission_reviewing_editor_assignment_df
    [['name', 'person_id']]
    .dropna()
    .groupby('name')
    .last()
    .loc[editor_extracted_keywords_df['name']]
    .reset_index()
)
editor_person_id_df.head()

Unnamed: 0,name,person_id
0,Abby Dernburg,3357
1,Adam Aron,33041
2,Adam Frost,86941
3,Adam Linstedt,4372
4,Adèle Marston,7131


In [25]:
editor_tf_idf_vectorizer = TfidfVectorizer(
    tokenizer=identity_fn,
    token_pattern=None,
    lowercase=False,
    norm='l2',
    smooth_idf=False,
    sublinear_tf=False,
    min_df=1,
    max_df=1.0
)
print(editor_tf_idf_vectorizer)
editor_tf_idf = editor_tf_idf_vectorizer.fit_transform(
    editor_extracted_keywords_df['extracted_keywords']
)
editor_tf_idf

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f131d54ff80>)


<523x12964 sparse matrix of type '<class 'numpy.float64'>'
	with 397981 stored elements in Compressed Sparse Row format>

In [26]:
cosine_similarity(
    editor_tf_idf_vectorizer.transform(
        editor_extracted_keywords_df
        ['extracted_keywords'][:1]
    ),
    editor_tf_idf
)

array([[1.        , 0.01865128, 0.09016082, 0.0867965 , 0.44161049,
        0.03912625, 0.14962182, 0.07852633, 0.15375397, 0.13433561,
        0.14441691, 0.0267893 , 0.05778472, 0.08577248, 0.0564257 ,
        0.13305484, 0.06951396, 0.05366461, 0.03363445, 0.06050807,
        0.0962479 , 0.0477471 , 0.10845229, 0.03910047, 0.04455843,
        0.03815135, 0.11503525, 0.07611077, 0.11056966, 0.083612  ,
        0.16587795, 0.13124258, 0.08116211, 0.08188922, 0.10873183,
        0.26673158, 0.12013977, 0.04299838, 0.10193516, 0.0640219 ,
        0.05068233, 0.14498366, 0.18086921, 0.07906889, 0.03118948,
        0.073024  , 0.06278515, 0.07638611, 0.10858076, 0.06590993,
        0.10980194, 0.07394353, 0.10741368, 0.17023079, 0.09778446,
        0.08427098, 0.08333296, 0.02075586, 0.0507134 , 0.13022699,
        0.01610861, 0.14656821, 0.07161923, 0.54863553, 0.04140503,
        0.11216906, 0.0529313 , 0.07842697, 0.07986581, 0.05905642,
        0.20663311, 0.1187741 , 0.10691583, 0.05

In [27]:
print('saving to:', model_output_path)
serialize_object_to({
    'editor_tf_idf_vectorizer': editor_tf_idf_vectorizer,
    'editor_tf_idf': editor_tf_idf,
    'editor_names': editor_extracted_keywords_df['name'],
    'editor_person_ids': editor_person_id_df['person_id']
}, model_output_path)
print('done')

saving to: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/reviewing_editor_model.joblib
done
