In [1]:
# Notebook parameters; the defaults below point at the dev dataset and dev state location.

# GCP project: bound into read_big_query and substituted into the SQL templates.
project_id = 'elife-data-pipeline'
# BigQuery dataset substituted into the SQL templates (via default_query_props).
source_dataset = 'de_dev'
# NOTE(review): output_dataset and output_table_prefix are not referenced by any
# visible cell of this notebook - confirm whether they are still required.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# Despite the "_tf" suffix, these two values are passed as min_df / max_df to the
# manuscript TfidfVectorizer, i.e. they are document-frequency thresholds.
# Per scikit-learn, an int is an absolute document count and a float a proportion.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location (an S3 URL by default) under which the trained model is stored.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [2]:
from functools import partial

import re
import os
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import
import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.io import serialize_object_to
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# Destination of the serialized model written by the last cell of this notebook.
# NOTE(review): os.path.join uses the host OS path separator while state_path is a
# URL; this is only correct on POSIX hosts - confirm the notebook never runs on Windows.
model_output_path = os.path.join(state_path, 'senior_editor_model.joblib')

In [4]:
# Bind the configured project once so that the query cells only need to pass the SQL text.
read_big_query = partial(_read_big_query, project_id=project_id)

In [5]:
# Placeholder values substituted into every SQL template loaded through get_sql.
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [6]:
# Load one row per (manuscript version, editor relationship) including the editor's profile.
# NOTE(review): the rendered SQL below joins `prod.mv_Editorial_Person` while every other
# table comes from the configured dataset - confirm this is intended in
# senior-editor-assignments.sql.
manuscript_editor_assignment_df = read_big_query(
    get_sql('senior-editor-assignments.sql').format_map(default_query_props)
)
manuscript_editor_assignment_df.head()

> ```sql
> WITH t_related_person_id AS (
>   SELECT DISTINCT Version_ID AS version_id, 'Senior Editor' AS relationship_type, Person.Person_ID
>   FROM `elife-data-pipeline.de_dev.mv_Editorial_All_Manuscript_Version` AS Version
>   JOIN UNNEST(Version.Senior_Editors) AS Person
> 
>   UNION ALL
> 
>   SELECT DISTINCT Version_ID AS version_id, 'Reviewing Editor' AS relationship_type, Person.Person_ID
>   FROM `elife-data-pipeline.de_dev.mv_Editorial_All_Manuscript_Version` AS Version
>   JOIN UNNEST(Version.Reviewing_Editors) AS Person
> )
> 
> SELECT
>   Version.Manuscript_ID AS manuscript_id,
>   Version.Version_ID AS version_id,
>   Version.Overall_Stage AS overall_stage,
>   Version.QC_Complete_Timestamp AS qc_complete_timestamp,
>   Version.Position_In_Overall_Stage AS position_in_overall_stage,
>   related_person_id.relationship_type,
>   related_person_id.person_id AS person_id,
>   Related_Person.Name AS name,
>   ARRAY_TO_STRING(ARRAY(SELECT Role_Name FROM UNNEST(Related_Person.Roles)), '|') AS person_roles,
>   ARRAY_TO_STRING(ARRAY(SELECT Keyword FROM UNNEST(Related_Person.Keywords)), '|') AS person_keywords,
>   ARRAY_TO_STRING(ARRAY(SELECT Subject_Area_Name FROM UNNEST(Related_Person.Subject_Areas)), '|') AS person_subject_areas
> 
> FROM `elife-data-pipeline.de_dev.mv_Editorial_All_Manuscript_Version` AS Version
> LEFT JOIN t_related_person_id AS related_person_id
>   ON related_person_id.version_Id = Version.Version_ID
> LEFT JOIN `elife-data-pipeline.prod.mv_Editorial_Person` AS Related_Person
>   ON Related_Person.Person_ID = related_person_id.person_id
> ORDER BY Manuscript_ID, Version_ID, relationship_type, person_id
> ```

Downloading: 100%|██████████| 98896/98896 [00:15<00:00, 6518.60rows/s]


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas
0,1,00001/2012-06-13T16:00:00Z,Initial Submission,2012-06-13 12:00:00+00:00,1,Senior Editor,84742,Chris Ponting,Reviewing Editor|Editorial Board Member,computational genomics|lncrnas|transcription f...,Computational and Systems Biology|Genetics and...
1,1,00001/2012-06-18T18:37:20Z,Initial Submission,2012-06-18 15:21:37+00:00,2,Senior Editor,1013,Ian Baldwin,,evolutionary biology|plant biology|evolution a...,Ecology|Evolutionary Biology|Plant Biology
2,1,00001/2012-10-22T03:36:46Z,Full Submission,2012-10-26 05:37:50+00:00,1,Reviewing Editor,1141,Gil McVean,Guest Editor,,Computational and Systems Biology|Genetics and...
3,1,00001/2012-10-22T03:36:46Z,Full Submission,2012-10-26 05:37:50+00:00,1,Senior Editor,84742,Chris Ponting,Reviewing Editor|Editorial Board Member,computational genomics|lncrnas|transcription f...,Computational and Systems Biology|Genetics and...
4,2,00002/2012-05-19T16:00:00Z,Initial Submission,2012-05-19 12:00:00+00:00,1,Senior Editor,1032,Randy Schekman,Reviewing Editor|Editorial Board Member,membrane assembly|vesicular traffic|protein tr...,Cell Biology


In [7]:
# Load the extracted keywords per manuscript version; the query (rendered below) keeps
# only the most recently imported row for each version_id.
manuscript_version_extracted_keywords_df = read_big_query(
    get_sql('manuscript-version-extracted-keywords.sql').format_map(default_query_props)
)
print(len(manuscript_version_extracted_keywords_df))
manuscript_version_extracted_keywords_df.head()

> ```sql
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     manuscript_abstract_keywords.manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> )
> 
> SELECT
>   * EXCEPT(version_id_row_number)
> FROM t_manuscript_version_abstract_keywords
> WHERE version_id_row_number = 1
> ORDER BY version_id
> ```

Downloading: 100%|██████████| 49573/49573 [00:34<00:00, 1426.91rows/s]

49573





Unnamed: 0,manuscript_id,version_id,extracted_keywords
0,1,00001/2012-10-22T03:36:46Z,"[1,129,022 snp, achievement, action, analysis,..."
1,2,00002/2012-05-24T16:00:00Z,"[assembly, assembly factor, associate, associa..."
2,3,00003/2012-06-21T18:34:14Z,"[advantage, antibacterial, antibacterial defen..."
3,3,00003/2012-09-05T04:24:15Z,"[adult, adult fly, advantage, antibacterial, a..."
4,5,00005/2012-06-25T19:03:52Z,"[activity, aebp2, allosteric, allosteric role,..."


In [8]:
# Load the people holding the 'Senior Editor' role (person_id and name).
senior_editors_df = read_big_query(
    get_sql('senior-editors.sql').format_map(default_query_props)
)
print(len(senior_editors_df))
senior_editors_df.head()

> ```sql
> SELECT
>   Person_ID AS person_id,
>   Name AS name
> FROM `elife-data-pipeline.de_dev.mv_Editorial_Editor_Profile`
> WHERE Role_Name = 'Senior Editor'
> ```

Downloading: 100%|██████████| 65/65 [00:00<00:00, 79.62rows/s]

65





Unnamed: 0,person_id,name
0,42011,George Perry
1,1160,Suzanne Pfeffer
2,8518,Anna Akhmanova
3,8364,Piali Sengupta
4,1062,Jonathan Cooper


In [9]:
# Load the list of keywords that must never become part of the model vocabulary.
keyword_exclusion_df = read_big_query(
    get_sql('keyword-exclusion.sql').format_map(default_query_props)
)
print(len(keyword_exclusion_df))
keyword_exclusion_df.head()

> ```sql
> SELECT excluded_keyword
> FROM `elife-data-pipeline.de_dev.keyword_exclusion`
> ORDER BY excluded_keyword
> ```

Downloading: 100%|██████████| 100/100 [00:00<00:00, 443.26rows/s]

100





Unnamed: 0,excluded_keyword
0,a
1,about
2,across
3,additional
4,all


In [10]:
# Names of the senior editors, used below to restrict the assignment rows.
# NOTE(review): editors are matched by name rather than person_id - two people sharing
# a name would be treated as one editor; confirm this is acceptable.
senior_editor_names = set(senior_editors_df['name'].tolist())
len(senior_editor_names)

65

In [11]:
# Display the senior editor names alphabetically for a quick visual check.
sorted(senior_editor_names)

['Aleksandra Walczak',
 'Andrew King',
 'Anna Akhmanova',
 'Barbara Shinn-Cunningham',
 'Carla Rothlin',
 'Catherine Dulac',
 'Chris Baker',
 'Christian Büchel',
 'Christian Hardtke',
 'Christian Rutz',
 'Clifford Rosen',
 'Cynthia Wolberger',
 'David Ron',
 'Detlef Weigel',
 'Didier Stainier',
 'Diethard Tautz',
 'Dominique Soldati-Favre',
 'Eduardo Franco',
 'Edward Morrisey',
 'Floris de Lange',
 'Gary Westbrook',
 'George Perry',
 'Gisela Storz',
 'Huda Zoghbi',
 'James Manley',
 'Jessica Tyler',
 'John Huguenard',
 'John Kuriyan',
 'Jonathan Cooper',
 'Jos van der Meer',
 'Joshua Gold',
 'José Faraldo-Gómez',
 'K VijayRaghavan',
 'Karla Kirkegaard',
 'Kate Wassum',
 'Kathryn Cheah',
 'Kenton Swartz',
 'Kevin Struhl',
 'Laura Colgin',
 'Marianne Bronner',
 'Matthias Barton',
 'Maureen Murphy',
 'Michael Eisen',
 'Michael Frank',
 'Michael Marletta',
 'Miles Davenport',
 'Naama Barkai',
 'Neil Ferguson',
 'Olga Boudker',
 'Patricia Wittkopp',
 'Philip Cole',
 'Piali Sengupta',
 'Päi

In [12]:
# Attach the extracted keywords to every editor assignment of the same version.
# Inner join: assignments of versions without extracted keywords are dropped.
# manuscript_id is removed from the right-hand side so it is not duplicated.
manuscript_editor_assignment_with_extracted_keywords_df = pd.merge(
    manuscript_editor_assignment_df,
    manuscript_version_extracted_keywords_df.drop(columns=['manuscript_id']),
    how='inner',
    on='version_id',
    suffixes=('', '_extracted')
)
print(manuscript_editor_assignment_with_extracted_keywords_df.columns)
manuscript_editor_assignment_with_extracted_keywords_df.head()

Index(['manuscript_id', 'version_id', 'overall_stage', 'qc_complete_timestamp',
       'position_in_overall_stage', 'relationship_type', 'person_id', 'name',
       'person_roles', 'person_keywords', 'person_subject_areas',
       'extracted_keywords'],
      dtype='object')


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords
0,1,00001/2012-10-22T03:36:46Z,Full Submission,2012-10-26 05:37:50+00:00,1,Reviewing Editor,1141,Gil McVean,Guest Editor,,Computational and Systems Biology|Genetics and...,"[1,129,022 snp, achievement, action, analysis,..."
1,1,00001/2012-10-22T03:36:46Z,Full Submission,2012-10-26 05:37:50+00:00,1,Senior Editor,84742,Chris Ponting,Reviewing Editor|Editorial Board Member,computational genomics|lncrnas|transcription f...,Computational and Systems Biology|Genetics and...,"[1,129,022 snp, achievement, action, analysis,..."
2,2,00002/2012-05-24T16:00:00Z,Full Submission,2012-05-24 12:00:00+00:00,1,Reviewing Editor,1151,Jodi Nunnari,,,Biochemistry and Chemical Biology|Cell Biology,"[assembly, assembly factor, associate, associa..."
3,2,00002/2012-05-24T16:00:00Z,Full Submission,2012-05-24 12:00:00+00:00,1,Senior Editor,1032,Randy Schekman,Reviewing Editor|Editorial Board Member,membrane assembly|vesicular traffic|protein tr...,Cell Biology,"[assembly, assembly factor, associate, associa..."
4,3,00003/2012-06-21T18:34:14Z,Full Submission,2012-06-27 05:06:17+00:00,1,Reviewing Editor,1123,Roberto Kolter,Guest Editor,,Microbiology and Infectious Disease|Plant Biology,"[advantage, antibacterial, antibacterial defen..."


In [13]:
# Senior editor assignments on the first initial-submission version of a manuscript,
# limited to people who are in the senior editor list.
initial_submission_senior_editor_assignment_df = (
    manuscript_editor_assignment_with_extracted_keywords_df
    .loc[lambda df: (
        df['overall_stage'].eq('Initial Submission')
        & df['position_in_overall_stage'].eq(1)
        & df['relationship_type'].eq('Senior Editor')
        # rows without a name can never match a senior editor name
        & df['name'].fillna('').isin(senior_editor_names)
    )]
)
print(len(initial_submission_senior_editor_assignment_df))
initial_submission_senior_editor_assignment_df.head()

20291


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords
5159,5096,05096/2014-10-08T11:11:16Z,Initial Submission,2014-10-22 01:21:28+00:00,1,Senior Editor,1030,Detlef Weigel,Senior Editor|Deputy Editor,natural variation|epigenetics|evolutionary gen...,Evolutionary Biology|Genetics and Genomics|Pla...,"[aba, aba-independent, aba-independent signali..."
5168,5109,05109/2014-10-09T09:19:54Z,Initial Submission,2014-10-31 10:21:41+00:00,1,Senior Editor,1026,Tadatsugu Taniguchi,Senior Editor,inflammation|innate immunity|adaptive immunity...,Immunology and Inflammation,"[allele, blot, cd4, contrast, control, crf02_a..."
5264,5251,05251/2014-10-19T13:58:37Z,Initial Submission,2014-10-22 09:54:03+00:00,1,Senior Editor,1029,Huda Zoghbi,Senior Editor,neurobiology of disease|animal models of human...,Human Biology and Medicine|Neuroscience,"[anaesthetic propofol, analysis, anesthetic, b..."
5271,5258,05258/2014-10-20T14:29:42Z,Initial Submission,2014-10-21 04:34:27+00:00,1,Senior Editor,1014,Catherine Dulac,Senior Editor,cellular and molecular neuroscience|molecular ...,Neuroscience,"[3xtg-ad, 3xtg-ad mouse, ad, ad patient, adeno..."
5275,5264,05264/2014-10-20T21:09:37Z,Initial Submission,2014-10-21 04:46:52+00:00,1,Senior Editor,1029,Huda Zoghbi,Senior Editor,neurobiology of disease|animal models of human...,Human Biology and Medicine|Neuroscience,"[activation, activity, brain, central, central..."


In [14]:
# Senior editors who acted as the reviewing editor on the first full-submission
# version of a manuscript.
full_submission_senior_as_reviewing_editor_assignment_df = (
    manuscript_editor_assignment_with_extracted_keywords_df
    .loc[lambda df: (
        df['overall_stage'].eq('Full Submission')
        & df['position_in_overall_stage'].eq(1)
        & df['relationship_type'].eq('Reviewing Editor')
        # rows without a name can never match a senior editor name
        & df['name'].fillna('').isin(senior_editor_names)
    )]
)
print(len(full_submission_senior_as_reviewing_editor_assignment_df))
full_submission_senior_as_reviewing_editor_assignment_df.head()

1696


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords
8,5,00005/2012-06-25T19:03:52Z,Full Submission,2012-06-28 16:34:47+00:00,1,Reviewing Editor,1189,Kevin Struhl,Senior Editor,chromatin biology|cancer biology|transcription...,Cell Biology|Chromosomes and Gene Expression,"[activity, aebp2, allosteric, allosteric role,..."
12,7,00007/2012-05-07T16:00:00Z,Full Submission,2012-05-07 12:00:00+00:00,1,Reviewing Editor,1030,Detlef Weigel,Senior Editor|Deputy Editor,natural variation|epigenetics|evolutionary gen...,Evolutionary Biology|Genetics and Genomics|Pla...,"[animal, antidigestive, antidigestive trypsin ..."
50,45,00045/2012-07-09T20:34:46Z,Full Submission,2012-07-17 06:20:02+00:00,1,Reviewing Editor,1191,Diethard Tautz,Senior Editor,evolutionary genetics|population genetics|mole...,Evolutionary Biology|Genetics and Genomics,"[15 yeast specie, accompany, accompanying chan..."
64,51,00051/2012-07-18T23:32:31Z,Full Submission,2012-07-20 06:41:37+00:00,1,Reviewing Editor,1086,Eduardo Franco,Senior Editor,cancer epidemiology|cancer prevention|cancer s...,Cancer Biology|Epidemiology and Global Health|...,"[1913, 1913 michaelis-menten enzyme kinetics, ..."
108,102,00102/2012-07-28T17:54:36Z,Full Submission,2012-07-30 09:37:19+00:00,1,Reviewing Editor,1026,Tadatsugu Taniguchi,Senior Editor,inflammation|innate immunity|adaptive immunity...,Immunology and Inflammation,"[13 nucleotide, 23s, 23s ribosomal rna, 23s rr..."


In [15]:
# Training assignments: every initial-submission senior editor assignment, plus the
# full-submission "senior editor as reviewing editor" assignment for manuscripts
# that have no initial-submission row.
senior_editor_assignment_df = pd.concat([
    initial_submission_senior_editor_assignment_df,
    full_submission_senior_as_reviewing_editor_assignment_df.loc[
        lambda df: ~df['manuscript_id'].isin(
            initial_submission_senior_editor_assignment_df['manuscript_id']
        )
    ]
])
print(len(senior_editor_assignment_df))
senior_editor_assignment_df.head()

21228


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords
5159,5096,05096/2014-10-08T11:11:16Z,Initial Submission,2014-10-22 01:21:28+00:00,1,Senior Editor,1030,Detlef Weigel,Senior Editor|Deputy Editor,natural variation|epigenetics|evolutionary gen...,Evolutionary Biology|Genetics and Genomics|Pla...,"[aba, aba-independent, aba-independent signali..."
5168,5109,05109/2014-10-09T09:19:54Z,Initial Submission,2014-10-31 10:21:41+00:00,1,Senior Editor,1026,Tadatsugu Taniguchi,Senior Editor,inflammation|innate immunity|adaptive immunity...,Immunology and Inflammation,"[allele, blot, cd4, contrast, control, crf02_a..."
5264,5251,05251/2014-10-19T13:58:37Z,Initial Submission,2014-10-22 09:54:03+00:00,1,Senior Editor,1029,Huda Zoghbi,Senior Editor,neurobiology of disease|animal models of human...,Human Biology and Medicine|Neuroscience,"[anaesthetic propofol, analysis, anesthetic, b..."
5271,5258,05258/2014-10-20T14:29:42Z,Initial Submission,2014-10-21 04:34:27+00:00,1,Senior Editor,1014,Catherine Dulac,Senior Editor,cellular and molecular neuroscience|molecular ...,Neuroscience,"[3xtg-ad, 3xtg-ad mouse, ad, ad patient, adeno..."
5275,5264,05264/2014-10-20T21:09:37Z,Initial Submission,2014-10-21 04:46:52+00:00,1,Senior Editor,1029,Huda Zoghbi,Senior Editor,neurobiology of disease|animal models of human...,Human Biology and Medicine|Neuroscience,"[activation, activity, brain, central, central..."


In [16]:
# Sanity check: each manuscript is expected to appear once; show any rows whose
# manuscript_id occurs more than once (an empty frame means there are none).
print('unique manuscript ids:', senior_editor_assignment_df['manuscript_id'].nunique())
print('duplicate manuscript ids (if any):')
senior_editor_assignment_df.loc[
    lambda df: df['manuscript_id'].isin(
        df
        .groupby('manuscript_id')
        .size()
        .loc[lambda counts: counts > 1]
        .index
    )
]

unique manuscript ids: 21228
duplicate manuscript ids (if any):


Unnamed: 0,manuscript_id,version_id,overall_stage,qc_complete_timestamp,position_in_overall_stage,relationship_type,person_id,name,person_roles,person_keywords,person_subject_areas,extracted_keywords


In [17]:
# Fit a vectorizer on the per-manuscript keyword lists purely to select the vocabulary:
# min_df / max_df drop keywords that occur in too few or too many manuscripts.
# The keywords are already tokenized lists, hence the identity tokenizer,
# token_pattern=None and lowercase=False.
tf_idf_vectorizer = TfidfVectorizer(
    tokenizer=identity_fn,
    token_pattern=None,
    lowercase=False,
    min_df=manuscript_min_tf,
    max_df=manuscript_max_tf
)
print(tf_idf_vectorizer)
tf_idf_vectorizer.fit(
    senior_editor_assignment_df['extracted_keywords']
)
# Read the retained terms from the fitted vocabulary_ mapping (term -> feature index).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, whereas
# vocabulary_ holds the same terms and is available in every version.
all_keywords_set = set(tf_idf_vectorizer.vocabulary_)
len(all_keywords_set)

TfidfVectorizer(lowercase=False, max_df=0.9, min_df=10, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f47dfe6dcb0>)


13756

In [18]:
# Peek at the alphabetically first keywords; the output shows entries starting with
# punctuation, which the next cell filters out.
print(sorted(all_keywords_set)[:10])

[', - mouse', '- cell', '- mouse', '- neuron', '-)', '-) mouse', '-binding', '-dependent', '-end', '-independent']


In [19]:
# Keep only keywords whose first character is an ASCII letter; this removes entries
# that start with punctuation, digits or whitespace.
all_keywords_set = set(filter(
    lambda keyword: re.match(r'^[a-zA-Z]', keyword),
    all_keywords_set
))
print('all_keywords_set len (after filter):', len(all_keywords_set))
print(sorted(all_keywords_set)[:10])

all_keywords_set len (after filter): 13677
['a virus', 'a1', 'a2', 'a2a', 'a2a receptor', 'a2ar', 'a549', 'aa', 'aaa', 'aaa+']


In [20]:
# Remove the explicitly excluded keywords from the vocabulary.
all_keywords_set = all_keywords_set.difference(keyword_exclusion_df['excluded_keyword'])
print('all_keywords_set len (after exclusion):', len(all_keywords_set))

all_keywords_set len (after exclusion): 13658


In [21]:
# Build one "document" per editor: the keywords of all manuscripts assigned to that
# editor, flattened into a single list and restricted to the selected vocabulary.
# Repeated keywords are kept, so a keyword appears once per manuscript that has it.
editor_extracted_keywords_df = (
    senior_editor_assignment_df
    [['name', 'extracted_keywords']]
    .groupby('name')
    .agg(
        # keywords_list holds the keyword lists of one editor's manuscripts
        lambda keywords_list: [
            keyword
            for keywords in keywords_list
            for keyword in keywords
            if keyword in all_keywords_set
        ]
    )
    .reset_index()
    # the row order defines the row order of the editor matrix fitted further below
    .sort_values('name')
)
editor_extracted_keywords_df.head()

Unnamed: 0,name,extracted_keywords
0,Aleksandra Walczak,"[antibiotic, antibiotic resistance, antibiotic..."
1,Andrew King,"[analysis, application, auditory, auditory sys..."
2,Anna Akhmanova,"[acid, acquire, analysis, approach, base, body..."
3,Barbara Shinn-Cunningham,"[activity, attend, attention, attention task, ..."
4,Carla Rothlin,"[activation, adhesion, adhesion molecule, adul..."


In [22]:
# Resolve one person_id per editor name (the last non-null one seen), ordered exactly
# like editor_extracted_keywords_df so both frames stay aligned row by row.
# The .loc lookup raises a KeyError if an editor name has no person_id at all.
editor_person_id_df = (
    senior_editor_assignment_df
    .dropna(subset=['name', 'person_id'])
    .groupby('name')
    [['person_id']]
    .last()
    .loc[editor_extracted_keywords_df['name']]
    .reset_index()
)
editor_person_id_df.head()

Unnamed: 0,name,person_id
0,Aleksandra Walczak,50904
1,Andrew King,14601
2,Anna Akhmanova,8518
3,Barbara Shinn-Cunningham,19576
4,Carla Rothlin,44396


In [23]:
# Vectorizer for the per-editor keyword documents. The keyword lists were already
# restricted to the selected vocabulary, so no further document-frequency pruning is
# applied here (min_df=1, max_df=1.0). Input is pre-tokenized, hence the identity
# tokenizer, token_pattern=None and lowercase=False.
editor_tf_idf_vectorizer = TfidfVectorizer(
    lowercase=False,
    token_pattern=None,
    tokenizer=identity_fn,
    min_df=1,
    max_df=1.0,
    sublinear_tf=False,
    smooth_idf=False,
    norm='l2'
)
print(editor_tf_idf_vectorizer)
# One row per editor, in the row order of editor_extracted_keywords_df.
editor_tf_idf = editor_tf_idf_vectorizer.fit_transform(
    editor_extracted_keywords_df['extracted_keywords']
)
editor_tf_idf

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f47dfe6dcb0>)


<65x13658 sparse matrix of type '<class 'numpy.float64'>'
	with 218399 stored elements in Compressed Sparse Row format>

In [24]:
# Sanity check: similarity of the first editor's keyword document against every
# editor; the first value is expected to be 1.0 (the editor compared with itself).
cosine_similarity(
    editor_tf_idf_vectorizer.transform(
        editor_extracted_keywords_df['extracted_keywords'].head(1)
    ),
    editor_tf_idf
)

array([[1.        , 0.39172274, 0.57999512, 0.30461302, 0.2722393 ,
        0.46506895, 0.19326803, 0.36196061, 0.45019283, 0.38070011,
        0.40239325, 0.45474642, 0.42133389, 0.61959774, 0.54927467,
        0.58436329, 0.3746185 , 0.41991623, 0.39248737, 0.34839466,
        0.41116685, 0.18148192, 0.57193647, 0.45155641, 0.47380054,
        0.46340737, 0.31475368, 0.49943468, 0.51784213, 0.36305111,
        0.39351185, 0.44418235, 0.51120487, 0.42694166, 0.32130984,
        0.45012896, 0.34297685, 0.49465634, 0.3726109 , 0.51883197,
        0.30837512, 0.42115115, 0.47784149, 0.4024741 , 0.51818649,
        0.30039957, 0.73847253, 0.47819954, 0.42458346, 0.58634041,
        0.5264267 , 0.34066349, 0.44534476, 0.46810717, 0.39727562,
        0.44313471, 0.46225262, 0.47576683, 0.45983994, 0.49281062,
        0.19098133, 0.41624576, 0.47603886, 0.51410902, 0.52315945]])

In [25]:
# Persist everything needed to score new manuscripts against the editors: the fitted
# vectorizer, the editor TF-IDF matrix and the editor names / person ids.
# editor_names and editor_person_ids come from two frames built in the same row
# order as the matrix rows.
print('saving to:', model_output_path)
serialize_object_to({
    'editor_tf_idf_vectorizer': editor_tf_idf_vectorizer,
    'editor_tf_idf': editor_tf_idf,
    'editor_names': editor_extracted_keywords_df['name'],
    'editor_person_ids': editor_person_id_df['person_id']
}, model_output_path)
print('done')

saving to: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/senior_editor_model.joblib
done
