In [None]:
# Run parameters for this notebook.
# NOTE(review): these look like defaults that the pipeline runner overrides per
# environment (the values point at a "dev" dataset/state path) - confirm.

# Passed to the BigQuery reader and substituted into the SQL templates as `project`.
project_id = 'elife-data-pipeline'
# Substituted into the SQL templates as `dataset`.
source_dataset = 'de_dev'
# NOTE(review): `output_dataset` and `output_table_prefix` are not referenced by
# any cell visible here - confirm whether they are still needed.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# Despite the `_tf` suffix, these two are used as the `min_df` / `max_df`
# arguments of the vocabulary-selecting TfidfVectorizer.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location under which the trained model file is written.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [None]:
from functools import partial

import re
import os

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import
import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.io import serialize_object_to
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [None]:
# Target location for the serialised reviewing-editor model.
# NOTE(review): `state_path` defaults to an s3:// URL, while os.path.join uses the
# host OS path separator - this only yields a valid URL on POSIX hosts; confirm.
model_output_path = os.path.join(state_path, 'reviewing_editor_model.joblib')

In [None]:
# Bind the project id once so the query cells only need to pass the SQL text.
read_big_query = partial(_read_big_query, project_id=project_id)

In [None]:
# Placeholder values substituted into every SQL template loaded via `get_sql`.
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [None]:
# Load the editor assignments per manuscript version.
manuscript_editor_assignment_df = read_big_query(
    get_sql('reviewing-editor-assignments.sql').format_map(default_query_props)
)
manuscript_editor_assignment_df.head()

In [None]:
# Load the keywords extracted for each manuscript version.
manuscript_version_extracted_keywords_df = read_big_query(
    get_sql('manuscript-version-extracted-keywords.sql').format_map(default_query_props)
)
print(manuscript_version_extracted_keywords_df.shape[0])
manuscript_version_extracted_keywords_df.head()

In [None]:
# Load the keywords extracted from publications; the keyword column is renamed so
# it lines up with the manuscript keyword column when the frames are combined.
publications_extracted_keywords_df = (
    read_big_query(
        get_sql('publications_extracted_keywords.sql').format_map(default_query_props)
    )
    .rename(columns={'abstract_keywords': 'extracted_keywords'})
)
print(publications_extracted_keywords_df.shape[0])
publications_extracted_keywords_df.head()

In [None]:
# Load the publication ids associated with each editor.
editor_publication_ids_df = read_big_query(
    get_sql('editor_publication_ids.sql').format_map(default_query_props)
)
print(editor_publication_ids_df.shape[0])
editor_publication_ids_df.head()

In [None]:
# Load the reviewing editors (their names drive the assignment filter further down).
reviewing_editors_df = read_big_query(
    get_sql('reviewing-editors.sql').format_map(default_query_props)
)
print(reviewing_editors_df.shape[0])
reviewing_editors_df.head()

In [None]:
# Load the keywords that must be removed from the vocabulary.
keyword_exclusion_df = read_big_query(
    get_sql('keyword-exclusion.sql').format_map(default_query_props)
)
print(keyword_exclusion_df.shape[0])
keyword_exclusion_df.head()

In [None]:
# Distinct reviewing editor names, used for membership tests below.
reviewing_editor_names = {
    editor_name
    for editor_name in reviewing_editors_df['name']
}
len(reviewing_editor_names)

In [None]:
# Display the editor names alphabetically for a visual check.
sorted(reviewing_editor_names)

In [None]:
# Attach the extracted keywords to each assignment row (inner join on the
# manuscript version); clashing keyword-side columns get an `_extracted` suffix.
manuscript_editor_assignment_with_extracted_keywords_df = pd.merge(
    manuscript_editor_assignment_df,
    manuscript_version_extracted_keywords_df,
    on='version_id',
    suffixes=('', '_extracted')
)
print(manuscript_editor_assignment_with_extracted_keywords_df.columns)
manuscript_editor_assignment_with_extracted_keywords_df.head()

In [None]:
# Keep only assignments of a known reviewing editor, in the "Reviewing Editor"
# role, on the first version of the "Full Submission" stage.
full_submission_reviewing_editor_assignment_df = (
    manuscript_editor_assignment_with_extracted_keywords_df
    .loc[
        lambda df: (
            (df['overall_stage'] == 'Full Submission')
            & (df['position_in_overall_stage'] == 1)
            & (df['relationship_type'] == 'Reviewing Editor')
            # missing names become '' so they are tested like any other string
            & df['name'].fillna('').isin(reviewing_editor_names)
        )
    ]
)
print(full_submission_reviewing_editor_assignment_df.shape[0])
full_submission_reviewing_editor_assignment_df.head()

In [None]:
# Sanity check: report the number of distinct manuscripts and show every row whose
# manuscript id occurs more than once in the filtered assignments.
print('unique manuscript ids:', full_submission_reviewing_editor_assignment_df['manuscript_id'].nunique())
print('duplicate manuscript ids (if any):')
full_submission_reviewing_editor_assignment_df.loc[
    lambda df: df['manuscript_id'].isin(
        df['manuscript_id']
        .value_counts()
        .pipe(lambda counts: counts[counts > 1])
        .index
    )
]

In [None]:
# Editor -> publication ids -> publication keywords, joined with inner merges and
# ordered by publication then person for a stable row order.
editor_publications_with_extracted_keywords_df = (
    pd.merge(
        reviewing_editors_df[['person_id', 'name']],
        editor_publication_ids_df[
            ['person_id', 'publication_id', 'is_relevant_pubmed_id', 'is_search_pubmed_id']
        ],
        on='person_id'
    )
    .pipe(
        pd.merge,
        publications_extracted_keywords_df,
        on='publication_id',
        suffixes=('', '_extracted')
    )
    .sort_values(['publication_id', 'person_id'])
    .copy()
)
print(editor_publications_with_extracted_keywords_df.shape[0])
editor_publications_with_extracted_keywords_df.head(3)

In [None]:
# Stack the manuscript assignments with the publications of those editors that
# have at least one assignment row, so both feed the per-editor keyword lists.
full_submission_reviewing_editor_assignment_full_df = pd.concat([
    full_submission_reviewing_editor_assignment_df,
    editor_publications_with_extracted_keywords_df.loc[
        lambda df: df['name'].isin(
            full_submission_reviewing_editor_assignment_df['name']
        )
    ]
])
print(full_submission_reviewing_editor_assignment_full_df.shape[0])
full_submission_reviewing_editor_assignment_full_df.head(3)

In [None]:
# Fit a vectoriser solely to select the keyword vocabulary: min_df / max_df drop
# keywords that occur in too few or too many documents.
# NOTE(review): assumes each `extracted_keywords` value is already a sequence of
# keyword strings and that `identity_fn` returns its argument unchanged - confirm.
tf_idf_vectorizer = TfidfVectorizer(
    tokenizer=identity_fn,
    token_pattern=None,  # unused because a custom tokenizer is supplied
    lowercase=False,
    min_df=manuscript_min_tf,
    max_df=manuscript_max_tf
)
print(tf_idf_vectorizer)
tf_idf_vectorizer.fit(
    full_submission_reviewing_editor_assignment_full_df['extracted_keywords']
)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The keys of the fitted `vocabulary_` mapping are exactly the retained terms and
# are available in every scikit-learn version, so read the vocabulary from there.
all_keywords_set = set(tf_idf_vectorizer.vocabulary_)
len(all_keywords_set)

In [None]:
# Peek at the first few keywords (alphabetical) before any filtering is applied.
print(sorted(all_keywords_set)[:10])

In [None]:
# Keep only keywords that begin with an ASCII letter and do not start with the
# article "a " (i.e. "a" followed by a space).
all_keywords_set = {
    candidate
    for candidate in all_keywords_set
    if re.match(r'^[a-zA-Z]', candidate)
    if not candidate.startswith('a ')
}
print('all_keywords_set len (after filter):', len(all_keywords_set))
print(sorted(all_keywords_set)[:10])

In [None]:
# Remove every keyword listed in the exclusion table.
all_keywords_set = all_keywords_set.difference(
    keyword_exclusion_df['excluded_keyword']
)
print('all_keywords_set len (after exclusion):', len(all_keywords_set))

In [None]:
# Build one flat keyword list per editor name: all keyword lists of the editor's
# rows are concatenated (duplicates kept), retaining only keywords that survived
# the vocabulary filtering above.
# NOTE(review): relies on pandas accepting a Python list as the aggregated value
# and on every `extracted_keywords` value being iterable (no nulls) - confirm.
editor_extracted_keywords_df = (
    full_submission_reviewing_editor_assignment_full_df
    [['name', 'extracted_keywords']]
    .groupby('name')
    .agg(
        # `keywords_list` is the column of keyword lists for one editor
        lambda keywords_list: [
            keyword
            for keywords in keywords_list
            for keyword in keywords
            if keyword in all_keywords_set
        ]
    )
    .reset_index()
    # the row order defines the row order of the TF-IDF matrix fitted later
    .sort_values('name')
)
editor_extracted_keywords_df.head()

In [None]:
# One person id per editor name (the last non-null pairing seen in the assignment
# rows), re-ordered to follow the row order of `editor_extracted_keywords_df` so
# names and person ids stay positionally aligned.
editor_person_id_df = (
    full_submission_reviewing_editor_assignment_df
    .loc[:, ['name', 'person_id']]
    .dropna()
    # with nulls already dropped, the last row per name is that name's last value
    .drop_duplicates(subset='name', keep='last')
    .set_index('name')
    .loc[editor_extracted_keywords_df['name']]
    .reset_index()
)
editor_person_id_df.head()

In [None]:
# Fit the model that is actually persisted: one TF-IDF row per editor, built from
# the editor's already-filtered keyword list.
editor_tf_idf_vectorizer = TfidfVectorizer(
    # input handling: the keyword lists are passed through the tokenizer as-is
    tokenizer=identity_fn,
    token_pattern=None,
    lowercase=False,
    # vocabulary: keep every keyword, the filtering happened beforehand
    min_df=1,
    max_df=1.0,
    # weighting
    sublinear_tf=False,
    smooth_idf=False,
    norm='l2'
)
print(editor_tf_idf_vectorizer)
editor_tf_idf = editor_tf_idf_vectorizer.fit_transform(
    editor_extracted_keywords_df['extracted_keywords']
)
editor_tf_idf

In [None]:
# Smoke test: similarity of the first editor's keywords against every editor row.
cosine_similarity(
    editor_tf_idf_vectorizer.transform(
        editor_extracted_keywords_df['extracted_keywords'].head(1)
    ),
    editor_tf_idf
)

In [None]:
# Persist the fitted vectoriser, the editor TF-IDF matrix and the two series that
# label its rows (editor names and person ids) as a single object.
print('saving to:', model_output_path)
serialize_object_to(
    dict(
        editor_tf_idf_vectorizer=editor_tf_idf_vectorizer,
        editor_tf_idf=editor_tf_idf,
        editor_names=editor_extracted_keywords_df['name'],
        editor_person_ids=editor_person_id_df['person_id']
    ),
    model_output_path
)
print('done')