In [1]:
# Notebook configuration. Later cells read these values by name.
# NOTE(review): presumably overridden per environment by whatever runs this notebook — confirm.
# Project passed to the BigQuery read helper, the SQL template and the final upload.
project_id = 'elife-data-pipeline'
# Dataset substituted into the source SQL query.
source_dataset = 'prod'
# Dataset and table-name prefix used to build the output table name.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# The two term-frequency settings are not referenced by any cell visible in this notebook.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location under which the saved model file is looked up.
state_path = 's3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev'

In [2]:
import os
import posixpath
from functools import partial
from typing import List, Tuple, TypeVar

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# state_path is a URI (e.g. s3://...), so it is joined with forward slashes via
# posixpath rather than os.path, whose separator depends on the host platform.
model_path = posixpath.join(state_path, 'senior_editor_model.joblib')
# Output table name in the form "<dataset>.<prefix>editor_recommendation".
recommendation_output_table_name = '{output_dataset}.{prefix}{suffix}'.format(
    output_dataset=output_dataset,
    prefix=output_table_prefix,
    suffix='editor_recommendation'
)

In [4]:
# Load the saved model state and show which entries it provides.
# NOTE(review): load_object_from is opaque from here; the .joblib extension suggests
# pickle-style deserialization, so model_path should only point at trusted storage — confirm.
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev/senior_editor_model.joblib


dict_keys(['editor_tf_idf_vectorizer', 'doc_length_weighted_editor_tf_idf', 'editor_names'])

In [5]:
# Vectorizer stored with the model; used below to transform the manuscript keywords.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f1ca3d33c20>)

In [6]:
# Editor TF-IDF matrix stored with the model; compared against the manuscript vectors below.
# NOTE(review): presumably one row per editor, in the same order as editor_names — confirm.
doc_length_weighted_editor_tf_idf = model_dict['doc_length_weighted_editor_tf_idf']
doc_length_weighted_editor_tf_idf

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00139181, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00150984, 0.00109358, 0.        , ..., 0.        , 0.00292823,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.0175357 ,
         0.        ],
        [0.        , 0.00100183, 0.        , ..., 0.00068434, 0.00076645,
         0.        ],
        [0.00065094, 0.00141444, 0.        , ..., 0.        , 0.00216421,
         0.        ]])

In [7]:
# Editor names stored with the model; used below to label each similarity score.
editor_names = model_dict['editor_names']
editor_names

0           Aleksandra Walczak
1                  Andrew King
2               Anna Akhmanova
3     Barbara Shinn-Cunningham
4                Carla Rothlin
                ...           
60                 Tamar Makin
61             Timothy Behrens
62              Utpal Banerjee
63              Vivek Malhotra
64               Wendy Garrett
Name: name, Length: 65, dtype: object

In [8]:
# Bind the project once so that later calls only need to pass the SQL text.
read_big_query = partial(_read_big_query, project_id=project_id)

In [9]:
# Placeholder values substituted into the SQL template before it is run.
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [10]:
# Fill the project/dataset placeholders of the SQL template, then run the query.
initial_submissions_sql = get_sql(
    'manuscript-version-initial-submissions-for-senior-editor-recommendation.sql'
).format(**default_query_props)
manuscript_version_for_recommendation_df = read_big_query(initial_submissions_sql)
manuscript_version_for_recommendation_df.head()

> ```sql
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     LPAD(CAST(manuscript_abstract_keywords.manuscript_id AS STRING), 5, '0') AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.prod.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.prod.mv_manuscript_version` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND ARRAY_LENGTH(version.senior_editors) = 0
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
> ```

Downloading: 100%|██████████| 135/135 [00:00<00:00, 240.71rows/s]


Unnamed: 0,version_id,extracted_keywords
0,49378/2019-06-15T23:34:46Z,"[- wgs, 16s, 16s rrna gene sequencing, 16s seq..."
1,49596/2019-06-23T05:30:37Z,"[accurate, accurate localisation, auditory, au..."
2,50466/2019-07-23T16:01:08Z,"[-ward, \textit{klebsiella, \textit{klebsiella..."
3,50710/2019-07-30T17:40:09Z,"[activity, activity onset, avlpfc, avlpfc), av..."
4,51092/2019-08-14T17:24:40Z,"[ability, autism, autism spectrum, battery, br..."


In [11]:
# Vectorize each manuscript's extracted keywords with the editor vectorizer.
manuscript_keyword_tf_idf = editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df['extracted_keywords']
)

# The stored editor matrix is displayed as an np.matrix, a type that recent
# scikit-learn releases refuse in input validation. Convert it to a plain
# ndarray; any other type is passed through unchanged.
editor_tf_idf = doc_length_weighted_editor_tf_idf
if isinstance(editor_tf_idf, np.matrix):
    editor_tf_idf = np.asarray(editor_tf_idf)

# Similarity of every manuscript (rows) to every editor (columns).
keyword_similarity = cosine_similarity(manuscript_keyword_tf_idf, editor_tf_idf)
print(keyword_similarity.max())
keyword_similarity

0.3355773628462569


array([[0.0816525 , 0.05653703, 0.06520036, ..., 0.06783907, 0.05629434,
        0.25762246],
       [0.0715818 , 0.25775286, 0.0803795 , ..., 0.06152131, 0.08123971,
        0.05404424],
       [0.13542505, 0.06842914, 0.08114764, ..., 0.06294691, 0.06878266,
        0.12591147],
       ...,
       [0.07888089, 0.06324242, 0.12455613, ..., 0.09481188, 0.1221229 ,
        0.09331774],
       [0.0785239 , 0.06627219, 0.09322913, ..., 0.08578878, 0.0754884 ,
        0.12426033],
       [0.1157139 , 0.09222445, 0.10315648, ..., 0.0922783 , 0.07615232,
        0.12924913]])

In [12]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        indices: List[T],
        threshold: float = 0.5) -> List[List[Tuple[float, T]]]:
    """Pair each row's scores with their keys and keep those reaching the threshold.

    Args:
        proba_matrix: one row of scores per item; the scores within a row are
            matched positionally with ``indices``.
        indices: the key belonging to each score position (e.g. an editor name).
        threshold: minimum score (inclusive) for a pair to be kept.

    Returns:
        For every row, the ``(score, key)`` pairs that reached the threshold,
        ordered from the highest score to the lowest. Pairs with equal scores
        keep the order in which their keys appear in ``indices``.
    """
    return [
        sorted(
            ((score, key) for score, key in zip(row, indices) if score >= threshold),
            # Sort on the score alone: comparing whole tuples would fall back to
            # comparing the keys on tied scores, which raises TypeError for keys
            # that define no ordering.
            key=lambda scored_key: scored_key[0],
            reverse=True
        )
        for row in proba_matrix
    ]


# For each manuscript, the (similarity, editor name) pairs that reach the cut-off.
recommended_editors_per_manuscript = get_recommended_editors_with_probability(
    keyword_similarity,
    editor_names,
    threshold=0.001
)
# Reuse the manuscript frame's index so the results line up with its rows.
prediction_results_with_similarity = pd.Series(
    recommended_editors_per_manuscript,
    index=manuscript_version_for_recommendation_df.index
)
prediction_results_with_similarity[:5]

0    [(0.2576224636285809, Wendy Garrett), (0.12859...
1    [(0.25775285504800116, Andrew King), (0.216688...
2    [(0.3355773628462569, Miles Davenport), (0.218...
3    [(0.3242674309862605, Laura Colgin), (0.278114...
4    [(0.18513078658363272, Floris de Lange), (0.17...
dtype: object

In [13]:
# Put each version id next to its list of scored editors; both share one index.
prediction_results_df = pd.DataFrame({
    'version_id': manuscript_version_for_recommendation_df['version_id'],
    'prediction': prediction_results_with_similarity,
})
print(len(prediction_results_df))
prediction_results_df.head()

135


Unnamed: 0,version_id,prediction
0,49378/2019-06-15T23:34:46Z,"[(0.2576224636285809, Wendy Garrett), (0.12859..."
1,49596/2019-06-23T05:30:37Z,"[(0.25775285504800116, Andrew King), (0.216688..."
2,50466/2019-07-23T16:01:08Z,"[(0.3355773628462569, Miles Davenport), (0.218..."
3,50710/2019-07-30T17:40:09Z,"[(0.3242674309862605, Laura Colgin), (0.278114..."
4,51092/2019-08-14T17:24:40Z,"[(0.18513078658363272, Floris de Lange), (0.17..."


In [14]:
# Show the full list of scored editors for the row labelled 0 as a spot check.
prediction_results_df['prediction'][0]

[(0.2576224636285809, 'Wendy Garrett'),
 (0.12859540922753968, 'Eduardo Franco'),
 (0.1239735226507961, 'Detlef Weigel'),
 (0.12083610116492209, 'Tadatsugu Taniguchi'),
 (0.11690343142169272, 'Satyajit Rath'),
 (0.10780433478211955, 'Neil Ferguson'),
 (0.10633318208607495, 'Jos van der Meer'),
 (0.09788480964181845, 'Matthias Barton'),
 (0.09648800393877986, 'Gisela Storz'),
 (0.0903316076481428, 'Huda Zoghbi'),
 (0.0870382552375424, 'Naama Barkai'),
 (0.08536416496871513, 'Diethard Tautz'),
 (0.08454969300596514, 'Patricia Wittkopp'),
 (0.0816524968309963, 'Aleksandra Walczak'),
 (0.08013881731354638, 'Päivi Ojala'),
 (0.07984502063884127, 'Carla Rothlin'),
 (0.0794001345761173, 'Christian Büchel'),
 (0.07916325360513271, 'Maureen Murphy'),
 (0.07877346807192248, 'Michael Eisen'),
 (0.07827072752639062, 'Catherine Dulac'),
 (0.0762697990682644, 'Philip Cole'),
 (0.07589600781608948, 'Kathryn Cheah'),
 (0.0754383537608571, 'Didier Stainier'),
 (0.07175171987774549, 'Gary Westbrook'),
 

In [15]:
# One output row per (manuscript version, recommended editor) pair.
# The column list is passed explicitly so the frame keeps the same three
# columns even when no pair reached the threshold and the record list is empty.
prediction_results_flat_df = pd.DataFrame(
    [
        {'version_id': row.version_id, 'score': score, 'name': name}
        for row in prediction_results_df.itertuples()
        for score, name in row.prediction
    ],
    columns=['version_id', 'score', 'name']
)
print(len(prediction_results_flat_df))
prediction_results_flat_df.head()

8639


Unnamed: 0,version_id,score,name
0,49378/2019-06-15T23:34:46Z,0.257622,Wendy Garrett
1,49378/2019-06-15T23:34:46Z,0.128595,Eduardo Franco
2,49378/2019-06-15T23:34:46Z,0.123974,Detlef Weigel
3,49378/2019-06-15T23:34:46Z,0.120836,Tadatsugu Taniguchi
4,49378/2019-06-15T23:34:46Z,0.116903,Satyajit Rath


In [16]:
# Upload the flattened recommendations to the configured BigQuery table.
# if_exists='replace' means an existing table of the same name is overwritten on every run.
print('writing to:', recommendation_output_table_name)
prediction_results_flat_df.to_gbq(
    recommendation_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')

writing to: de_dev.data_science_editor_recommendation


1it [00:05,  5.08s/it]

done



