In [1]:
# Notebook parameters.
# NOTE(review): presumably overridden per environment by whatever runs this notebook — confirm.

# Project passed to the BigQuery read and write helpers below.
project_id = 'elife-data-pipeline'
# Dataset substituted into the source SQL query.
source_dataset = 'de_dev'
# Dataset and table-name prefix used to build the recommendation output table name.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# NOTE(review): these two term-frequency bounds are not referenced by any cell visible
# in this notebook — confirm whether they are still needed.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location of the persisted state; the trained model file is resolved relative to it.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [2]:
import os
from functools import partial
from itertools import groupby
from typing import List, Tuple, TypeVar

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query
)
from data_science_pipeline.peerscout.models import (
    WeightedKeywordModel
)

In [3]:
# The trained model file sits directly below the configured state location.
model_path = os.path.join(state_path, 'reviewing_editor_model.joblib')

# Destination table in "<dataset>.<table>" form, assembled from the notebook parameters.
recommendation_output_table_name = '{output_dataset}.{prefix}{suffix}'.format_map({
    'output_dataset': output_dataset,
    'prefix': output_table_prefix,
    'suffix': 'reviewing_editor_recommendation',
})

In [4]:
# Load the previously trained reviewing editor model and list what it contains.
# NOTE(review): the file has a .joblib extension, so loading it presumably unpickles
# arbitrary objects — state_path should only ever point at trusted storage.
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/reviewing_editor_model.joblib


dict_keys(['editor_tf_idf_vectorizer', 'editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [5]:
# Vectorizer stored with the model; it is reused further down to transform the
# manuscript keywords before comparing them with the editor matrix.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f81c02d6320>)

In [6]:
# TF-IDF matrix stored with the model.
# NOTE(review): presumably one row per editor, aligned with editor_names /
# editor_person_ids (all three have 512 entries in the recorded run) — confirm.
editor_tf_idf = model_dict['editor_tf_idf']
editor_tf_idf

<512x5720 sparse matrix of type '<class 'numpy.float64'>'
	with 192186 stored elements in Compressed Sparse Row format>

In [7]:
# Editor display names stored with the model; used as the recommendation labels below.
editor_names = model_dict['editor_names']
editor_names

0         Abby Dernburg
1             Adam Aron
2            Adam Frost
3         Adam Linstedt
4         Adèle Marston
             ...       
507         Yukiko Goda
508    Yukiko Yamashita
509           Yuting Ma
510      Yuuki Watanabe
511        Zsolt Molnár
Name: name, Length: 512, dtype: object

In [8]:
# Editor person ids stored with the model (string values in the recorded run).
editor_person_ids = model_dict['editor_person_ids']
editor_person_ids

0        3357
1       33041
2       86941
3        4372
4        7131
        ...  
507     17746
508      4715
509    145793
510    126873
511    140825
Name: person_id, Length: 512, dtype: object

In [9]:
# Lookup from editor name to person id, used when flattening the predictions.
# NOTE(review): names are the dictionary keys, so two editors sharing a name would
# silently collapse to the last person id — confirm names are unique, or key the
# recommendations by person id instead.
editor_person_id_by_name_map = dict(zip(editor_names, editor_person_ids))
editor_person_id_by_name_map

{'Abby Dernburg': '3357',
 'Adam Aron': '33041',
 'Adam Frost': '86941',
 'Adam Linstedt': '4372',
 'Adèle Marston': '7131',
 'Agnese Seminara': '72412',
 'Agnieszka Chacinska': '1532',
 'Ahmet Yildiz': '3258',
 'Alan Hinnebusch': '3613',
 'Alan Moses': '17261',
 'Alejandro Sánchez Alvarado': '1179',
 'Alex Cook': '104734',
 'Alex Fornito': '52199',
 'Alexander Borst': '6613',
 'Alexander Shackman': '133489',
 'Alfonso Valencia': '7284',
 'Alison Goate': '66036',
 'Allan Basbaum': '9287',
 'Ambra Pozzi': '36644',
 'Amita Sehgal': '117606',
 'Ammie Kalan': '131589',
 'Amy Gladfelter': '147660',
 'Ana Domingos': '134266',
 'Andrea Martin': '171158',
 'Andreas Martin': '11271',
 'Andreas Schaefer': '39183',
 'Andrei Lupas': '3654',
 'Andrew Brack': '15012',
 'Andrew Carter': '14040',
 'Andrew Kruse': '44764',
 'Andrew MacPherson': '13704',
 'Andrew Morris': '46368',
 'Andrew West': '64900',
 'Andrés Aguilera': '3681',
 'Anita Bhattacharyya': '63118',
 'Anna Diehl': '143521',
 'Anna Pyle':

In [10]:
# Build the keyword model from the dense form of the stored editor matrix, using the
# stored vectorizer and the editor names as the selectable choices.
weighted_keyword_valid_model = WeightedKeywordModel.from_tf_matrix(
    editor_tf_idf.todense(),
    vectorizer=editor_tf_idf_vectorizer,
    choices=editor_names
)
weighted_keyword_valid_model

<data_science_pipeline.peerscout.models.WeightedKeywordModel at 0x7f82045bb050>

In [11]:
# Bind the configured project once so later queries only need to pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [12]:
# Placeholder values substituted into the SQL templates.
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [13]:
# Render the SQL template with the project / dataset placeholders, then run it.
recommendation_query = get_sql(
    'manuscript-version-full-submissions-for-reviewing-editor-recommendation.sql'
).format(**default_query_props)
manuscript_version_for_recommendation_df = read_big_query(recommendation_query)
print(len(manuscript_version_for_recommendation_df))
manuscript_version_for_recommendation_df.head()

> ```sql
> -- Main features:
> --    - Returns Full Submissions for the purpose of Reviewing Editor recommendation
> --    - No older than a year
> --    - Not have Reviewing Editor assigned for more than 30 days
> 
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     manuscript_abstract_keywords.manuscript_id AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.de_dev.v_manuscript_version_last_editor_assigned_timestamp` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Full Submission'
>   AND (
>     ARRAY_LENGTH(version.reviewing_editors) = 0
>     OR TIMESTAMP_DIFF(
>       CURRENT_TIMESTAMP,
>       (SELECT MAX(last_assigned_timestamp) FROM UNNEST(version.senior_editors)),
>       DAY
>     ) < 30
>   )
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
>   AND NOT is_withdrawn
>   AND NOT is_deleted
> ```

Downloading: 100%|██████████| 1017/1017 [00:01<00:00, 531.31rows/s]

1017





Unnamed: 0,version_id,extracted_keywords
0,50060/2020-02-13T18:16:14Z,"[a case, ability, age, age distribution, age s..."
1,57681/2020-06-02T15:01:12Z,"[14 hz, aperture, aperture optics, axial, axia..."
2,55720/2020-02-03T15:52:38Z,"[3,500 child, achievement, background, child, ..."
3,56235/2020-06-17T15:07:47Z,"[1280, 1280 compound, acid, administration, ag..."
4,52974/2020-05-28T17:58:22Z,"[acquisition, area, behavioral, behavioral evi..."


In [14]:
# Vectorize the manuscript keywords with the editors' vectorizer, then compare every
# manuscript against every editor (one similarity row per manuscript).
manuscript_keywords = manuscript_version_for_recommendation_df['extracted_keywords']
manuscript_tf_idf = editor_tf_idf_vectorizer.transform(manuscript_keywords)
keyword_similarity = cosine_similarity(manuscript_tf_idf, editor_tf_idf)
print(keyword_similarity.max())
keyword_similarity

1.0000000000000004


array([[0.01233973, 0.02583074, 0.02089928, ..., 0.0904356 , 0.01036622,
        0.02868696],
       [0.043765  , 0.01211671, 0.02802146, ..., 0.01831063, 0.00537139,
        0.02585142],
       [0.        , 0.02599563, 0.00939863, ..., 0.04613364, 0.        ,
        0.        ],
       ...,
       [0.06076351, 0.01105797, 0.05404575, ..., 0.06055957, 0.        ,
        0.03263625],
       [0.03707914, 0.00880584, 0.02509713, ..., 0.01785721, 0.0101679 ,
        0.03692463],
       [0.01426269, 0.04087129, 0.02297357, ..., 0.02460926, 0.0357275 ,
        0.01699269]])

In [15]:
# Spot check on the first manuscript only: the model's probabilities can be compared
# against the first row of keyword_similarity computed above.
weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords'][:1],
).proba_matrix

matrix([[0.01233973, 0.02583074, 0.02089928, 0.02475307, 0.01010384,
         0.03616367, 0.06004528, 0.0059471 , 0.02926068, 0.01810574,
         0.05139636, 0.07857052, 0.04793406, 0.04257255, 0.05723666,
         0.10438878, 0.03987644, 0.05759133, 0.0327505 , 0.01446153,
         0.03325881, 0.0130636 , 0.05908579, 0.01074247, 0.01673888,
         0.02189816, 0.02784802, 0.03937096, 0.02031302, 0.        ,
         0.11799704, 0.06791915, 0.04783872, 0.02354334, 0.04277967,
         0.03101006, 0.01412007, 0.01721054, 0.03401069, 0.04643093,
         0.08301793, 0.0640966 , 0.03768868, 0.00701101, 0.05436786,
         0.15572141, 0.02292952, 0.02026591, 0.02000684, 0.03083051,
         0.05918672, 0.04614704, 0.02859464, 0.0669923 , 0.02633116,
         0.11136667, 0.28968435, 0.07327961, 0.01027852, 0.03230418,
         0.07155634, 0.01891687, 0.05898551, 0.04690485, 0.04852004,
         0.06142518, 0.04939484, 0.02208133, 0.01799603, 0.14447923,
         0.04247006, 0.088928  , 0

In [16]:
# Matching keywords for every manuscript: per manuscript a list with one entry per
# editor, each entry being a list of (score, keyword) pairs (possibly empty).
manuscript_matching_keywords_list = weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords']
).matching_keywords_list
pd.Series(manuscript_matching_keywords_list[:5])

0    [[(0.006338551212703835, factor), (0.003398535...
1    [[(0.018215840622960344, volumetric), (0.00484...
2    [[], [(0.019074534462485235, previous), (0.006...
3    [[(0.01508950316374485, readout), (0.008859785...
4    [[(0.006108451601542264, factor), (0.005543801...
dtype: object

In [17]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        manuscript_matching_keywords_list: List[List[List[Tuple[float, str]]]],
        indices: List[T],
        threshold: float = 0.5
) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """Select and rank the editors recommended for each manuscript.

    Args:
        proba_matrix: one row per manuscript, one score per editor.
        manuscript_matching_keywords_list: per manuscript, per editor, the list of
            ``(keyword_score, keyword)`` pairs that matched.
        indices: the editor label for each column of ``proba_matrix``.
        threshold: minimum score (inclusive) an editor needs to be recommended.

    Returns:
        One list per manuscript containing
        ``(score, label, sum_of_keyword_scores, matching_keywords)`` tuples for
        every editor at or above ``threshold``, highest score first. Editors with
        equal scores keep their input (column) order.
    """
    return [
        sorted(
            [
                (
                    p,
                    key,
                    sum(s for s, _ in editor_matching_keywords),
                    editor_matching_keywords
                )
                for p, key, editor_matching_keywords in zip(
                    row,
                    indices,
                    editors_matching_keywords
                ) if p >= threshold
            ],
            # Rank on the score alone: comparing whole tuples would, on tied scores,
            # fall through to comparing labels (which need not be orderable) and
            # then the keyword lists.
            key=lambda recommendation: recommendation[0],
            reverse=True
        )
        for row, editors_matching_keywords in zip(proba_matrix, manuscript_matching_keywords_list)
    ]


# Rank the editors for every manuscript, keeping the manuscript frame's index so the
# result can be joined back to the version ids.
prediction_results_with_similarity = pd.Series(
    data=get_recommended_editors_with_probability(
        proba_matrix=keyword_similarity,
        manuscript_matching_keywords_list=manuscript_matching_keywords_list,
        indices=editor_names,
        threshold=0.001
    ),
    index=manuscript_version_for_recommendation_df.index
)
prediction_results_with_similarity.head(5)

0    [(0.2896843486518553, Ben Cooper, 0.2896843486...
1    [(0.8189549171101689, Melike Lakadamyali, 0.81...
2    [(0.1332676257119115, Ben Cooper, 0.1332676257...
3    [(0.47150859072861084, Arduino Mangoni, 0.4715...
4    [(0.39826640138751823, Thorsten Kahnt, 0.39826...
dtype: object

In [18]:
# Put each manuscript's version id next to its ranked list of recommendations
# (column-wise concat, aligned on the shared index).
prediction_results_df = pd.concat([
    manuscript_version_for_recommendation_df['version_id'],
    prediction_results_with_similarity.to_frame('prediction'),
], axis=1)
print(len(prediction_results_df))
prediction_results_df.head()

1017


Unnamed: 0,version_id,prediction
0,50060/2020-02-13T18:16:14Z,"[(0.2896843486518553, Ben Cooper, 0.2896843486..."
1,57681/2020-06-02T15:01:12Z,"[(0.8189549171101689, Melike Lakadamyali, 0.81..."
2,55720/2020-02-03T15:52:38Z,"[(0.1332676257119115, Ben Cooper, 0.1332676257..."
3,56235/2020-06-17T15:07:47Z,"[(0.47150859072861084, Arduino Mangoni, 0.4715..."
4,52974/2020-05-28T17:58:22Z,"[(0.39826640138751823, Thorsten Kahnt, 0.39826..."


In [19]:
# Inspect the full recommendation list of the first manuscript.
prediction_results_df['prediction'][0]

[(0.2896843486518553,
  'Ben Cooper',
  0.28968434865185516,
  [(0.02642050657917536, 'infection'),
   (0.019612794609206296, 'risk'),
   (0.01682063210362375, 'seasonal'),
   (0.014258157813555994, 'vaccine'),
   (0.013200116088323466, 'year'),
   (0.012844152300895244, 'effectiveness'),
   (0.010897336660848992, 'exposure'),
   (0.010627537146885541, 'case'),
   (0.009637440979119125, 'vaccination'),
   (0.00957965209382799, 'influenza'),
   (0.009334213281426994, 'antigenic'),
   (0.009041020041273055, 'age'),
   (0.008096071191327716, 'variation'),
   (0.00785150531796657, 'impact'),
   (0.007084295866782451, 'model'),
   (0.006832911770798254, 'protection'),
   (0.00609367430677734, 'effect'),
   (0.005984194481019524, 'datum'),
   (0.005871942511255967, 'imprinting'),
   (0.005871942511255967, 'last'),
   (0.0049417776560129745, 'statistical'),
   (0.0048187204895595625, 'childhood'),
   (0.004767834886770804, 'birth'),
   (0.004599734185242493, 'evidence'),
   (0.004555274513865

In [20]:
# Flatten to one record per (manuscript version, recommended editor); the matching
# keywords become a nested list of {score, keyword} records.
prediction_results_flat_df = pd.DataFrame([
    {
        'version_id': row.version_id,
        'score': score,
        'name': editor_name,
        'person_id': editor_person_id_by_name_map[editor_name],
        'matching_keyword_score': matching_keyword_score,
        'matching_keywords': [
            {'score': keyword_score, 'keyword': keyword}
            for keyword_score, keyword in matching_keywords
        ],
    }
    for row in prediction_results_df.itertuples()
    for score, editor_name, matching_keyword_score, matching_keywords in row.prediction
])
print(len(prediction_results_flat_df))
prediction_results_flat_df.head()

496917


Unnamed: 0,version_id,score,name,person_id,matching_keyword_score,matching_keywords
0,50060/2020-02-13T18:16:14Z,0.289684,Ben Cooper,34498,0.289684,"[{'score': 0.02642050657917536, 'keyword': 'in..."
1,50060/2020-02-13T18:16:14Z,0.198347,Mark Jit,1708,0.198347,"[{'score': 0.029963913877803488, 'keyword': 'v..."
2,50060/2020-02-13T18:16:14Z,0.179241,Urszula Krzych,21294,0.179241,"[{'score': 0.026197377736477588, 'keyword': 'v..."
3,50060/2020-02-13T18:16:14Z,0.174876,Marc Lipsitch,16032,0.174876,"[{'score': 0.024625299690245038, 'keyword': 'i..."
4,50060/2020-02-13T18:16:14Z,0.16821,Richard Neher,1701,0.16821,"[{'score': 0.026926606321838405, 'keyword': 'i..."


In [21]:
# Sanity check: number of distinct manuscript versions that received at least one
# recommendation (compare with the number of manuscripts loaded above).
prediction_results_flat_df['version_id'].nunique()

1017

In [22]:
# Sanity check of the value ranges per column.
# NOTE(review): person_id holds strings, so its maximum is lexicographic rather than
# numeric; only the score columns are meaningful here.
prediction_results_flat_df.max()

version_id                60317/2020-06-22T23:30:46Z
score                                              1
name                                    Zsolt Molnár
person_id                                       9960
matching_keyword_score                             1
dtype: object

In [23]:
# Write the flattened recommendations to the output table.
# NOTE(review): if_exists='replace' presumably overwrites the whole table on every
# run — confirm against the to_gbq helper.
print('writing to:', recommendation_output_table_name)
to_gbq(
    prediction_results_flat_df,
    recommendation_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')

writing to: de_dev.data_science_reviewing_editor_recommendation
done
