In [1]:
# Notebook parameters; the values below point at the dev dataset and dev state.
# NOTE(review): presumably overridden by the pipeline runner per environment - confirm.
# Google Cloud project used for the BigQuery reads and for the final write.
project_id = 'elife-data-pipeline'
# Dataset substituted into the manuscript SQL query.
source_dataset = 'de_dev'
# Dataset and table name prefix of the recommendation output table.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# Not referenced by any cell visible in this notebook.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Directory from which the persisted reviewing editor model is loaded.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [2]:
import os
from functools import partial
from typing import List, Tuple, TypeVar

import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import
import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query
)
from data_science_pipeline.peerscout.models import (
    WeightedKeywordModel
)

In [3]:
# The persisted model lives directly inside the state directory.
model_file_name = 'reviewing_editor_model.joblib'
model_path = os.path.join(state_path, model_file_name)

# Destination table, qualified with its dataset: <dataset>.<prefix><suffix>
table_name_template = '{output_dataset}.{prefix}{suffix}'
recommendation_output_table_name = table_name_template.format(
    output_dataset=output_dataset,
    prefix=output_table_prefix,
    suffix='reviewing_editor_recommendation'
)

In [4]:
# Load the persisted model object and list the entries it provides.
# NOTE(review): the file is a .joblib; assuming load_object_from unpickles it,
# the state path has to be a trusted location - confirm.
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/reviewing_editor_model.joblib


dict_keys(['editor_tf_idf_vectorizer', 'editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [5]:
# Vectorizer stored with the model; reused further down to vectorise the
# extracted keywords of each manuscript.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f077f27c3b0>)

In [6]:
# Sparse TF-IDF matrix stored with the model.
# NOTE(review): presumably one row per editor, in the same order as editor_names - confirm.
editor_tf_idf = model_dict['editor_tf_idf']
editor_tf_idf

<523x12964 sparse matrix of type '<class 'numpy.float64'>'
	with 397981 stored elements in Compressed Sparse Row format>

In [7]:
# Editor names; used below as the model's choices and as the labels attached
# to the columns of the similarity matrix.
editor_names = model_dict['editor_names']
editor_names

0         Abby Dernburg
1             Adam Aron
2            Adam Frost
3         Adam Linstedt
4         Adèle Marston
             ...       
518         Yukiko Goda
519    Yukiko Yamashita
520           Yuting Ma
521      Yuuki Watanabe
522        Zsolt Molnár
Name: name, Length: 523, dtype: object

In [8]:
# Editor person ids; paired positionally with editor_names in the next cell.
editor_person_ids = model_dict['editor_person_ids']
editor_person_ids

0        3357
1       33041
2       86941
3        4372
4        7131
        ...  
518     17746
519      4715
520    145793
521    126873
522    140825
Name: person_id, Length: 523, dtype: object

In [9]:
# Lookup from editor name to person id, pairing the two series by position.
# NOTE(review): if two editors share the same name, only the last person id is
# kept for that name, because dict keys are unique.
editor_person_id_by_name_map = dict(zip(editor_names, editor_person_ids))
editor_person_id_by_name_map

{'Abby Dernburg': '3357',
 'Adam Aron': '33041',
 'Adam Frost': '86941',
 'Adam Linstedt': '4372',
 'Adèle Marston': '7131',
 'Agnese Seminara': '72412',
 'Agnieszka Chacinska': '1532',
 'Ahmet Yildiz': '3258',
 'Alan Hinnebusch': '3613',
 'Alan Moses': '17261',
 'Alejandro Sánchez Alvarado': '1179',
 'Alex Cook': '104734',
 'Alex Fornito': '52199',
 'Alexander Borst': '6613',
 'Alexander Shackman': '133489',
 'Alfonso Valencia': '7284',
 'Alison Goate': '66036',
 'Allan Basbaum': '9287',
 'Alphee Michelot': '27913',
 'Ambra Pozzi': '36644',
 'Amita Sehgal': '117606',
 'Ammie Kalan': '131589',
 'Amy Gladfelter': '147660',
 'Amy Wesolowski': '125325',
 'Ana Domingos': '134266',
 'Andrea Martin': '171158',
 'Andreas Martin': '11271',
 'Andreas Schaefer': '39183',
 'Andrei Lupas': '3654',
 'Andrew Brack': '15012',
 'Andrew Carter': '14040',
 'Andrew Kruse': '44764',
 'Andrew MacPherson': '13704',
 'Andrew Morris': '46368',
 'Andrew West': '64900',
 'Andrés Aguilera': '3681',
 'Anita Bhatt

In [10]:
# Build the keyword model from the stored TF-IDF matrix (converted to dense),
# using the stored vectorizer and the editor names as choices. It is used
# below to obtain the matching keywords for each manuscript.
weighted_keyword_valid_model = WeightedKeywordModel.from_tf_matrix(
    editor_tf_idf.todense(),
    vectorizer=editor_tf_idf_vectorizer,
    choices=editor_names
)
weighted_keyword_valid_model

<data_science_pipeline.peerscout.models.WeightedKeywordModel at 0x7f077f29be90>

In [11]:
# Bind the project id once, so that query cells only need to pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [12]:
# Placeholder values substituted into the SQL templates.
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [13]:
# The manuscript list is deliberately the one used for the senior editor
# recommendation: reviewing editors are recommended here as people to consult
# with, not as candidates for assignment.
initial_submissions_query = get_sql(
    'manuscript-version-initial-submissions-for-senior-editor-recommendation.sql'
).format(**default_query_props)
manuscript_version_for_recommendation_df = read_big_query(initial_submissions_query)
print(len(manuscript_version_for_recommendation_df))
manuscript_version_for_recommendation_df.head()

> ```sql
> -- Main features:
> --    - Returns Initial Submissions for the purpose of Senior Editor recommendation
> --    - No older than a year
> --    - Not have Senior Editor assigned for more than 30 days
> 
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     manuscript_abstract_keywords.manuscript_id AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.de_dev.mv_manuscript_version` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND (
>     ARRAY_LENGTH(version.senior_editors) = 0
>     OR TIMESTAMP_DIFF(
>       CURRENT_TIMESTAMP,
>       (SELECT MAX(last_assigned_timestamp) FROM UNNEST(version.senior_editors)),
>       DAY
>     ) < 30
>   )
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
>   AND NOT is_withdrawn
>   AND NOT is_deleted
> ```

Downloading: 100%|██████████| 931/931 [00:01<00:00, 465.73rows/s]

931





Unnamed: 0,version_id,extracted_keywords
0,441048/2020-03-04T16:55:14Z,"[acquisition, animal, consolidation, cortex, e..."
1,61605/2020-07-30T03:20:49Z,"[2 protein, adaptation, balance, balanced, bal..."
2,61922/2020-08-07T21:40:55Z,"[approach, arrangement, brain, brain region, b..."
3,61975/2020-08-10T14:21:34Z,"[acquisition, adequate, adequate tissue transp..."
4,61990/2020-08-10T20:55:15Z,"[1 helicase non-structural protein, activity, ..."


In [14]:
# Vectorise the manuscript keywords with the editors' vectorizer, then compare
# every manuscript against the stored editor TF-IDF matrix.
manuscript_tf_idf = editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
keyword_similarity = cosine_similarity(manuscript_tf_idf, editor_tf_idf)
print(keyword_similarity.max())
keyword_similarity

0.9313638682777767


array([[0.02096939, 0.09319624, 0.01221354, ..., 0.02643237, 0.02301269,
        0.04593907],
       [0.04621938, 0.0086969 , 0.03907396, ..., 0.03693508, 0.03544445,
        0.02224424],
       [0.03843053, 0.07005385, 0.02142093, ..., 0.02286954, 0.00987532,
        0.01390712],
       ...,
       [0.04398216, 0.00522956, 0.00470752, ..., 0.01308869, 0.01277799,
        0.01038034],
       [0.02690482, 0.0194184 , 0.01750485, ..., 0.04883347, 0.0434696 ,
        0.02670351],
       [0.05453647, 0.00796423, 0.06877242, ..., 0.02054926, 0.00786981,
        0.00791369]])

In [15]:
# Spot check on the first manuscript only: show the model's proba_matrix so it
# can be compared by eye with the first row of keyword_similarity above.
weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords'][:1],
).proba_matrix

matrix([[0.02096939, 0.09319624, 0.01221354, 0.03083998, 0.01124261,
         0.03955946, 0.02990524, 0.01756236, 0.02755628, 0.05271243,
         0.04847826, 0.02407158, 0.0589776 , 0.06824892, 0.10611761,
         0.06448509, 0.04811256, 0.05293716, 0.00473663, 0.00589344,
         0.04234657, 0.04560739, 0.01260331, 0.04406764, 0.01809496,
         0.07293714, 0.02057313, 0.07321707, 0.02230551, 0.02049924,
         0.0192212 , 0.02236001, 0.01647974, 0.02381865, 0.03331546,
         0.02188127, 0.04409586, 0.        , 0.00787704, 0.0779866 ,
         0.17415853, 0.05623851, 0.05443493, 0.01633852, 0.02273868,
         0.00560352, 0.02650346, 0.03435802, 0.02034547, 0.0488016 ,
         0.01489894, 0.01375281, 0.02594418, 0.03403671, 0.02481871,
         0.02131498, 0.01603578, 0.00521334, 0.06038936, 0.02658294,
         0.        , 0.03619403, 0.0205525 , 0.01827323, 0.0490007 ,
         0.0523429 , 0.05179182, 0.02547808, 0.03341295, 0.10144148,
         0.0217316 , 0.04013462, 0

In [16]:
# Matching keywords reported by the model for every manuscript; preview five.
# NOTE(review): appears to hold, per manuscript, one list of (score, keyword)
# tuples per editor - confirm against WeightedKeywordModel.
manuscript_matching_keywords_list = weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords']
).matching_keywords_list
pd.Series(manuscript_matching_keywords_list[:5])

0    [[(0.004527405058331749, spatial), (0.00295036...
1    [[(0.0077663941846240914, protein), (0.0059459...
2    [[(0.00678513050857859, complex), (0.005334694...
3    [[(0.011106824476849752, volumetric), (0.01082...
4    [[(0.008029376782489685, protein), (0.00470298...
dtype: object

In [17]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        manuscript_matching_keywords_list: List[List[List[Tuple[float, str]]]],
        indices: List[T],
        threshold: float = 0.5
) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """Rank the candidates of every manuscript by descending probability.

    Args:
        proba_matrix: one row per manuscript, holding one score per candidate.
        manuscript_matching_keywords_list: for every manuscript, one list of
            ``(keyword_score, keyword)`` tuples per candidate.
        indices: the key identifying each candidate (e.g. the editor name),
            in the same order as the columns of ``proba_matrix``.
        threshold: candidates scoring below this value are left out.

    Returns:
        One list per manuscript containing
        ``(score, key, sum_of_keyword_scores, matching_keywords)`` tuples,
        sorted in descending order. Entries with an equal score are ordered
        by the remaining tuple elements, also descending.

    The inputs are combined with ``zip``, so any surplus manuscripts or
    candidates in a longer input are silently ignored.
    """
    recommendations = []
    for row, editors_matching_keywords in zip(proba_matrix, manuscript_matching_keywords_list):
        candidates = [
            (
                p,
                key,
                sum(keyword_score for keyword_score, _ in editor_matching_keywords),
                editor_matching_keywords
            )
            for p, key, editor_matching_keywords in zip(row, indices, editors_matching_keywords)
            if p >= threshold
        ]
        # whole tuples are compared, which keeps the ordering of ties deterministic
        candidates.sort(reverse=True)
        recommendations.append(candidates)
    return recommendations


# Rank the editors for every manuscript using the cosine similarity matrix
# computed above as the score. Editors scoring below 0.001 are left out.
# The result is indexed like the manuscript frame, so the two can be joined
# by index in the next cell.
prediction_results_with_similarity = pd.Series(
    get_recommended_editors_with_probability(
        keyword_similarity,
        manuscript_matching_keywords_list,
        editor_names,
        threshold=0.001
    ),
    index=manuscript_version_for_recommendation_df.index
)
prediction_results_with_similarity[:5]

0    [(0.26039681157087025, Mathieu Wolff, 0.260396...
1    [(0.31259503465485067, Thorsten Nürnberger, 0....
2    [(0.16551825384835234, Sacha Nelson, 0.1655182...
3    [(0.09677939271459088, Moritz Helmstaedter, 0....
4    [(0.17368901704208917, Maria Spies, 0.17368901...
dtype: object

In [18]:
# Put each manuscript's version id next to its ranked recommendations.
# Both inputs carry the manuscript frame's index, so rows line up one to one.
prediction_results_df = pd.concat([
    manuscript_version_for_recommendation_df['version_id'],
    prediction_results_with_similarity.to_frame('prediction'),
], axis=1)
print(len(prediction_results_df))
prediction_results_df.head()

931


Unnamed: 0,version_id,prediction
0,441048/2020-03-04T16:55:14Z,"[(0.26039681157087025, Mathieu Wolff, 0.260396..."
1,61605/2020-07-30T03:20:49Z,"[(0.31259503465485067, Thorsten Nürnberger, 0...."
2,61922/2020-08-07T21:40:55Z,"[(0.16551825384835234, Sacha Nelson, 0.1655182..."
3,61975/2020-08-10T14:21:34Z,"[(0.09677939271459088, Moritz Helmstaedter, 0...."
4,61990/2020-08-10T20:55:15Z,"[(0.17368901704208917, Maria Spies, 0.17368901..."


In [19]:
# Inspect the complete ranked recommendation list of the row labelled 0.
prediction_results_df['prediction'][0]

[(0.26039681157087025,
  'Mathieu Wolff',
  0.26039681157087036,
  [(0.06363895667035205, 'maze'),
   (0.021148095867519007, 'task'),
   (0.01978333820914989, 'memory'),
   (0.015096582395263517, 'prefrontal cortex'),
   (0.014911352146358628, 'cortex'),
   (0.014765201705944414, 'learn'),
   (0.014499972376444183, 'prefrontal'),
   (0.01302363000378527, 'spatial'),
   (0.012791706537203676, 'acquisition'),
   (0.012629857250713705, 'performance'),
   (0.008474423489945995, 'trial'),
   (0.005478688822502652, 'effect'),
   (0.004967098831755899, 'information'),
   (0.004904089429479274, 'consolidation'),
   (0.004256901809557349, 'rodent'),
   (0.003956667641829978, 'location'),
   (0.0036389017378482998, 'system'),
   (0.0029136043456426354, 'session'),
   (0.0024100908437534917, 'specific'),
   (0.00235771394882872, 'animal'),
   (0.0022760607349936804, 'result'),
   (0.00218124143522778, 'navigation'),
   (0.0016878522450101622, 'simple'),
   (0.0016165876994508653, 'knowledge'),
  

In [20]:
# Flatten the nested results into one record per (manuscript version, editor).
flat_prediction_records = []
for manuscript_row in prediction_results_df.itertuples():
    for recommended_editor in manuscript_row.prediction:
        score, editor_name, matching_keyword_score, matching_keywords = recommended_editor
        flat_prediction_records.append({
            'version_id': manuscript_row.version_id,
            'score': score,
            'name': editor_name,
            'person_id': editor_person_id_by_name_map[editor_name],
            'matching_keyword_score': matching_keyword_score,
            'matching_keywords': [
                {'score': keyword_score, 'keyword': keyword}
                for keyword_score, keyword in matching_keywords
            ],
        })

prediction_results_flat_df = pd.DataFrame(flat_prediction_records)
print(len(prediction_results_flat_df))
prediction_results_flat_df.head()

484804


Unnamed: 0,version_id,score,name,person_id,matching_keyword_score,matching_keywords
0,441048/2020-03-04T16:55:14Z,0.260397,Mathieu Wolff,98201,0.260397,"[{'score': 0.06363895667035205, 'keyword': 'ma..."
1,441048/2020-03-04T16:55:14Z,0.218171,David Badre,29804,0.218171,"[{'score': 0.0342554783795348, 'keyword': 'pre..."
2,441048/2020-03-04T16:55:14Z,0.217949,Lila Davachi,19176,0.217949,"[{'score': 0.03260237969805255, 'keyword': 'me..."
3,441048/2020-03-04T16:55:14Z,0.209631,Mihaela Iordanova,93079,0.209631,"[{'score': 0.02215236281259356, 'keyword': 'me..."
4,441048/2020-03-04T16:55:14Z,0.209273,Daeyeol Lee,10628,0.209273,"[{'score': 0.027291100070182504, 'keyword': 'p..."


In [21]:
# Sanity check: number of distinct manuscript versions that received at least
# one recommendation (compare with the number of manuscripts loaded above).
prediction_results_flat_df['version_id'].nunique()

931

In [22]:
# Sanity check: column-wise maxima of the flattened recommendations.
# 'matching_keywords' holds lists of dicts, which cannot be ordered, so it is
# excluded explicitly instead of relying on pandas silently dropping columns
# it cannot reduce (deprecated in later pandas 1.x, an error in pandas 2.x).
prediction_results_flat_df.drop(columns=['matching_keywords']).max()

version_id                62934/2020-09-08T23:30:53Z
score                                       0.931364
name                                    Zsolt Molnár
person_id                                       9960
matching_keyword_score                      0.931364
dtype: object

In [23]:
# Write the flattened recommendations to the output table.
# NOTE(review): if_exists='replace' presumably overwrites any existing table
# instead of appending to it - confirm against the to_gbq helper.
print('writing to:', recommendation_output_table_name)
to_gbq(
    prediction_results_flat_df,
    recommendation_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')

writing to: de_dev.data_science_reviewing_editor_recommendation
done
