In [1]:
# Notebook parameters.
# NOTE(review): presumably overridden per environment by the pipeline runner — TODO confirm.
# Google Cloud project passed to the BigQuery read, delete and write calls below
project_id = 'elife-data-pipeline'
# dataset substituted into the source SQL query
source_dataset = 'de_dev'
# dataset and table name prefix the recommendation output table name is built from
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# NOTE(review): manuscript_min_tf / manuscript_max_tf are not referenced by the cells
# visible in this notebook — confirm whether they are still needed
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# location the 'reviewing_editor_model.joblib' file is loaded from
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [2]:
import os
from functools import partial
from typing import List, Tuple, TypeVar

import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import
import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import (to_gbq, delete_bq_table)
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query
)
from data_science_pipeline.peerscout.models import (
    WeightedKeywordModel
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [3]:
# the trained model is read from the configured state location
model_filename = 'reviewing_editor_model.joblib'
model_path = os.path.join(state_path, model_filename)

# output table is named <output_dataset>.<prefix>reviewing_editor_recommendation
recommendation_output_table_name = '{}.{}{}'.format(
    output_dataset,
    output_table_prefix,
    'reviewing_editor_recommendation'
)

In [4]:
# Load the stored reviewing editor model objects and show which keys are available.
# NOTE(review): the file is a .joblib artefact — presumably deserialised via joblib/pickle
# inside load_object_from; only point state_path at trusted locations (TODO confirm).
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/reviewing_editor_model.joblib




dict_keys(['editor_tf_idf_vectorizer', 'editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [6]:
# vectorizer stored with the model; reused below to transform the manuscript keywords
# and to build the keyword model
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x12a19dd40>)

In [7]:
# stored editor TF-IDF matrix; manuscripts are compared against it via cosine similarity below
# (presumably one row per editor, aligned with editor_names — TODO confirm)
editor_tf_idf = model_dict['editor_tf_idf']
editor_tf_idf

<566x32650 sparse matrix of type '<class 'numpy.float64'>'
	with 1083800 stored elements in Compressed Sparse Row format>

In [8]:
# editor names; used as the `choices` of the keyword model and as the labels
# of the recommendations below
editor_names = model_dict['editor_names']
# editor_names

In [9]:
# person ids, zipped with editor_names below to look up the id of a recommended editor
editor_person_ids = model_dict['editor_person_ids']
# editor_person_ids

In [10]:
# lookup used to attach a person id to each recommended editor name
# NOTE(review): if two editors share a name, dict() keeps only the last person id,
# so both would be reported with the same id — confirm names are unique
editor_person_id_by_name_map = dict(zip(editor_names, editor_person_ids))
# editor_person_id_by_name_map

In [11]:
# Keyword model used below (via predict_ranking) to obtain the matching keywords per editor.
# NOTE(review): todense() materialises the whole sparse editor TF-IDF matrix in memory —
# confirm from_tf_matrix really needs a dense input
weighted_keyword_valid_model = WeightedKeywordModel.from_tf_matrix(
    editor_tf_idf.todense(),
    vectorizer=editor_tf_idf_vectorizer,
    choices=editor_names
)
# weighted_keyword_valid_model

In [12]:
# bind the project id once so that the query cells below only need to pass the SQL
read_big_query = partial(_read_big_query, project_id=project_id)

In [13]:
# placeholders substituted into the SQL templates via str.format
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [14]:
# we are using the same manuscript list used for senior editor recommendation
# this is because we want to recommend reviewing editors to consult with (not for assignment)
manuscript_version_for_recommendation_sql = get_sql(
    'manuscript-version-initial-submissions-for-senior-editor-recommendation.sql'
).format(**default_query_props)
manuscript_version_for_recommendation_df = read_big_query(
    manuscript_version_for_recommendation_sql
)
# print(len(manuscript_version_for_recommendation_df))
# manuscript_version_for_recommendation_df.head()

> ```sql
> -- Main features:
> --    - Returns Initial Submissions for the purpose of Senior Editor recommendation
> --    - No older than a year
> --    - Not have Senior Editor assigned for more than 30 days
> 
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     manuscript_abstract_keywords.manuscript_id AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.de_dev.mv_manuscript_version` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND (
>     ARRAY_LENGTH(version.senior_editors) = 0
>     OR TIMESTAMP_DIFF(
>       CURRENT_TIMESTAMP,
>       (SELECT MAX(last_assigned_timestamp) FROM UNNEST(version.senior_editors)),
>       DAY
>     ) < 30
>   )
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
>   AND NOT is_withdrawn
>   AND NOT is_deleted
> ```

In [15]:
# vectorise the manuscript keywords with the vectorizer that was stored with the editor model
manuscript_tf_idf = editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
# similarity of every manuscript (rows) to every row of the editor TF-IDF matrix (columns)
keyword_similarity = cosine_similarity(manuscript_tf_idf, editor_tf_idf)
# print(keyword_similarity.max())
# keyword_similarity

In [16]:
# preview only: scores the keyword model produces for the first manuscript
weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords'][:1],
).proba_matrix

matrix([[0.00306093, 0.03067246, 0.02202923, 0.03464895, 0.02292322,
         0.04318502, 0.01557829, 0.04572065, 0.02411131, 0.0636179 ,
         0.02718654, 0.02783322, 0.05677565, 0.04470987, 0.02780778,
         0.04683665, 0.03755486, 0.07712596, 0.05994866, 0.04184636,
         0.02608116, 0.02119183, 0.02246005, 0.01006581, 0.02860591,
         0.0247516 , 0.02459215, 0.0387984 , 0.03914576, 0.08070278,
         0.01312075, 0.03024609, 0.03918477, 0.02318454, 0.02459989,
         0.02824739, 0.02562608, 0.0281173 , 0.04466105, 0.07205303,
         0.03342014, 0.03264087, 0.0406232 , 0.03518264, 0.05317832,
         0.04831648, 0.02740609, 0.05606851, 0.05640841, 0.01890196,
         0.03520664, 0.02618198, 0.02233196, 0.01749824, 0.02164606,
         0.01927998, 0.04079578, 0.00774869, 0.02227654, 0.02929645,
         0.04469836, 0.062376  , 0.02947312, 0.02456284, 0.02504775,
         0.02447775, 0.01636786, 0.04771955, 0.01538938, 0.07255144,
         0.01448441, 0.03626193, 0

In [17]:
# rank all manuscripts at once; only the matching keywords of the ranking are kept
manuscript_ranking = weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
manuscript_matching_keywords_list = manuscript_ranking.matching_keywords_list
# preview of the first manuscripts
pd.Series(manuscript_matching_keywords_list[:5])

0    [[(0.003060927201726417, effect)], [(0.0039074...
1    [[], [(0.0017674070553020792, desire), (0.0017...
2    [[(0.0024388841791769863, role), (0.0024131594...
3    [[(0.0029117909512956955, network), (0.0023704...
4    [[(0.0029346145927124736, receptor), (0.002182...
dtype: object

In [18]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        manuscript_matching_keywords_list: List[List[List[Tuple[float, str]]]],
        indices: List[T],
        threshold: float = 0.5
) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """
    Select, for every manuscript, the editors whose score reaches the threshold.

    Args:
        proba_matrix: one row per manuscript, holding one score per editor.
        manuscript_matching_keywords_list: for every manuscript and every editor,
            the list of ``(keyword score, keyword)`` pairs that matched.
        indices: the editor keys, in the same order as the columns of `proba_matrix`.
        threshold: minimum score (inclusive) for an editor to be included.

    Returns:
        One list per manuscript containing
        ``(score, editor key, sum of keyword scores, matching keywords)`` tuples,
        sorted in descending tuple order (i.e. highest score first).
    """
    recommendations = []
    for row, editors_matching_keywords in zip(proba_matrix, manuscript_matching_keywords_list):
        editor_candidates = [
            (
                p,
                key,
                # total of the individual keyword scores for this editor
                sum(s for s, _ in editor_matching_keywords),
                editor_matching_keywords
            )
            for p, key, editor_matching_keywords in zip(
                row,
                indices,
                editors_matching_keywords
            )
            if p >= threshold
        ]
        # whole tuples are compared, so ties on the score fall back to the editor key
        editor_candidates.sort(reverse=True)
        recommendations.append(editor_candidates)
    return recommendations


# the cosine similarity is passed as the score matrix, with a low inclusion threshold
recommended_editors_list = get_recommended_editors_with_probability(
    keyword_similarity,
    manuscript_matching_keywords_list,
    editor_names,
    threshold=0.001
)
# keep the manuscript index so the results can be joined back to the version ids
prediction_results_with_similarity = pd.Series(
    recommended_editors_list,
    index=manuscript_version_for_recommendation_df.index
)
# prediction_results_with_similarity[:5]

In [19]:
# put each manuscript version id next to its list of recommended editors
version_id_series = manuscript_version_for_recommendation_df['version_id']
prediction_frame = prediction_results_with_similarity.to_frame('prediction')
prediction_results_df = pd.concat([version_id_series, prediction_frame], axis=1)
# print(len(prediction_results_df))
# prediction_results_df.head()

In [20]:
# prediction_results_df['prediction'][0]

In [21]:
# Flatten to one row per (manuscript version, recommended editor).
# The columns are passed explicitly so that an empty result (no editor reaching the
# threshold) still has the expected columns; without them the frame would have no
# columns at all and the following `['version_id']` lookup would raise a KeyError.
prediction_results_flat_df = pd.DataFrame(
    [
        {
            'version_id': row.version_id,
            'score': score,
            'name': name,
            'person_id': editor_person_id_by_name_map[name],
            'matching_keyword_score': matching_keyword_score,
            'matching_keywords': [{
                'score': keyword_score,
                'keyword': keyword
            } for keyword_score, keyword in matching_keywords],
        }
        for row in prediction_results_df.itertuples()
        # each prediction is (score, editor name, summed keyword score, matching keywords)
        for score, name, matching_keyword_score, matching_keywords in row.prediction
    ],
    columns=[
        'version_id',
        'score',
        'name',
        'person_id',
        'matching_keyword_score',
        'matching_keywords'
    ]
)
# print(len(prediction_results_flat_df))
# prediction_results_flat_df.head()

In [22]:
# number of distinct manuscript versions that received at least one recommendation
prediction_results_flat_df['version_id'].nunique()

24

In [29]:
# prediction_results_flat_df.max()

In [24]:
# preview of the flattened recommendations that will be written to BigQuery
prediction_results_flat_df.head()

Unnamed: 0,version_id,score,name,person_id,matching_keyword_score,matching_keywords
0,72259/2021-07-16T11:10:42Z,0.097233,Tobias Donner,74960,0.097233,"[{'score': 0.017253928935734276, 'keyword': 'd..."
1,72259/2021-07-16T11:10:42Z,0.096695,Stephanie Palmer,68611,0.096695,"[{'score': 0.011549650065095047, 'keyword': 'p..."
2,72259/2021-07-16T11:10:42Z,0.09439,Mark Jit,1708,0.09439,"[{'score': 0.02352193620930551, 'keyword': 'ec..."
3,72259/2021-07-16T11:10:42Z,0.091485,Daeyeol Lee,10628,0.091485,"[{'score': 0.02025661665082472, 'keyword': 'ch..."
4,72259/2021-07-16T11:10:42Z,0.091175,Mimi Liljeholm,97220,0.091175,"[{'score': 0.009398746892334683, 'keyword': 'd..."


In [25]:
def dataframe_chunk(seq, size):
    """
    Split `seq` into consecutive positional slices of at most `size` rows.

    Args:
        seq: object supporting `len()` and `.iloc` slicing (e.g. a pandas DataFrame).
        size: maximum number of rows per chunk; must be positive.

    Returns:
        A generator yielding `seq.iloc[pos:pos + size]` for every chunk, in order.
        Nothing is yielded when `seq` is empty.

    Raises:
        ValueError: if `size` is not positive. A negative size would otherwise
            silently produce no chunks at all.
    """
    # validate eagerly (not on first iteration) so a bad size fails at the call site
    if size <= 0:
        raise ValueError('size must be a positive integer, got %r' % (size,))
    return (
        seq.iloc[pos:pos + size]
        for pos in range(0, len(seq), size)
    )


In [30]:
# number of rows passed to each to_gbq call in the upload cell below
CHUNK_SIZE = 3000
# remove the previous output table first: the upload below appends,
# so rows from an earlier run would otherwise remain
# NOTE(review): delete followed by append is not atomic — readers may briefly see a
# missing or partially loaded table
delete_bq_table(project_id=project_id, table_name=recommendation_output_table_name)

INFO:data_science_pipeline.utils.bq:Deleted table 'elife-data-pipeline.de_dev.data_science_reviewing_editor_recommendation'.


In [28]:
print('writing to:', recommendation_output_table_name)
# the frame is uploaded in chunks of CHUNK_SIZE rows to reduce memory allocation
prediction_result_chunks = dataframe_chunk(prediction_results_flat_df, CHUNK_SIZE)
for part_of_prediction_results in prediction_result_chunks:
    # progress indicator: show the last row of the chunk about to be uploaded
    last_record_of_chunk = part_of_prediction_results.tail(1)
    print('Last record of the df chunk: ', last_record_of_chunk)
    # every chunk is appended to the same output table
    to_gbq(
        part_of_prediction_results,
        recommendation_output_table_name,
        project_id=project_id,
        if_exists='append'
    )


writing to: de_dev.data_science_reviewing_editor_recommendation
Last record of the df chunk:                        version_id    score           name person_id  \
2999  72257/2021-07-16T10:15:30Z  0.03467  Noriaki Emoto    184943   

      matching_keyword_score  \
2999                 0.03467   

                                      matching_keywords  
2999  [{'score': 0.003957430614840587, 'keyword': 'd...  


INFO:root:Processing line 1000
INFO:root:Processing line 2000
INFO:root:Processing line 3000
INFO:root:Processed 3000 lines
INFO:data_science_pipeline.utils.bq_schema:Created table elife-data-pipeline.de_dev.data_science_reviewing_editor_recommendation
INFO:data_science_pipeline.utils.bq:loading from /var/folders/gz/3b4st4q56pd1983txrbt2yyw0000gn/T/tmpro_3xrt8/data.jsonl.gz
INFO:data_science_pipeline.utils.bq:Loaded 3000 rows into de_dev:data_science_reviewing_editor_recommendation.


Last record of the df chunk:                        version_id     score            name person_id  \
5999  72263/2021-07-16T13:45:27Z  0.022918  Michael Dustin      7917   

      matching_keyword_score  \
5999                0.022918   

                                      matching_keywords  
5999  [{'score': 0.004109232155294465, 'keyword': 'p...  


INFO:root:Processing line 1000
INFO:root:Processing line 2000
INFO:root:Processing line 3000
INFO:root:Processed 3000 lines
INFO:data_science_pipeline.utils.bq:loading from /var/folders/gz/3b4st4q56pd1983txrbt2yyw0000gn/T/tmpl1cor601/data.jsonl.gz
INFO:data_science_pipeline.utils.bq:Loaded 3000 rows into de_dev:data_science_reviewing_editor_recommendation.


Last record of the df chunk:                        version_id    score      name person_id  \
8999  72268/2021-07-16T19:01:37Z  0.02862  Demba Ba    196287   

      matching_keyword_score  \
8999                 0.02862   

                                      matching_keywords  
8999  [{'score': 0.0043731123001550425, 'keyword': '...  


INFO:root:Processing line 1000
INFO:root:Processing line 2000
INFO:root:Processing line 3000
INFO:root:Processed 3000 lines
INFO:data_science_pipeline.utils.bq:loading from /var/folders/gz/3b4st4q56pd1983txrbt2yyw0000gn/T/tmp0666vd5m/data.jsonl.gz
INFO:data_science_pipeline.utils.bq:Loaded 3000 rows into de_dev:data_science_reviewing_editor_recommendation.


Last record of the df chunk:                         version_id     score               name person_id  \
11999  72264/2021-07-16T13:50:25Z  0.039605  Pekka Lappalainen     19580   

       matching_keyword_score  \
11999                0.039605   

                                       matching_keywords  
11999  [{'score': 0.0071757418678202525, 'keyword': '...  


INFO:root:Processing line 1000
INFO:root:Processing line 2000
INFO:root:Processing line 3000
INFO:root:Processed 3000 lines
INFO:data_science_pipeline.utils.bq:loading from /var/folders/gz/3b4st4q56pd1983txrbt2yyw0000gn/T/tmp_s9f99rz/data.jsonl.gz
INFO:data_science_pipeline.utils.bq:Loaded 3000 rows into de_dev:data_science_reviewing_editor_recommendation.


Last record of the df chunk:                         version_id     score              name person_id  \
13576  72256/2021-07-16T09:45:26Z  0.001896  Donald Hamelberg    206904   

       matching_keyword_score  \
13576                0.001896   

                                       matching_keywords  
13576  [{'score': 0.001895878097495363, 'keyword': 'p...  


INFO:root:Processing line 1000
INFO:root:Processed 1577 lines
INFO:data_science_pipeline.utils.bq:loading from /var/folders/gz/3b4st4q56pd1983txrbt2yyw0000gn/T/tmpqy2yryun/data.jsonl.gz
INFO:data_science_pipeline.utils.bq:Loaded 1577 rows into de_dev:data_science_reviewing_editor_recommendation.
