In [1]:
# Notebook parameters.
project_id = 'elife-data-pipeline'  # used for BigQuery reads, the SQL template and the output load
source_dataset = 'de_dev'  # dataset substituted into the SQL template
output_dataset = 'de_dev'  # dataset part of the recommendation output table name
output_table_prefix = 'data_science_'  # prefix of the recommendation output table name
manuscript_min_tf = 10  # NOTE(review): not referenced in the visible cells - confirm it is still needed
manuscript_max_tf = 0.9  # NOTE(review): not referenced in the visible cells - confirm it is still needed
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'  # location of the persisted model

In [2]:
import os
from functools import partial
from typing import List, Tuple, T

import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings  #pylint: disable=unused-import
import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query
)
from data_science_pipeline.peerscout.models import (
    WeightedKeywordModel
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [3]:
# Fully qualified name (dataset.table) of the table the recommendations are written to.
recommendation_output_table_name = '{output_dataset}.{prefix}{suffix}'.format(
    suffix='editor_recommendation',
    prefix=output_table_prefix,
    output_dataset=output_dataset
)
# Location of the persisted senior editor model below the state path.
model_path = os.path.join(state_path, 'senior_editor_model.joblib')

In [4]:
# Load the persisted model dictionary and show which entries it contains.
print('loading model from:', model_path)
# NOTE(review): load_object_from is a project helper; presumably it deserialises the joblib file - confirm.
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/senior_editor_model.joblib




dict_keys(['editor_tf_idf_vectorizer', 'editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [5]:
# Vectorizer stored with the model; used further down to transform the
# manuscripts' extracted keywords and to build the keyword model.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x1347f3d40>)

In [6]:
# Sparse TF-IDF matrix stored with the model; compared against the manuscript
# vectors via cosine_similarity further down.
# presumably one row per editor, in the same order as editor_names - TODO confirm
editor_tf_idf = model_dict['editor_tf_idf']
editor_tf_idf

<72x21870 sparse matrix of type '<class 'numpy.float64'>'
	with 362280 stored elements in Compressed Sparse Row format>

In [7]:
# Editor names stored with the model; used as the `choices` of the keyword
# model and as the keys of the recommendation tuples.
editor_names = model_dict['editor_names']
# editor_names

In [8]:
# Editor person ids stored with the model; paired by position with editor_names below.
editor_person_ids = model_dict['editor_person_ids']
# editor_person_ids

In [9]:
# Lookup table from editor name to person id; the two lists are paired by position.
editor_person_id_by_name_map = {
    editor_name: editor_person_id
    for editor_name, editor_person_id in zip(editor_names, editor_person_ids)
}
# editor_person_id_by_name_map

In [10]:
# Build the weighted keyword model from the editor TF-IDF matrix, using the
# editor names as the choices to rank.
# NOTE(review): .todense() materialises the whole sparse matrix in memory
# (72x21870 in the recorded run) - confirm this stays acceptable as it grows.
weighted_keyword_valid_model = WeightedKeywordModel.from_tf_matrix(
    editor_tf_idf.todense(),
    vectorizer=editor_tf_idf_vectorizer,
    choices=editor_names
)
# weighted_keyword_valid_model

In [11]:
# Bind project_id once so the query cells below only have to pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [12]:
# Values substituted into the placeholders of the SQL templates.
default_query_props = {
    'project': project_id,
    'dataset': source_dataset,
}

In [13]:
# Fetch the manuscript versions to recommend senior editors for; the SQL
# template's placeholders are filled from default_query_props.
manuscript_version_for_recommendation_df = read_big_query(
    get_sql('manuscript-version-initial-submissions-for-senior-editor-recommendation.sql').format(
        **default_query_props
    )
)
# print(len(manuscript_version_for_recommendation_df))
# manuscript_version_for_recommendation_df.head()

> ```sql
> -- Main features:
> --    - Returns Initial Submissions for the purpose of Senior Editor recommendation
> --    - No older than a year
> --    - Not have Senior Editor assigned for more than 30 days
> 
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     manuscript_abstract_keywords.manuscript_id AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.de_dev.mv_manuscript_version` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND (
>     ARRAY_LENGTH(version.senior_editors) = 0
>     OR TIMESTAMP_DIFF(
>       CURRENT_TIMESTAMP,
>       (SELECT MAX(last_assigned_timestamp) FROM UNNEST(version.senior_editors)),
>       DAY
>     ) < 30
>   )
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
>   AND NOT is_withdrawn
>   AND NOT is_deleted
> ```

In [14]:
# Cosine similarity between every manuscript's extracted keywords (vectorised
# with the editor vectorizer) and every row of the editor TF-IDF matrix.
# The result has one row per manuscript and one column per editor_tf_idf row.
keyword_similarity = cosine_similarity(
    editor_tf_idf_vectorizer.transform(
        manuscript_version_for_recommendation_df
        ['extracted_keywords']
    ),
    editor_tf_idf
)
# print(keyword_similarity.max())
# keyword_similarity

In [15]:
# Spot check: the keyword model's proba_matrix for the first manuscript only.
weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords'][:1],
).proba_matrix

matrix([[0.05729369, 0.14320046, 0.07224609, 0.10036872, 0.04462025,
         0.04714328, 0.22195058, 0.12048519, 0.14570821, 0.05358157,
         0.07877081, 0.05734939, 0.04469389, 0.05988373, 0.07402032,
         0.03682689, 0.08927674, 0.04702045, 0.05672794, 0.06213534,
         0.0536982 , 0.12198293, 0.17261758, 0.04555791, 0.06586447,
         0.12794226, 0.06602591, 0.06560843, 0.14865797, 0.05176305,
         0.07114613, 0.04574581, 0.13757111, 0.04095361, 0.13444617,
         0.24022015, 0.06047642, 0.06682273, 0.06840197, 0.1713528 ,
         0.04220274, 0.09644525, 0.05549118, 0.05630301, 0.05294448,
         0.04739091, 0.08189801, 0.14288646, 0.0673432 , 0.11320263,
         0.04706534, 0.06028706, 0.07645614, 0.05797921, 0.06714612,
         0.07376837, 0.1228253 , 0.05903231, 0.07535301, 0.12795652,
         0.05065281, 0.17554124, 0.03243371, 0.06103372, 0.06801791,
         0.0602241 , 0.11137763, 0.154268  , 0.09766311, 0.07047984,
         0.06409865, 0.06348857]])

In [16]:
# Matching keywords reported by the keyword model for all manuscripts.
# The recorded preview shows, per manuscript, nested lists of (score, keyword) tuples.
manuscript_matching_keywords_list = weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords']
).matching_keywords_list
# Preview the first five manuscripts only.
pd.Series(manuscript_matching_keywords_list[:5])

0    [[(0.009742938450779979, result), (0.004467759...
1    [[(0.01241493083998792, result), (0.0089232315...
2    [[(0.010565836818567462, result), (0.005943283...
3    [[(0.006733348606335727, gene), (0.00673334860...
4    [[(0.023284361697453057, cell), (0.01222169681...
dtype: object

In [17]:
def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        editors_matching_keywords_list: List[List[List[Tuple[float, str]]]],
        indices: List[T],
        threshold: float = 0.5
) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """Select and rank the candidates of each row whose score reaches the threshold.

    Args:
        proba_matrix: one row per item; each row holds one score per candidate.
        editors_matching_keywords_list: parallel to ``proba_matrix``; for each
            item and each candidate, the ``(score, keyword)`` pairs that matched.
        indices: the candidate keys, in the column order of ``proba_matrix``.
        threshold: minimum score (inclusive) for a candidate to be kept.

    Returns:
        One list per row of ``proba_matrix``. Each entry is a tuple
        ``(score, key, summed matching keyword score, matching keywords)``.
        Entries are ordered by descending tuple comparison, i.e. highest score
        first and ties broken by the key in descending order.
    """
    return [
        sorted(
            (
                (
                    probability,
                    key,
                    # total weight of the keywords shared with this candidate
                    sum(keyword_score for keyword_score, _ in editor_matching_keywords),
                    editor_matching_keywords
                )
                for probability, key, editor_matching_keywords in zip(
                    row,
                    indices,
                    editors_matching_keywords
                )
                if probability >= threshold
            ),
            reverse=True
        )
        for row, editors_matching_keywords in zip(
            proba_matrix,
            editors_matching_keywords_list
        )
    ]


# Rank editors per manuscript. Note that the score used for filtering and
# ordering is the cosine similarity (keyword_similarity), while the matching
# keywords come from the weighted keyword model.
# The resulting Series shares the index of the manuscript frame so that it can
# be joined back to the version ids.
prediction_results_with_similarity = pd.Series(
    get_recommended_editors_with_probability(
        keyword_similarity,
        manuscript_matching_keywords_list,
        editor_names,
        threshold=0.001  # NOTE(review): hard-coded cut-off - consider making it a notebook parameter
    ),
    index=manuscript_version_for_recommendation_df.index
)
# print(prediction_results_with_similarity[0])
# prediction_results_with_similarity[:5]

In [18]:
# Put each manuscript's version id next to its ranked recommendations
# (column 'prediction'); both parts share the manuscript frame's index.
prediction_results_df = pd.concat(
    [
        manuscript_version_for_recommendation_df['version_id'],
        prediction_results_with_similarity.rename('prediction'),
    ],
    axis='columns'
)
# print(len(prediction_results_df))
# prediction_results_df.head()

In [None]:
# prediction_results_df['prediction'][0]

In [19]:
# Flatten to one row per (manuscript version, recommended editor).
# The column list is passed explicitly so that the frame keeps its schema even
# when no editor reached the threshold for any manuscript; an empty record list
# would otherwise yield a frame without columns and break the cells below.
prediction_results_flat_df = pd.DataFrame(
    [
        {
            'version_id': row.version_id,
            'score': score,
            'name': name,
            'person_id': editor_person_id_by_name_map[name],
            'matching_keyword_score': matching_keyword_score,
            'matching_keywords': [
                {'score': keyword_score, 'keyword': keyword}
                for keyword_score, keyword in matching_keywords
            ],
        }
        for row in prediction_results_df.itertuples()
        # each prediction is (score, name, summed keyword score, matching keywords)
        for score, name, matching_keyword_score, matching_keywords in row.prediction
    ],
    columns=[
        'version_id',
        'score',
        'name',
        'person_id',
        'matching_keyword_score',
        'matching_keywords',
    ]
)
# print(len(prediction_results_flat_df))
# prediction_results_flat_df.head()

In [20]:
# Number of distinct manuscript versions that received at least one recommendation.
prediction_results_flat_df['version_id'].nunique()

24

In [21]:
# Column-wise maxima as a sanity check. 'matching_keywords' holds lists of
# dicts, which cannot be ordered; including it in the reduction triggers the
# pandas nuisance-column deprecation warning (a TypeError from pandas 2.0 on),
# so it is excluded explicitly. errors='ignore' keeps this working should the
# column be absent.
prediction_results_flat_df.drop(columns=['matching_keywords'], errors='ignore').max()

  """Entry point for launching an IPython kernel.


version_id                72268/2021-07-16T19:01:37Z
score                                       0.304983
name                                   Wendy Garrett
person_id                                      97155
matching_keyword_score                      0.304983
dtype: object

In [22]:
# Write the flattened recommendations to the output table.
print('writing to:', recommendation_output_table_name)
# NOTE(review): if_exists='replace' presumably overwrites the whole table on
# every run - confirm against data_science_pipeline.utils.bq.to_gbq.
to_gbq(
    prediction_results_flat_df,
    recommendation_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')

writing to: de_dev.data_science_editor_recommendation


INFO:root:Processing line 1000
INFO:root:Processed 1725 lines
INFO:data_science_pipeline.utils.bq:loading from /var/folders/gz/3b4st4q56pd1983txrbt2yyw0000gn/T/tmpa_3iplwk/data.jsonl.gz
INFO:data_science_pipeline.utils.bq:Loaded 1725 rows into de_dev:data_science_editor_recommendation.


done
