In [1]:
# Notebook parameters.
# Project passed to the BigQuery read helper and to `to_gbq` below.
project_id = 'elife-data-pipeline'
# Dataset substituted into the input SQL (read side).
source_dataset = 'prod'
# Dataset and table-name prefix used to build the output table name.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# NOTE(review): manuscript_min_tf / manuscript_max_tf are not referenced by any
# visible cell of this notebook - presumably kept for parity with sibling
# notebooks; confirm before removing.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location under which 'senior_editor_model.joblib' is looked up.
state_path = 's3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev'

In [2]:
import os
import posixpath
from functools import partial
from typing import List, Tuple, TypeVar

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# `state_path` may be a URL (e.g. s3://...), so join with an explicit '/'
# separator. os.path.join would use the OS-specific separator (a backslash on
# Windows) and produce an invalid object key; posixpath.join is identical to
# os.path.join on POSIX systems.
model_path = posixpath.join(state_path, 'senior_editor_model.joblib')
# Output table name in the form <dataset>.<prefix>editor_recommendation.
recommendation_output_table_name = '{output_dataset}.{prefix}{suffix}'.format(
    output_dataset=output_dataset,
    prefix=output_table_prefix,
    suffix='editor_recommendation'
)

In [4]:
# Load the model object from `model_path`; the cells below read four entries
# from it (vectorizer, editor TF-IDF matrix, editor names, editor person ids).
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
# Displayed as a quick check of which entries the loaded object contains.
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev/senior_editor_model.joblib


dict_keys(['editor_tf_idf_vectorizer', 'doc_length_weighted_editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [5]:
# Vectorizer entry of the model; used further down to transform the manuscript
# keyword lists before computing similarities.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7fbb35415cb0>)

In [6]:
# Editor-side matrix that the manuscript vectors are compared against with
# cosine similarity further down.
# NOTE(review): presumably one row per editor, in the same order as
# `editor_names` / `editor_person_ids` - confirm against the training notebook.
doc_length_weighted_editor_tf_idf = model_dict['doc_length_weighted_editor_tf_idf']
doc_length_weighted_editor_tf_idf

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00139157, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00150745, 0.00109185, 0.        , ..., 0.        , 0.00292359,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.01753493,
         0.        ],
        [0.        , 0.0010018 , 0.        , ..., 0.00068432, 0.00076642,
         0.        ],
        [0.00065092, 0.00141439, 0.        , ..., 0.        , 0.00216414,
         0.        ]])

In [7]:
# Editor names stored with the model; used below as the labels attached to
# each similarity score.
editor_names = model_dict['editor_names']
editor_names

0           Aleksandra Walczak
1                  Andrew King
2               Anna Akhmanova
3     Barbara Shinn-Cunningham
4                Carla Rothlin
                ...           
60                 Tamar Makin
61             Timothy Behrens
62              Utpal Banerjee
63              Vivek Malhotra
64               Wendy Garrett
Name: name, Length: 65, dtype: object

In [8]:
# Editor person ids stored with the model; paired positionally with
# `editor_names` in the next cell to build a name -> id lookup.
editor_person_ids = model_dict['editor_person_ids']
editor_person_ids

0     50904
1     14601
2      8518
3     19576
4     44396
      ...  
60    18331
61     1044
62     1042
63     1133
64    28627
Name: person_id, Length: 65, dtype: object

In [9]:
# Lookup table: editor name -> person id (the two series are paired by position).
editor_person_id_by_name_map = {
    editor_name: person_id
    for editor_name, person_id in zip(editor_names, editor_person_ids)
}
editor_person_id_by_name_map

{'Aleksandra Walczak': '50904',
 'Andrew King': '14601',
 'Anna Akhmanova': '8518',
 'Barbara Shinn-Cunningham': '19576',
 'Carla Rothlin': '44396',
 'Catherine Dulac': '1014',
 'Chris Baker': '28129',
 'Christian Büchel': '16197',
 'Christian Hardtke': '1102',
 'Christian Rutz': '15332',
 'Clifford Rosen': '48282',
 'Cynthia Wolberger': '5005',
 'David Ron': '1174',
 'Detlef Weigel': '1030',
 'Didier Stainier': '7189',
 'Diethard Tautz': '1191',
 'Dominique Soldati-Favre': '41647',
 'Eduardo Franco': '1086',
 'Edward Morrisey': '55645',
 'Floris de Lange': '28130',
 'Gary Westbrook': '1202',
 'George Perry': '42011',
 'Gisela Storz': '1188',
 'Huda Zoghbi': '1029',
 'James Manley': '1020',
 'Jessica Tyler': '1421',
 'John Huguenard': '13947',
 'John Kuriyan': '1018',
 'Jonathan Cooper': '1062',
 'Jos van der Meer': '41027',
 'Joshua Gold': '17965',
 'José Faraldo-Gómez': '13987',
 'K VijayRaghavan': '1027',
 'Karla Kirkegaard': '3645',
 'Kate Wassum': '33127',
 'Kathryn Cheah': '91149

In [10]:
# Bind the project once so that later calls only need to pass the SQL text.
read_big_query = partial(_read_big_query, project_id=project_id)

In [11]:
# Placeholder values substituted into the SQL templates via str.format.
default_query_props = {
    'project': project_id,
    'dataset': source_dataset,
}

In [12]:
# Render the SQL template with the project/dataset placeholders, then run it.
recommendation_input_sql = get_sql(
    'manuscript-version-initial-submissions-for-senior-editor-recommendation.sql'
).format(**default_query_props)
manuscript_version_for_recommendation_df = read_big_query(recommendation_input_sql)
manuscript_version_for_recommendation_df.head()

> ```sql
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     LPAD(CAST(manuscript_abstract_keywords.manuscript_id AS STRING), 5, '0') AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.prod.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.prod.mv_manuscript_version` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND ARRAY_LENGTH(version.senior_editors) = 0
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
> ```

Downloading: 100%|██████████| 135/135 [00:01<00:00, 118.61rows/s]


Unnamed: 0,version_id,extracted_keywords
0,48817/2019-05-26T10:20:36Z,"[-30, -30 high-quality somatic mutation, 2mg, ..."
1,48944/2019-05-31T08:14:54Z,"[-use, approach, aspect, assessment, benchmark..."
2,49386/2019-06-16T17:54:43Z,"[action, activation, activity, adrenergic, adr..."
3,49454/2019-06-18T14:49:43Z,"[activation, activation connectivity, adult, a..."
4,49719/2019-06-26T19:20:19Z,"[abrupt, abrupt transition, abundance, balance..."


In [13]:
# Vectorize each manuscript's extracted keywords with the editor vectorizer so
# that manuscripts and editors share the same feature space.
manuscript_keyword_tf_idf = editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df['extracted_keywords']
)

# The stored editor matrix may be an `np.matrix`. scikit-learn deprecated
# np.matrix inputs in 1.0 and rejects them in later releases, so hand it a
# plain ndarray instead. Other types (e.g. ndarray, scipy sparse) pass through
# untouched.
editor_tf_idf = doc_length_weighted_editor_tf_idf
if isinstance(editor_tf_idf, np.matrix):
    editor_tf_idf = np.asarray(editor_tf_idf)

# Rows follow the manuscripts, columns follow the rows of the editor matrix.
keyword_similarity = cosine_similarity(manuscript_keyword_tf_idf, editor_tf_idf)
print(keyword_similarity.max())
keyword_similarity

0.3355734086321363


array([[0.08878421, 0.05283914, 0.09983064, ..., 0.11813633, 0.08211483,
        0.11671405],
       [0.05378891, 0.03104301, 0.04404304, ..., 0.02308523, 0.0342086 ,
        0.03625382],
       [0.07517379, 0.07506359, 0.13640605, ..., 0.11187638, 0.16645181,
        0.08188081],
       ...,
       [0.0872426 , 0.05539434, 0.10876117, ..., 0.07425155, 0.10561886,
        0.07420016],
       [0.10689867, 0.07306304, 0.18234542, ..., 0.20835954, 0.16439594,
        0.12621397],
       [0.05188661, 0.0316176 , 0.03079543, ..., 0.02175103, 0.02470023,
        0.02204759]])

In [14]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        indices: List[T],
        threshold: float = 0.5) -> List[List[Tuple[float, T]]]:
    """Select and rank the keys whose score reaches ``threshold``, row by row.

    Args:
        proba_matrix: iterable of rows; each row holds one score per entry of
            ``indices`` (paired by position).
        indices: the keys (e.g. editor names) the columns of a row refer to.
        threshold: minimum score (inclusive) for a key to be kept.

    Returns:
        One list per row of ``proba_matrix`` containing ``(score, key)`` tuples
        sorted by descending score. Keys with equal scores keep the order in
        which they appear in ``indices``.
    """
    return [
        sorted(
            ((score, key) for score, key in zip(row, indices) if score >= threshold),
            # Sort on the score only: comparing whole tuples would fall back to
            # comparing the keys on ties, which raises TypeError for keys that
            # are not orderable (T is unconstrained).
            key=lambda scored_key: scored_key[0],
            reverse=True
        )
        for row in proba_matrix
    ]


# Rank editors per manuscript; the low threshold (0.001) only drops editors
# whose similarity is below that value.
recommended_editors_by_manuscript = get_recommended_editors_with_probability(
    keyword_similarity,
    editor_names,
    threshold=0.001
)
# Re-use the manuscript frame's index so the results line up with its rows.
prediction_results_with_similarity = pd.Series(
    recommended_editors_by_manuscript,
    index=manuscript_version_for_recommendation_df.index
)
prediction_results_with_similarity[:5]

0    [(0.16204785769215999, Detlef Weigel), (0.1547...
1    [(0.06987810914346426, Naama Barkai), (0.06429...
2    [(0.20862855278981354, Olga Boudker), (0.18494...
3    [(0.19168306173307847, Richard Ivry), (0.18664...
4    [(0.16059355415897233, Naama Barkai), (0.15650...
dtype: object

In [15]:
# Put each version id next to its ranked editor list; both series share the
# manuscript frame's index, so they line up row by row.
prediction_results_df = pd.DataFrame({
    'version_id': manuscript_version_for_recommendation_df['version_id'],
    'prediction': prediction_results_with_similarity,
})
print(len(prediction_results_df))
prediction_results_df.head()

135


Unnamed: 0,version_id,prediction
0,48817/2019-05-26T10:20:36Z,"[(0.16204785769215999, Detlef Weigel), (0.1547..."
1,48944/2019-05-31T08:14:54Z,"[(0.06987810914346426, Naama Barkai), (0.06429..."
2,49386/2019-06-16T17:54:43Z,"[(0.20862855278981354, Olga Boudker), (0.18494..."
3,49454/2019-06-18T14:49:43Z,"[(0.19168306173307847, Richard Ivry), (0.18664..."
4,49719/2019-06-26T19:20:19Z,"[(0.16059355415897233, Naama Barkai), (0.15650..."


In [16]:
# Inspect the full ranked editor list of the row labelled 0.
prediction_results_df.loc[0, 'prediction']

[(0.16204785769215999, 'Detlef Weigel'),
 (0.15472611005471554, 'Patricia Wittkopp'),
 (0.14920109988116995, 'Kevin Struhl'),
 (0.1478960311543259, 'Kathryn Cheah'),
 (0.14327516597743065, 'Päivi Ojala'),
 (0.14257917621506616, 'James Manley'),
 (0.14237075188197826, 'Maureen Murphy'),
 (0.14211589093427718, 'Diethard Tautz'),
 (0.14071926720906205, 'Jessica Tyler'),
 (0.13737272099320416, 'Richard White'),
 (0.13363492961563422, 'Michael Eisen'),
 (0.12922471460293097, 'Marianne Bronner'),
 (0.12921017934711362, 'Didier Stainier'),
 (0.12658971733858476, 'Naama Barkai'),
 (0.12536602773063374, 'Huda Zoghbi'),
 (0.12217071521975853, 'Edward Morrisey'),
 (0.11813632909597148, 'Utpal Banerjee'),
 (0.11671405412063869, 'Wendy Garrett'),
 (0.11496539757376192, 'Jonathan Cooper'),
 (0.11422921816993711, 'Satyajit Rath'),
 (0.11196901674892172, 'Tadatsugu Taniguchi'),
 (0.10656955986230698, 'Gisela Storz'),
 (0.10594847535409728, 'Clifford Rosen'),
 (0.10575156333079008, 'Jos van der Meer'),

In [17]:
# Flatten to one row per (manuscript version, recommended editor) pair.
prediction_results_flat_df = pd.DataFrame([
    {
        'version_id': version_row.version_id,
        'score': score,
        'name': editor_name,
        'person_id': editor_person_id_by_name_map[editor_name]
    }
    for version_row in prediction_results_df.itertuples()
    # Each prediction entry is a (score, editor name) tuple.
    for score, editor_name in version_row.prediction
])
print(len(prediction_results_flat_df))
prediction_results_flat_df.head()

8639


Unnamed: 0,version_id,score,name,person_id
0,48817/2019-05-26T10:20:36Z,0.162048,Detlef Weigel,1030
1,48817/2019-05-26T10:20:36Z,0.154726,Patricia Wittkopp,42157
2,48817/2019-05-26T10:20:36Z,0.149201,Kevin Struhl,1189
3,48817/2019-05-26T10:20:36Z,0.147896,Kathryn Cheah,91149
4,48817/2019-05-26T10:20:36Z,0.143275,Päivi Ojala,124135


In [18]:
# Persist the flattened recommendations to BigQuery.
# if_exists='replace' overwrites an existing table, so after a run the table
# holds only this run's recommendations.
print('writing to:', recommendation_output_table_name)
prediction_results_flat_df.to_gbq(
    recommendation_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')

writing to: de_dev.data_science_editor_recommendation


1it [00:04,  4.77s/it]

done



