In [1]:
# Run configuration for this notebook.
# Project passed to the BigQuery read helper and to the final table write.
project_id = 'elife-data-pipeline'
# Dataset substituted into the source SQL template.
source_dataset = 'de_dev'
# Dataset and table-name prefix combined into the output table name.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# NOTE(review): manuscript_min_tf / manuscript_max_tf are not referenced by any
# of the visible cells — presumably kept for parameter compatibility; confirm.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location of the persisted state; the model file name is joined onto it.
state_path = 's3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev'

In [2]:
import os
from functools import partial
from itertools import groupby
from typing import List, Tuple, TypeVar

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query
)
from data_science_pipeline.peerscout.models import (
    WeightedKeywordModel
)

In [3]:
# Location of the serialized senior editor model below the state path.
model_path = os.path.join(state_path, 'senior_editor_model.joblib')
# Output table in the form "<dataset>.<prefix>editor_recommendation".
recommendation_output_table_name = '{}.{}{}'.format(
    output_dataset,
    output_table_prefix,
    'editor_recommendation'
)

In [4]:
# Load the persisted model state; the trailing expression lists the entries it holds.
print('loading model from:', model_path)
# NOTE(review): the '.joblib' suffix suggests a pickle-based payload — only load
# from a trusted location; confirm what load_object_from does under the hood.
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev/senior_editor_model.joblib


dict_keys(['editor_tf_idf_vectorizer', 'editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [5]:
# Vectorizer stored with the model; used below for its feature names and to
# transform the manuscripts' extracted keywords.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f8f2137cdd0>)

In [6]:
# Sparse TF-IDF matrix stored with the model; densified below for the keyword
# model and compared against the manuscript vectors via cosine similarity.
editor_tf_idf = model_dict['editor_tf_idf']
editor_tf_idf

<65x20373 sparse matrix of type '<class 'numpy.float64'>'
	with 320085 stored elements in Compressed Sparse Row format>

In [7]:
# Editor names stored with the model; used below as the ranking choices and as
# the labels attached to each recommendation.
editor_names = model_dict['editor_names']
editor_names

0           Aleksandra Walczak
1                  Andrew King
2               Anna Akhmanova
3     Barbara Shinn-Cunningham
4                Carla Rothlin
                ...           
60                 Tamar Makin
61             Timothy Behrens
62              Utpal Banerjee
63              Vivek Malhotra
64               Wendy Garrett
Name: name, Length: 65, dtype: object

In [8]:
# Editor person ids stored with the model; paired by position with editor_names
# in the next cell.
editor_person_ids = model_dict['editor_person_ids']
editor_person_ids

0     50904
1     14601
2      8518
3     19576
4     44396
      ...  
60    18331
61     1044
62     1042
63     1133
64    28627
Name: person_id, Length: 65, dtype: object

In [9]:
# Lookup from editor name to person id, pairing both sequences by position.
# Note: a repeated name would keep only the last person id seen for it.
editor_person_id_by_name_map = {
    editor_name: editor_person_id
    for editor_name, editor_person_id in zip(editor_names, editor_person_ids)
}
editor_person_id_by_name_map

{'Aleksandra Walczak': '50904',
 'Andrew King': '14601',
 'Anna Akhmanova': '8518',
 'Barbara Shinn-Cunningham': '19576',
 'Carla Rothlin': '44396',
 'Catherine Dulac': '1014',
 'Chris Baker': '28129',
 'Christian Büchel': '16197',
 'Christian Hardtke': '1102',
 'Christian Rutz': '15332',
 'Clifford Rosen': '48282',
 'Cynthia Wolberger': '5005',
 'David Ron': '1174',
 'Detlef Weigel': '1030',
 'Didier Stainier': '7189',
 'Diethard Tautz': '1191',
 'Dominique Soldati-Favre': '41647',
 'Eduardo Franco': '1086',
 'Edward Morrisey': '55645',
 'Floris de Lange': '28130',
 'Gary Westbrook': '1202',
 'George Perry': '42011',
 'Gisela Storz': '1188',
 'Huda Zoghbi': '1029',
 'James Manley': '1020',
 'Jessica Tyler': '1421',
 'John Huguenard': '13947',
 'John Kuriyan': '1018',
 'Jonathan Cooper': '1062',
 'Jos van der Meer': '41027',
 'Joshua Gold': '17965',
 'José Faraldo-Gómez': '13987',
 'K VijayRaghavan': '1027',
 'Karla Kirkegaard': '3645',
 'Kate Wassum': '33127',
 'Kathryn Cheah': '91149

In [10]:
# Build the weighted keyword model from the dense editor TF-IDF matrix, the
# vectorizer's vocabulary and the editor names as the choices to rank.
#
# scikit-learn 1.0 deprecated `get_feature_names` (removed in 1.2) in favour of
# `get_feature_names_out`; use whichever the installed version provides so the
# notebook keeps running across scikit-learn versions.
if hasattr(editor_tf_idf_vectorizer, 'get_feature_names_out'):
    # `get_feature_names_out` returns an ndarray; keep passing a plain list.
    editor_feature_names = list(editor_tf_idf_vectorizer.get_feature_names_out())
else:
    editor_feature_names = editor_tf_idf_vectorizer.get_feature_names()

weighted_keyword_valid_model = WeightedKeywordModel.from_tf_matrix(
    editor_tf_idf.todense(),
    editor_feature_names,
    choices=editor_names
)
weighted_keyword_valid_model

<data_science_pipeline.peerscout.models.WeightedKeywordModel at 0x7f8f203dcc10>

In [11]:
# Bind the project id once so later calls only need to pass the query.
read_big_query = partial(_read_big_query, project_id=project_id)

In [12]:
# Placeholder values substituted into the SQL templates.
default_query_props = {
    'project': project_id,
    'dataset': source_dataset
}

In [13]:
# Render the SQL template with the project/dataset placeholders, then run it.
manuscript_version_for_recommendation_query = get_sql(
    'manuscript-version-initial-submissions-for-senior-editor-recommendation.sql'
).format(**default_query_props)
manuscript_version_for_recommendation_df = read_big_query(
    manuscript_version_for_recommendation_query
)
manuscript_version_for_recommendation_df.head()

> ```sql
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     LPAD(CAST(manuscript_abstract_keywords.manuscript_id AS STRING), 5, '0') AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.de_dev.mv_manuscript_version` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND ARRAY_LENGTH(version.senior_editors) = 0
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
> ```

Downloading: 100%|██████████| 135/135 [00:01<00:00, 112.03rows/s]


Unnamed: 0,version_id,extracted_keywords
0,49586/2019-06-22T03:44:31Z,"[ac, advantage, affective, affective value, am..."
1,50199/2019-07-14T19:30:01Z,"[agent, cardiopulmonary, cardiopulmonary syndr..."
2,51204/2019-08-19T14:01:01Z,"[etiology, extent, impairment, infertility, le..."
3,51769/2019-09-10T09:20:03Z,"[3'-untranslate, 3'-untranslated region, 3'-ut..."
4,52332/2019-09-30T11:45:20Z,"[584e, 584e mutation, 592a, 592a mutation, aci..."


In [14]:
# Vectorize each manuscript's extracted keywords with the editors' vectorizer,
# then compute the cosine similarity of every manuscript against every row of
# the editor TF-IDF matrix.
manuscript_keyword_tf_idf = editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
keyword_similarity = cosine_similarity(manuscript_keyword_tf_idf, editor_tf_idf)
print(keyword_similarity.max())
keyword_similarity

0.3355734086321362


array([[0.08724956, 0.27371087, 0.08635848, ..., 0.07420609, 0.07733345,
        0.07454804],
       [0.09197185, 0.04039304, 0.09755077, ..., 0.07793741, 0.10870541,
        0.12053354],
       [0.01058842, 0.01726941, 0.01816204, ..., 0.03753092, 0.01436975,
        0.01733916],
       ...,
       [0.07797919, 0.08630291, 0.12348528, ..., 0.13660572, 0.11095575,
        0.10314351],
       [0.08109098, 0.07823628, 0.08013092, ..., 0.07684354, 0.07144866,
        0.10281719],
       [0.12926069, 0.07417636, 0.20160478, ..., 0.17831767, 0.19024005,
        0.12596202]])

In [15]:
# Spot check: keyword-model scores for the first manuscript only.
first_manuscript_keywords = manuscript_version_for_recommendation_df['extracted_keywords'][:1]
weighted_keyword_valid_model.predict_ranking(first_manuscript_keywords).proba_matrix

matrix([[1.23474042, 2.84311651, 1.20985254, 2.05520185, 0.7503078 ,
         2.47906964, 1.41891987, 1.95863712, 0.95806543, 1.45968371,
         1.00688296, 0.88543088, 0.96588022, 1.21656378, 1.32762174,
         1.22820888, 0.69355899, 0.97510592, 0.98089017, 1.82542398,
         2.28341128, 0.67742896, 0.99316872, 1.75930561, 0.91770999,
         0.90554284, 2.28266055, 0.9825291 , 1.20290148, 0.86262888,
         2.14931088, 1.04903341, 1.69767738, 0.72072373, 2.41396749,
         1.15344412, 1.04331859, 0.94440715, 2.21243624, 1.45074805,
         0.87308664, 0.82365637, 1.13779278, 2.11280433, 1.07529873,
         0.49113081, 1.4163573 , 0.78685302, 1.19613395, 1.16837994,
         1.20835553, 1.49813334, 0.80663805, 1.39214892, 1.96865043,
         0.87258903, 2.48017331, 1.02358521, 1.10049191, 1.05533646,
         1.46568891, 2.17977357, 1.04736322, 1.08527833, 1.06366756]])

In [16]:
# Rank all manuscripts and keep the matching keywords reported by the ranking
# result; preview the first five entries.
manuscript_ranking = weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
manuscript_matching_keywords_list = manuscript_ranking.matching_keywords_list
pd.Series(manuscript_matching_keywords_list[:5])

0    [[(0.12628766459796698, result), (0.1058085838...
1    [[(0.25649130421417055, cell), (0.100516862462...
2    [[(0.015208369622218892, signaling), (0.010313...
3    [[(0.11263494410088948, mechanism), (0.1005168...
4    [[(0.10051686246231009, interaction), (0.08447...
dtype: object

In [17]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        manuscript_matching_keywords_list: List[List[List[Tuple[float, str]]]],
        indices: List[T],
        threshold: float = 0.5
) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """Build the ranked list of recommended editors for every manuscript.

    Args:
        proba_matrix: one row of scores per manuscript, one score per editor.
        manuscript_matching_keywords_list: for every manuscript, one list per
            editor holding ``(keyword_score, keyword)`` pairs.
        indices: editor keys (e.g. names), in the same order as the score columns.
        threshold: minimum score (inclusive) for an editor to be kept.

    Returns:
        One list per manuscript containing
        ``(score, key, sum_of_keyword_scores, matching_keywords)`` tuples,
        sorted in descending tuple order (highest score first; ties are
        broken by the key, then by the remaining tuple items).
    """
    # Materialize once: `indices` is re-iterated for every manuscript row, so a
    # one-shot iterable would otherwise be exhausted after the first row.
    editor_keys = list(indices)
    recommendations = []
    for scores, editors_matching_keywords in zip(
            proba_matrix, manuscript_matching_keywords_list):
        candidates = [
            (
                score,
                key,
                sum(keyword_score for keyword_score, _ in matching_keywords),
                matching_keywords
            )
            for score, key, matching_keywords in zip(
                scores, editor_keys, editors_matching_keywords
            )
            if score >= threshold
        ]
        candidates.sort(reverse=True)
        recommendations.append(candidates)
    return recommendations


# Rank editors per manuscript using the cosine similarity as the score; only
# editors scoring below the 0.001 threshold are dropped.
recommended_editors_by_manuscript = get_recommended_editors_with_probability(
    keyword_similarity,
    manuscript_matching_keywords_list,
    editor_names,
    threshold=0.001
)
# Keep the manuscript frame's index so the results can be joined back onto it.
prediction_results_with_similarity = pd.Series(
    recommended_editors_by_manuscript,
    index=manuscript_version_for_recommendation_df.index
)
prediction_results_with_similarity[:5]

0    [(0.2737108728328221, Andrew King, 2.843116508...
1    [(0.24878086591486592, Karla Kirkegaard, 2.560...
2    [(0.04398606884115359, Michael Eisen, 0.135735...
3    [(0.21636200978749898, James Manley, 2.1179212...
4    [(0.25091686324207785, Dominique Soldati-Favre...
dtype: object

In [18]:
# Put each manuscript version id next to its ranked editor recommendations.
prediction_results_df = pd.concat(
    [
        manuscript_version_for_recommendation_df['version_id'],
        prediction_results_with_similarity.rename('prediction'),
    ],
    axis='columns'
)
print(len(prediction_results_df))
prediction_results_df.head()

135


Unnamed: 0,version_id,prediction
0,49586/2019-06-22T03:44:31Z,"[(0.2737108728328221, Andrew King, 2.843116508..."
1,50199/2019-07-14T19:30:01Z,"[(0.24878086591486592, Karla Kirkegaard, 2.560..."
2,51204/2019-08-19T14:01:01Z,"[(0.04398606884115359, Michael Eisen, 0.135735..."
3,51769/2019-09-10T09:20:03Z,"[(0.21636200978749898, James Manley, 2.1179212..."
4,52332/2019-09-30T11:45:20Z,"[(0.25091686324207785, Dominique Soldati-Favre..."


In [19]:
# Inspect the full recommendation list of the first manuscript (index label 0).
prediction_results_df.loc[0, 'prediction']

[(0.2737108728328221,
  'Andrew King',
  2.843116508907422,
  [(0.2874490016491514, 'auditory'),
   (0.19231404955399775, 'cortex'),
   (0.16420313052910335, 'neuron'),
   (0.15893865330137735, 'auditory cortex'),
   (0.14627558007783445, 'result'),
   (0.11257152936865601, 'mouse'),
   (0.11147258765963573, 'cortical'),
   (0.10753412926906904, 'information'),
   (0.10051046502840401, 'input'),
   (0.07507595812370023, 'signal'),
   (0.0694303444609076, 'function'),
   (0.06673402040417332, 'different'),
   (0.06565179566558534, 'behavior'),
   (0.06538585837580618, 'effect'),
   (0.06484155338243915, 'circuit'),
   (0.0636614845530076, 'functional'),
   (0.04946987073371848, 'inhibitory'),
   (0.048429436496369584, 'excitatory'),
   (0.04651158997866626, 'individual'),
   (0.04517911806987636, 'control'),
   (0.044772323616379124, 'spike'),
   (0.040401209586711624, 'projection'),
   (0.037537979061850114, 'specific'),
   (0.036280200874294656, 'type'),
   (0.035673777903757115, 'ima

In [20]:
# Flatten to one record per (manuscript version, recommended editor) pair.
prediction_results_flat_df = pd.DataFrame([
    {
        'version_id': manuscript.version_id,
        'score': editor_score,
        'name': editor_name,
        'person_id': editor_person_id_by_name_map[editor_name],
        'matching_keyword_score': keyword_score_sum,
        'matching_keywords': [
            {'score': keyword_score, 'keyword': keyword}
            for keyword_score, keyword in editor_keywords
        ],
    }
    for manuscript in prediction_results_df.itertuples()
    for editor_score, editor_name, keyword_score_sum, editor_keywords
    in manuscript.prediction
])
print(len(prediction_results_flat_df))
prediction_results_flat_df.head()

8639


Unnamed: 0,version_id,score,name,person_id,matching_keyword_score,matching_keywords
0,49586/2019-06-22T03:44:31Z,0.273711,Andrew King,14601,2.843117,"[{'score': 0.2874490016491514, 'keyword': 'aud..."
1,49586/2019-06-22T03:44:31Z,0.207845,Kate Wassum,33127,2.413967,"[{'score': 0.18800263313182172, 'keyword': 'ne..."
2,49586/2019-06-22T03:44:31Z,0.205554,John Huguenard,13947,2.282661,"[{'score': 0.16244064606460873, 'keyword': 'ne..."
3,49586/2019-06-22T03:44:31Z,0.204885,Catherine Dulac,1014,2.47907,"[{'score': 0.23865693081301326, 'keyword': 'ne..."
4,49586/2019-06-22T03:44:31Z,0.202564,Ronald Calabrese,1056,2.480173,"[{'score': 0.2649762007228597, 'keyword': 'neu..."


In [21]:
# Number of distinct manuscript versions that received at least one recommendation.
# NOTE(review): the recorded run shows 134 here versus 135 loaded rows — presumably
# a version with no editor above the threshold, or a duplicated version_id; confirm.
prediction_results_flat_df['version_id'].nunique()

134

In [22]:
# Column-wise maxima as a quick sanity check of the flattened results.
# 'matching_keywords' holds lists of dicts, which cannot be ordered: older
# pandas silently dropped such columns from reductions, newer pandas raises a
# TypeError instead, so exclude the column explicitly.
# Note: person_id values are strings, so their maximum is lexicographic.
prediction_results_flat_df.drop(columns=['matching_keywords']).max()

version_id                58638/2020-05-06T09:35:13Z
score                                       0.335573
name                                   Wendy Garrett
person_id                                      97155
matching_keyword_score                       3.68348
dtype: object

In [23]:
print('writing to:', recommendation_output_table_name)
to_gbq(
    prediction_results_flat_df,
    recommendation_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')

writing to: de_dev.data_science_editor_recommendation
done
