In [1]:
# Notebook parameters.
# Project id passed to the BigQuery read and write helpers and into the SQL template.
project_id = 'elife-data-pipeline'
# Dataset substituted into the SQL template that selects manuscript versions.
source_dataset = 'de_dev'
# Dataset and table-name prefix used to build the recommendation output table name.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# NOTE(review): the two values below are not referenced by any cell visible in
# this notebook — presumably kept for parity with a related notebook; confirm
# before removing.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location under which the senior editor model file is looked up.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [2]:
import os
from functools import partial
from itertools import groupby
from typing import List, Tuple, TypeVar

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query
)
from data_science_pipeline.peerscout.models import (
    WeightedKeywordModel
)

In [3]:
# Location of the senior editor model file inside the state directory.
model_path = os.path.join(state_path, 'senior_editor_model.joblib')
# Output table name in the form "<dataset>.<prefix>editor_recommendation".
recommendation_output_table_name = (
    f'{output_dataset}.{output_table_prefix}editor_recommendation'
)

In [4]:
# Load the model object (used as a dict by the following cells) and show
# which entries it provides.
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/senior_editor_model.joblib


dict_keys(['editor_tf_idf_vectorizer', 'editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [5]:
# Vectorizer stored with the model; reused below to transform the manuscript keywords.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f800c7bfdd0>)

In [6]:
# Editor TF-IDF matrix stored with the model; the manuscript vectors are
# compared against it below.
editor_tf_idf = model_dict['editor_tf_idf']
editor_tf_idf

<65x13658 sparse matrix of type '<class 'numpy.float64'>'
	with 218399 stored elements in Compressed Sparse Row format>

In [7]:
# Editor names stored with the model; used below as the model's `choices`
# and as the labels attached to each similarity score.
editor_names = model_dict['editor_names']
editor_names

0           Aleksandra Walczak
1                  Andrew King
2               Anna Akhmanova
3     Barbara Shinn-Cunningham
4                Carla Rothlin
                ...           
60                 Tamar Makin
61             Timothy Behrens
62              Utpal Banerjee
63              Vivek Malhotra
64               Wendy Garrett
Name: name, Length: 65, dtype: object

In [8]:
# Editor person ids stored with the model; paired with `editor_names` below.
editor_person_ids = model_dict['editor_person_ids']
editor_person_ids

0     50904
1     14601
2      8518
3     19576
4     44396
      ...  
60    18331
61     1044
62     1042
63     1133
64    28627
Name: person_id, Length: 65, dtype: object

In [9]:
# Lookup from editor name to person id, pairing the two sequences positionally.
editor_person_id_by_name_map = {
    editor_name: person_id
    for editor_name, person_id in zip(editor_names, editor_person_ids)
}
editor_person_id_by_name_map

{'Aleksandra Walczak': '50904',
 'Andrew King': '14601',
 'Anna Akhmanova': '8518',
 'Barbara Shinn-Cunningham': '19576',
 'Carla Rothlin': '44396',
 'Catherine Dulac': '1014',
 'Chris Baker': '28129',
 'Christian Büchel': '16197',
 'Christian Hardtke': '1102',
 'Christian Rutz': '15332',
 'Clifford Rosen': '48282',
 'Cynthia Wolberger': '5005',
 'David Ron': '1174',
 'Detlef Weigel': '1030',
 'Didier Stainier': '7189',
 'Diethard Tautz': '1191',
 'Dominique Soldati-Favre': '41647',
 'Eduardo Franco': '1086',
 'Edward Morrisey': '55645',
 'Floris de Lange': '28130',
 'Gary Westbrook': '1202',
 'George Perry': '42011',
 'Gisela Storz': '1188',
 'Huda Zoghbi': '1029',
 'James Manley': '1020',
 'Jessica Tyler': '1421',
 'John Huguenard': '13947',
 'John Kuriyan': '1018',
 'Jonathan Cooper': '1062',
 'Jos van der Meer': '41027',
 'Joshua Gold': '17965',
 'José Faraldo-Gómez': '13987',
 'K VijayRaghavan': '1027',
 'Karla Kirkegaard': '3645',
 'Kate Wassum': '33127',
 'Kathryn Cheah': '91149

In [10]:
# Build a WeightedKeywordModel from the stored editor TF-IDF matrix; it is used
# below to obtain the matching keywords for each manuscript/editor pair.
# NOTE(review): `todense()` returns a `numpy.matrix`; presumably `from_tf_matrix`
# expects that — confirm before switching to `toarray()`.
weighted_keyword_valid_model = WeightedKeywordModel.from_tf_matrix(
    editor_tf_idf.todense(),
    vectorizer=editor_tf_idf_vectorizer,
    choices=editor_names
)
weighted_keyword_valid_model

<data_science_pipeline.peerscout.models.WeightedKeywordModel at 0x7f800bf1de90>

In [11]:
# Pre-bind the project id so that later calls only need to pass the query.
read_big_query = partial(_read_big_query, project_id=project_id)

In [12]:
# Values substituted into the `{project}` / `{dataset}` placeholders of the SQL templates.
default_query_props = {'project': project_id, 'dataset': source_dataset}

In [13]:
# Render the SQL template for the manuscript versions to recommend editors for,
# then run it against BigQuery.
manuscript_version_for_recommendation_query = get_sql(
    'manuscript-version-initial-submissions-for-senior-editor-recommendation.sql'
).format(**default_query_props)
manuscript_version_for_recommendation_df = read_big_query(
    manuscript_version_for_recommendation_query
)
manuscript_version_for_recommendation_df.head()

> ```sql
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     manuscript_abstract_keywords.manuscript_id AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.de_dev.v_manuscript_version_last_editor_assigned_timestamp` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND (
>     ARRAY_LENGTH(version.senior_editors) = 0
>     OR TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 30
>   )
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
>   AND NOT is_withdrawn
>   AND NOT is_deleted
> ```

Downloading: 100%|██████████| 965/965 [00:01<00:00, 491.39rows/s]


Unnamed: 0,version_id,extracted_keywords
0,58953/2020-05-15T00:15:18Z,"[ability, approach, c-x-c, c-x-c chemokine rec..."
1,59230/2020-05-22T14:55:14Z,"[2,4,6-trinitrobenzene sulfonic acid, aberrant..."
2,59121/2020-05-19T22:01:28Z,"[-), -) mutant, activity, axon, axonal, axonal..."
3,59654/2020-06-03T23:01:27Z,"[aberrant, aberrant exon inclusion, antisense,..."
4,59637/2020-06-03T12:55:15Z,"[anomaly, approach, associate, associated cell..."


In [14]:
# Vectorize each manuscript's extracted keywords with the editors' vectorizer,
# then compare every manuscript with every editor
# (one row per manuscript, one column per editor).
manuscript_keyword_tf_idf = editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
keyword_similarity = cosine_similarity(manuscript_keyword_tf_idf, editor_tf_idf)
print(keyword_similarity.max())
keyword_similarity

0.5403891148811286


array([[0.10173447, 0.08286221, 0.13020518, ..., 0.12909765, 0.10421792,
        0.14073832],
       [0.06303509, 0.03751369, 0.10522161, ..., 0.08625391, 0.09066637,
        0.20432113],
       [0.07495633, 0.07819616, 0.17322239, ..., 0.09466413, 0.1396156 ,
        0.07152955],
       ...,
       [0.04096132, 0.06304018, 0.1176507 , ..., 0.09771696, 0.1297707 ,
        0.12786451],
       [0.12183249, 0.07543425, 0.15873632, ..., 0.19758136, 0.14146249,
        0.1558505 ],
       [0.08396773, 0.20030262, 0.09535832, ..., 0.08008529, 0.08168925,
        0.06656732]])

In [15]:
# Rank only the first manuscript with the keyword model and show its scores,
# for comparison with the first row of `keyword_similarity`.
first_manuscript_keywords = (
    manuscript_version_for_recommendation_df['extracted_keywords'][:1]
)
weighted_keyword_valid_model.predict_ranking(first_manuscript_keywords).proba_matrix

matrix([[0.10173447, 0.08286221, 0.13020518, 0.06402088, 0.08673865,
         0.12837785, 0.03316783, 0.09148853, 0.07098337, 0.08235923,
         0.13296573, 0.08065732, 0.11234088, 0.12716464, 0.1606169 ,
         0.090191  , 0.08867651, 0.1401539 , 0.14402271, 0.0627713 ,
         0.13255291, 0.02022985, 0.10257324, 0.15817278, 0.10851532,
         0.10561704, 0.08161497, 0.08191628, 0.14457223, 0.11328601,
         0.06430343, 0.0730466 , 0.10770775, 0.09644977, 0.09402496,
         0.1501687 , 0.08693582, 0.11468151, 0.08202416, 0.13789854,
         0.15401007, 0.13179242, 0.10571565, 0.07535757, 0.11217314,
         0.08344254, 0.12953502, 0.11960023, 0.08096164, 0.10003957,
         0.1349146 , 0.07069654, 0.12393875, 0.11991646, 0.07311337,
         0.14802576, 0.08886936, 0.15808299, 0.10448931, 0.15956584,
         0.03054326, 0.07428509, 0.12909765, 0.10421792, 0.14073832]])

In [16]:
# Rank all manuscripts and keep the matching keywords reported by the model;
# preview the first five entries.
manuscript_ranking = weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
manuscript_matching_keywords_list = manuscript_ranking.matching_keywords_list
pd.Series(manuscript_matching_keywords_list[:5])

0    [[(0.019536609419929147, cell), (0.01727282038...
1    [[(0.013918817133982535, model), (0.0056706106...
2    [[(0.02213734273213075, cell), (0.006324955066...
3    [[(0.020091850019105507, cell), (0.00930480724...
4    [[(0.019333377969194477, cell), (0.00617159934...
dtype: object

In [17]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        manuscript_matching_keywords_list: List[List[List[Tuple[float, str]]]],
        indices: List[T],
        threshold: float = 0.5
) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """Select and rank the candidates of every row by probability.

    Args:
        proba_matrix: one row per item (e.g. manuscript) holding one
            probability per candidate, in the same order as `indices`.
        manuscript_matching_keywords_list: for every row, one list per
            candidate of `(keyword_score, keyword)` pairs.
        indices: the candidate keys (e.g. editor names), one per column.
        threshold: candidates with a probability below this value are dropped.

    Returns:
        For every row, a list of
        `(probability, key, summed_keyword_score, matching_keywords)` tuples
        ordered by descending probability. Candidates with an equal
        probability keep their column order.
    """
    return [
        sorted(
            [
                (
                    probability,
                    key,
                    sum(score for score, _ in editor_matching_keywords),
                    editor_matching_keywords
                )
                for probability, key, editor_matching_keywords in zip(
                    row,
                    indices,
                    editors_matching_keywords
                )
                if probability >= threshold
            ],
            # Order by probability only: comparing whole tuples would fall back
            # to comparing keys and keyword lists on ties, which fails for
            # keys that cannot be ordered.
            key=lambda recommendation: recommendation[0],
            reverse=True
        )
        for row, editors_matching_keywords in zip(
            proba_matrix, manuscript_matching_keywords_list
        )
    ]


# Rank the editors for every manuscript by keyword similarity, keeping every
# editor that scores at least 0.001, and align the result with the manuscript rows.
recommended_editors_by_manuscript = get_recommended_editors_with_probability(
    keyword_similarity,
    manuscript_matching_keywords_list,
    editor_names,
    threshold=0.001
)
prediction_results_with_similarity = pd.Series(
    recommended_editors_by_manuscript,
    index=manuscript_version_for_recommendation_df.index
)
prediction_results_with_similarity[:5]

0    [(0.1606168958118253, Didier Stainier, 0.16061...
1    [(0.2043211299056196, Wendy Garrett, 0.2043211...
2    [(0.17322238822251937, Anna Akhmanova, 0.17322...
3    [(0.21450946931958897, Richard White, 0.214509...
4    [(0.16323005688041844, Richard Aldrich, 0.1632...
dtype: object

In [18]:
# Put each manuscript version id next to its ranked recommendations.
prediction_column_df = prediction_results_with_similarity.to_frame('prediction')
prediction_results_df = pd.concat(
    [
        manuscript_version_for_recommendation_df['version_id'],
        prediction_column_df,
    ],
    axis=1
)
print(len(prediction_results_df))
prediction_results_df.head()

965


Unnamed: 0,version_id,prediction
0,58953/2020-05-15T00:15:18Z,"[(0.1606168958118253, Didier Stainier, 0.16061..."
1,59230/2020-05-22T14:55:14Z,"[(0.2043211299056196, Wendy Garrett, 0.2043211..."
2,59121/2020-05-19T22:01:28Z,"[(0.17322238822251937, Anna Akhmanova, 0.17322..."
3,59654/2020-06-03T23:01:27Z,"[(0.21450946931958897, Richard White, 0.214509..."
4,59637/2020-06-03T12:55:15Z,"[(0.16323005688041844, Richard Aldrich, 0.1632..."


In [19]:
# Inspect the full recommendation list of the row with index label 0.
prediction_results_df['prediction'][0]

[(0.1606168958118253,
  'Didier Stainier',
  0.16061689581182514,
  [(0.024446497505420314, 'cell'),
   (0.011166452996539067, 'mouse'),
   (0.009727735079483291, 'cardiomyocyte'),
   (0.008970621409332092, 'model'),
   (0.008729799358074854, 'study'),
   (0.006710778043215778, 'cardiac'),
   (0.0065466524613925796, 'differentiation'),
   (0.0057624720883604425, 'level'),
   (0.005711991120531769, 'datum'),
   (0.005506362217766645, 'disease'),
   (0.00520955556726825, 'myocardial'),
   (0.005126736281429886, 'receptor'),
   (0.00457561897388751, 'effect'),
   (0.003969702994203859, 'high'),
   (0.0037135931236100626, 'potential'),
   (0.00336751521647227, 'type'),
   (0.003114576890474496, 'treatment'),
   (0.0029741806068807526, 'rna'),
   (0.0026894636580491447, 'therapeutic'),
   (0.0024430992746955686, 'approach'),
   (0.0024430992746955686, 'strategy'),
   (0.0023768819110250535, 'generation'),
   (0.0023546967819871516, 'fibroblast'),
   (0.0019349744487684927, 'infarction'),
  

In [20]:
# Flatten to one row per (manuscript version, recommended editor), with the
# matching keywords expressed as a list of {'score', 'keyword'} records.
prediction_results_flat_df = pd.DataFrame([
    {
        'version_id': version_id,
        'score': score,
        'name': editor_name,
        'person_id': editor_person_id_by_name_map[editor_name],
        'matching_keyword_score': matching_keyword_score,
        'matching_keywords': [
            {'score': keyword_score, 'keyword': keyword}
            for keyword_score, keyword in matching_keywords
        ],
    }
    for version_id, predicted_editors in zip(
        prediction_results_df['version_id'],
        prediction_results_df['prediction']
    )
    for score, editor_name, matching_keyword_score, matching_keywords in predicted_editors
])
print(len(prediction_results_flat_df))
prediction_results_flat_df.head()

62722


Unnamed: 0,version_id,score,name,person_id,matching_keyword_score,matching_keywords
0,58953/2020-05-15T00:15:18Z,0.160617,Didier Stainier,7189,0.160617,"[{'score': 0.024446497505420314, 'keyword': 'c..."
1,58953/2020-05-15T00:15:18Z,0.159566,Tadatsugu Taniguchi,1026,0.159566,"[{'score': 0.025637892365621105, 'keyword': 'c..."
2,58953/2020-05-15T00:15:18Z,0.158173,Huda Zoghbi,1029,0.158173,"[{'score': 0.01727056818035229, 'keyword': 'mo..."
3,58953/2020-05-15T00:15:18Z,0.158083,Satyajit Rath,4853,0.158083,"[{'score': 0.024451134240624492, 'keyword': 'c..."
4,58953/2020-05-15T00:15:18Z,0.15401,Matthias Barton,97155,0.15401,"[{'score': 0.020906497768717693, 'keyword': 'm..."


In [21]:
# Number of distinct manuscript versions that received at least one
# recommendation; compare with the row count of `prediction_results_df`.
prediction_results_flat_df['version_id'].nunique()

965

In [22]:
# Column-wise maxima as a quick sanity check of the flattened output.
# `matching_keywords` holds lists of dicts, which cannot be ordered: older
# pandas versions silently dropped such columns from reductions, newer
# versions raise instead, so the column is excluded explicitly.
prediction_results_flat_df.drop(columns=['matching_keywords']).max()

version_id                59773/2020-06-08T03:01:29Z
score                                       0.540389
name                                   Wendy Garrett
person_id                                      97155
matching_keyword_score                      0.540389
dtype: object

In [23]:
# Write the flattened recommendations to the output table.
# NOTE(review): `if_exists='replace'` presumably overwrites the existing table
# on every run — confirm against the `to_gbq` helper.
print('writing to:', recommendation_output_table_name)
to_gbq(
    prediction_results_flat_df,
    recommendation_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')

writing to: de_dev.data_science_editor_recommendation
done
