In [1]:
# Notebook parameters.
# BigQuery project: bound into read_big_query, substituted into the SQL
# template and passed to to_gbq further below.
project_id = 'elife-data-pipeline'
# Dataset substituted into the SQL template of the recommendation query.
source_dataset = 'prod'
# Dataset and table name prefix used to build the output table name.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# NOTE(review): manuscript_min_tf / manuscript_max_tf are not referenced by any
# cell visible in this notebook - presumably kept for parity with the notebook
# that trains the model; TODO confirm before removing.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location of the persisted state; the model file name is joined onto it below.
state_path = 's3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev'

In [2]:
import os
from functools import partial
from itertools import groupby
from typing import List, Tuple, TypeVar

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# Output table: <output_dataset>.<output_table_prefix>editor_recommendation
recommendation_output_table_name = (
    '{output_dataset}.{prefix}{suffix}'
    .format(
        suffix='editor_recommendation',
        prefix=output_table_prefix,
        output_dataset=output_dataset
    )
)
# Serialized senior editor model, stored below the state path.
model_path = os.path.join(state_path, 'senior_editor_model.joblib')

In [4]:
# Load the persisted model dictionary and show which components it contains.
# NOTE(review): load_object_from is a project helper; given the '.joblib' file
# name it presumably unpickles the object - only point state_path at trusted
# locations. TODO confirm.
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev/senior_editor_model.joblib


dict_keys(['editor_tf_idf_vectorizer', 'doc_length_weighted_editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [5]:
# Vectorizer stored with the model; used below to vectorize the manuscript keywords
# into the same feature space as the editor TF-IDF matrix.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f62c6d5edd0>)

In [6]:
# Editor TF-IDF matrix stored with the model (displayed as an np.matrix).
# presumably one row per editor and one column per vectorizer feature - TODO confirm
doc_length_weighted_editor_tf_idf = model_dict['doc_length_weighted_editor_tf_idf']
doc_length_weighted_editor_tf_idf

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00139157, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00150745, 0.00109185, 0.        , ..., 0.        , 0.00292359,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.01753493,
         0.        ],
        [0.        , 0.0010018 , 0.        , ..., 0.00068432, 0.00076642,
         0.        ],
        [0.00065092, 0.00141439, 0.        , ..., 0.        , 0.00216414,
         0.        ]])

In [7]:
# Editor names stored with the model; used below as the labels of the similarity columns.
editor_names = model_dict['editor_names']
editor_names

0           Aleksandra Walczak
1                  Andrew King
2               Anna Akhmanova
3     Barbara Shinn-Cunningham
4                Carla Rothlin
                ...           
60                 Tamar Makin
61             Timothy Behrens
62              Utpal Banerjee
63              Vivek Malhotra
64               Wendy Garrett
Name: name, Length: 65, dtype: object

In [8]:
# Editor person ids stored with the model; zipped with editor_names below,
# so both are expected to be in the same order.
editor_person_ids = model_dict['editor_person_ids']
editor_person_ids

0     50904
1     14601
2      8518
3     19576
4     44396
      ...  
60    18331
61     1044
62     1042
63     1133
64    28627
Name: person_id, Length: 65, dtype: object

In [9]:
# Lookup from editor name to person id.
# Note: editors sharing the same name would overwrite each other (last one wins).
editor_person_id_by_name_map = {
    editor_name: editor_person_id
    for editor_name, editor_person_id in zip(editor_names, editor_person_ids)
}
editor_person_id_by_name_map

{'Aleksandra Walczak': '50904',
 'Andrew King': '14601',
 'Anna Akhmanova': '8518',
 'Barbara Shinn-Cunningham': '19576',
 'Carla Rothlin': '44396',
 'Catherine Dulac': '1014',
 'Chris Baker': '28129',
 'Christian Büchel': '16197',
 'Christian Hardtke': '1102',
 'Christian Rutz': '15332',
 'Clifford Rosen': '48282',
 'Cynthia Wolberger': '5005',
 'David Ron': '1174',
 'Detlef Weigel': '1030',
 'Didier Stainier': '7189',
 'Diethard Tautz': '1191',
 'Dominique Soldati-Favre': '41647',
 'Eduardo Franco': '1086',
 'Edward Morrisey': '55645',
 'Floris de Lange': '28130',
 'Gary Westbrook': '1202',
 'George Perry': '42011',
 'Gisela Storz': '1188',
 'Huda Zoghbi': '1029',
 'James Manley': '1020',
 'Jessica Tyler': '1421',
 'John Huguenard': '13947',
 'John Kuriyan': '1018',
 'Jonathan Cooper': '1062',
 'Jos van der Meer': '41027',
 'Joshua Gold': '17965',
 'José Faraldo-Gómez': '13987',
 'K VijayRaghavan': '1027',
 'Karla Kirkegaard': '3645',
 'Kate Wassum': '33127',
 'Kathryn Cheah': '91149

In [10]:
# Bind the project id once so that the query cells only need to pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [11]:
# Placeholders substituted into the SQL templates.
default_query_props = {
    'project': project_id,
    'dataset': source_dataset,
}

In [12]:
# Load the manuscript versions to recommend senior editors for.
# According to the query echoed below, this selects version_id and
# extracted_keywords of 'Initial Submission' versions without senior editors
# that were created less than 365 days ago.
manuscript_version_for_recommendation_df = read_big_query(
    get_sql('manuscript-version-initial-submissions-for-senior-editor-recommendation.sql').format(
        **default_query_props
    )
)
manuscript_version_for_recommendation_df.head()

> ```sql
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     LPAD(CAST(manuscript_abstract_keywords.manuscript_id AS STRING), 5, '0') AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.prod.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.prod.mv_manuscript_version` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND ARRAY_LENGTH(version.senior_editors) = 0
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
> ```

Downloading: 100%|██████████| 135/135 [00:01<00:00, 117.10rows/s]


Unnamed: 0,version_id,extracted_keywords
0,49069/2019-06-05T12:39:51Z,"[1 expression level, accurate, accurate model,..."
1,49224/2019-06-11T08:05:08Z,"[activity, association, bacteria, charge, char..."
2,49391/2019-06-17T03:54:44Z,"[4.1 protein, associate, associated protein, b..."
3,50111/2019-07-11T05:25:04Z,"[activity, axis, basal, basal expression, basa..."
4,51945/2019-09-17T10:10:25Z,"[activation, activation signaling, adhesion, a..."


In [13]:
# Cosine similarity between the TF-IDF vector of each manuscript's keywords
# (rows) and the document length weighted TF-IDF vector of each editor (columns).
keyword_similarity = cosine_similarity(
    editor_tf_idf_vectorizer.transform(
        manuscript_version_for_recommendation_df['extracted_keywords']
    ),
    # The stored editor TF-IDF is an np.matrix, which scikit-learn deprecated
    # as an input in 1.0 and rejects with a TypeError from 1.2.
    # Passing a plain ndarray yields the same similarities on every version.
    np.asarray(doc_length_weighted_editor_tf_idf)
)
print(keyword_similarity.max())
keyword_similarity

0.3355734086321363


array([[0.17789009, 0.09409137, 0.13246859, ..., 0.11589037, 0.11555229,
        0.12619189],
       [0.07523874, 0.06317391, 0.10638405, ..., 0.07578461, 0.1125783 ,
        0.1178674 ],
       [0.07911117, 0.05118664, 0.16084152, ..., 0.09419436, 0.18452799,
        0.11085467],
       ...,
       [0.07797898, 0.08630291, 0.12348528, ..., 0.13660572, 0.11095575,
        0.10314962],
       [0.08109075, 0.07823628, 0.08013092, ..., 0.07684354, 0.07144866,
        0.10282327],
       [0.12926034, 0.07417636, 0.20160478, ..., 0.17831767, 0.19024005,
        0.12596947]])

In [14]:
# Dimensions of the editor TF-IDF matrix.
doc_length_weighted_editor_tf_idf.shape

(65, 20373)

In [15]:
# Display the editor TF-IDF matrix (same object as loaded from the model above).
doc_length_weighted_editor_tf_idf

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00139157, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00150745, 0.00109185, 0.        , ..., 0.        , 0.00292359,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.01753493,
         0.        ],
        [0.        , 0.0010018 , 0.        , ..., 0.00068432, 0.00076642,
         0.        ],
        [0.00065092, 0.00141439, 0.        , ..., 0.        , 0.00216414,
         0.        ]])

In [16]:
# Inspect the vectorized keywords of the first manuscript (a single sparse row).
editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df
    ['extracted_keywords']
)[0]

<1x20373 sparse matrix of type '<class 'numpy.float64'>'
	with 68 stored elements in Compressed Sparse Row format>

In [17]:
def get_single_manuscript_matching_keywords(
        manuscript_keywords,
        editor_tf_idf=None):
    """
    Find, for every editor, the keywords shared with a single manuscript.

    The manuscript keywords are vectorized using the module-level
    `editor_tf_idf_vectorizer` and multiplied element-wise with every row of
    `editor_tf_idf`. Every non-zero product is a keyword that has a weight
    for both, the manuscript and the editor.

    Args:
        manuscript_keywords: the keywords of one manuscript, i.e. a single
            document as accepted by `editor_tf_idf_vectorizer.transform`.
        editor_tf_idf: 2D `np.matrix` or `np.ndarray` with one row per editor
            and one column per vectorizer feature.
            Required, despite the `None` default.

    Returns:
        A list with one item per row of `editor_tf_idf`. Each item is a list of
        `(score, keyword)` tuples, sorted by descending score
        (an empty list if no keyword matches).

    Raises:
        TypeError: if `editor_tf_idf` was not provided.
    """
    if editor_tf_idf is None:
        raise TypeError('editor_tf_idf is required')
    # Work on plain ndarrays rather than relying on np.matrix indexing semantics
    editor_tf_idf_array = np.asarray(editor_tf_idf)
    # Note: multiply is much faster with a dense matrix
    manuscript_tf_idf_array = editor_tf_idf_vectorizer.transform(
        [manuscript_keywords]
    ).toarray()
    matching_keyword_array = manuscript_tf_idf_array * editor_tf_idf_array
    # np.nonzero returns the indices in row-major order (grouped by editor)
    editor_indices, feature_indices = np.nonzero(matching_keyword_array)
    values = matching_keyword_array[editor_indices, feature_indices]
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # fall back to it only where get_feature_names_out() isn't available yet
    get_feature_names_fn = getattr(
        editor_tf_idf_vectorizer, 'get_feature_names_out', None
    )
    if get_feature_names_fn is None:
        get_feature_names_fn = editor_tf_idf_vectorizer.get_feature_names
    flat_matching_keywords = np.asarray(get_feature_names_fn())[feature_indices]
    matching_keywords_by_editor = [
        [] for _ in range(editor_tf_idf_array.shape[0])
    ]
    for editor_index, value, keyword in zip(
            editor_indices, values, flat_matching_keywords):
        matching_keywords_by_editor[editor_index].append((value, keyword))
    return [
        sorted(editor_matching_keywords, reverse=True)
        for editor_matching_keywords in matching_keywords_by_editor
    ]


# Example: matching keywords of the first manuscript against the first three
# rows of the editor TF-IDF matrix.
get_single_manuscript_matching_keywords(
    manuscript_version_for_recommendation_df
    ['extracted_keywords']
    [0],
    editor_tf_idf=doc_length_weighted_editor_tf_idf[:3]
)

[[(0.0756555584874029, 'cell'),
  (0.07038887664035, 'model'),
  (0.04396201371565304, 'dynamic'),
  (0.036736079765009436, 'mathematical'),
  (0.019503202446439334, 'prediction'),
  (0.017890862827622096, 'concentration'),
  (0.01784506731727183, 'response'),
  (0.017798433958278514, 'shape'),
  (0.017380331003862826, 'type'),
  (0.017380331003862826, 'network'),
  (0.016853674688534506, 'level'),
  (0.016357958591870895, 'single'),
  (0.01619032545934394, 'molecular'),
  (0.015862282059797183, 'role'),
  (0.01558569531263056, 'mathematical modeling'),
  (0.0148532105229503, 'diversity'),
  (0.01476059236017749, 'expression'),
  (0.01434209251136987, 'cell dynamic'),
  (0.013348825468708882, 'cancer'),
  (0.01304655280343508, 'biochemical'),
  (0.01288810417358521, 'factor'),
  (0.012268468943903174, 'disease'),
  (0.012245359921669812, 'strength'),
  (0.010938696482440067, 'give'),
  (0.00948895223154267, 'treatment'),
  (0.009251614548196537, 'stress'),
  (0.008787206282595359, 'sho

In [18]:
def get_multiple_manuscript_matching_keywords(
        manuscript_keywords_list: pd.Series,
        editor_tf_idf=None):
    """
    Apply `get_single_manuscript_matching_keywords` to every manuscript.

    Args:
        manuscript_keywords_list: series with the keywords of one manuscript per item.
        editor_tf_idf: passed on unchanged for every manuscript.

    Returns:
        A series with the same index, holding the per-editor matching keywords
        of each manuscript.
    """
    def _get_matching_keywords(manuscript_keywords):
        return get_single_manuscript_matching_keywords(
            manuscript_keywords,
            editor_tf_idf=editor_tf_idf
        )

    return manuscript_keywords_list.apply(_get_matching_keywords)


# Example: matching keywords of the first three manuscripts against all editors.
get_multiple_manuscript_matching_keywords(
    manuscript_version_for_recommendation_df['extracted_keywords'].iloc[:3],
    editor_tf_idf=doc_length_weighted_editor_tf_idf
)

0    [[(0.0756555584874029, cell), (0.0703888766403...
1    [[(0.0362739087978428, mechanism), (0.03287319...
2    [[(0.10036800660091151, cell), (0.048663334778...
Name: extracted_keywords, dtype: object

In [19]:
# Matching keywords of every manuscript against every editor.
manuscript_matching_keywords_ser = get_multiple_manuscript_matching_keywords(
    manuscript_keywords_list=(
        manuscript_version_for_recommendation_df['extracted_keywords']
    ),
    editor_tf_idf=doc_length_weighted_editor_tf_idf
)
manuscript_matching_keywords_ser.head(5)

0    [[(0.0756555584874029, cell), (0.0703888766403...
1    [[(0.0362739087978428, mechanism), (0.03287319...
2    [[(0.10036800660091151, cell), (0.048663334778...
3    [[(0.08458135803750542, cell), (0.041009192884...
4    [[(0.07047924030283395, model), (0.03672861818...
Name: extracted_keywords, dtype: object

In [20]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        manuscript_matching_keywords_ser: pd.Series,
        indices: List[T],
        threshold: float = 0.5
) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """
    Select and rank the editors of each manuscript by their score.

    Args:
        proba_matrix: one row per manuscript, one score per editor.
        manuscript_matching_keywords_ser: for each manuscript, a list with one
            item per editor, each being a list of `(score, keyword)` tuples.
        indices: the label of each editor, in the column order of `proba_matrix`.
        threshold: editors with a score below this value are dropped.

    Returns:
        For each manuscript, a list of
        `(score, label, sum of matching keyword scores, matching keywords)`
        tuples, sorted in descending order (by score first; ties are resolved
        by the remaining tuple items).
    """
    return [
        sorted(
            (
                (
                    score,
                    label,
                    sum(keyword_score for keyword_score, _ in editor_matching_keywords),
                    editor_matching_keywords
                )
                for score, label, editor_matching_keywords in zip(
                    row,
                    indices,
                    editors_matching_keywords
                )
                if score >= threshold
            ),
            reverse=True
        )
        for row, editors_matching_keywords in zip(
            proba_matrix, manuscript_matching_keywords_ser
        )
    ]


# Rank the editors of every manuscript by keyword similarity,
# dropping editors with a similarity below the threshold.
prediction_results_with_similarity = pd.Series(
    data=get_recommended_editors_with_probability(
        proba_matrix=keyword_similarity,
        manuscript_matching_keywords_ser=manuscript_matching_keywords_ser,
        indices=editor_names,
        threshold=0.001
    ),
    index=manuscript_version_for_recommendation_df.index
)
prediction_results_with_similarity.head(5)

0    [(0.20284825072707918, Naama Barkai, 0.1192771...
1    [(0.16296864597421715, Philip Cole, 0.16820940...
2    [(0.2638693234976934, Dominique Soldati-Favre,...
3    [(0.2533793660157667, Maureen Murphy, 0.792941...
4    [(0.2237822437909484, Olga Boudker, 0.56505807...
dtype: object

In [21]:
# Put the ranked editors next to the version id of their manuscript
# (both share the index of the manuscript data frame).
prediction_results_df = (
    manuscript_version_for_recommendation_df[['version_id']]
    .assign(prediction=prediction_results_with_similarity)
)
print(len(prediction_results_df))
prediction_results_df.head()

135


Unnamed: 0,version_id,prediction
0,49069/2019-06-05T12:39:51Z,"[(0.20284825072707918, Naama Barkai, 0.1192771..."
1,49224/2019-06-11T08:05:08Z,"[(0.16296864597421715, Philip Cole, 0.16820940..."
2,49391/2019-06-17T03:54:44Z,"[(0.2638693234976934, Dominique Soldati-Favre,..."
3,50111/2019-07-11T05:25:04Z,"[(0.2533793660157667, Maureen Murphy, 0.792941..."
4,51945/2019-09-17T10:10:25Z,"[(0.2237822437909484, Olga Boudker, 0.56505807..."


In [22]:
# Inspect the ranked editors (with their matching keywords) of the first manuscript.
prediction_results_df['prediction'][0]

[(0.20284825072707918,
  'Naama Barkai',
  0.1192771280840941,
  [(0.013144729364537961, 'cell'),
   (0.009290178935014487, 'model'),
   (0.005787192236089882, 'expression'),
   (0.005586509979928634, 'dynamic'),
   (0.0052831700715162185, 'network'),
   (0.004436734530969979, 'level'),
   (0.00389746293052059, 'factor'),
   (0.0038239258940956727, 'response'),
   (0.0037503888576707564, 'role'),
   (0.0034884022463496462, 'molecular'),
   (0.0033114606668355243, 'single'),
   (0.0031033774739564037, 'mathematical'),
   (0.0029704671073837257, 'cancer'),
   (0.0025144322407529675, 'prediction'),
   (0.0025025542444024195, 'type'),
   (0.0023732572659591862, 'stress'),
   (0.0019463598882635178, 'concentration'),
   (0.0019157618893887406, 'measurement'),
   (0.0018575717976718796, 'variation'),
   (0.0018166112042671632, 'phenotypic'),
   (0.001769482799072418, 'disease'),
   (0.0017104374775649095, 'heterogeneity'),
   (0.0015687901103982247, 'major'),
   (0.0015329625505020258, 'phen

In [23]:
# Flatten to one row per (manuscript version, recommended editor).
prediction_results_flat_df = pd.DataFrame([
    {
        'version_id': version_id,
        'score': score,
        'name': editor_name,
        'person_id': editor_person_id_by_name_map[editor_name],
        'matching_keyword_score': matching_keyword_score,
        'matching_keywords': [
            {'score': keyword_score, 'keyword': keyword}
            for keyword_score, keyword in matching_keywords
        ],
    }
    for version_id, predicted_editors in zip(
        prediction_results_df['version_id'],
        prediction_results_df['prediction']
    )
    for score, editor_name, matching_keyword_score, matching_keywords
    in predicted_editors
])
print(len(prediction_results_flat_df))
prediction_results_flat_df.head()

8639


Unnamed: 0,version_id,score,name,person_id,matching_keyword_score,matching_keywords
0,49069/2019-06-05T12:39:51Z,0.202848,Naama Barkai,1725,0.119277,"[{'score': 0.013144729364537961, 'keyword': 'c..."
1,49069/2019-06-05T12:39:51Z,0.17789,Aleksandra Walczak,50904,0.694047,"[{'score': 0.0756555584874029, 'keyword': 'cel..."
2,49069/2019-06-05T12:39:51Z,0.140364,Maureen Murphy,33764,0.439265,"[{'score': 0.08211727587213337, 'keyword': 'ca..."
3,49069/2019-06-05T12:39:51Z,0.138349,Detlef Weigel,1030,0.027471,"[{'score': 0.002278748665085694, 'keyword': 'e..."
4,49069/2019-06-05T12:39:51Z,0.137367,Philip Cole,21020,0.141784,"[{'score': 0.019289698639547295, 'keyword': 'c..."


In [24]:
# Replace the output table with all of the recommendations.
# (Previously only the first three rows were written, which looks like a
# debugging leftover: the table would be missing nearly all recommendations.)
print('writing to:', recommendation_output_table_name)
to_gbq(
    prediction_results_flat_df,
    recommendation_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')



writing to: de_dev.data_science_editor_recommendation
done
