In [1]:
# Notebook parameters: plain constants consumed by the cells below.
# GCP project passed to the BigQuery read helper and to the final table write.
project_id = 'elife-data-pipeline'
# Dataset substituted into the source SQL template.
source_dataset = 'prod'
# Dataset and table-name prefix used to build the output table name.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# NOTE(review): the two *_tf values are not referenced anywhere else in the
# visible notebook; presumably kept for parity with a sibling notebook — confirm.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location from which the serialized senior editor model is loaded.
state_path = 's3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev'

In [2]:
import os
from functools import partial
from itertools import groupby
from typing import List, Tuple, TypeVar

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.misc import identity_fn
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query,
)

In [3]:
# state_path is an s3:// URL rather than a local file system path, so the
# location is joined with an explicit '/' instead of os.path.join (which would
# use the platform separator, i.e. a backslash on Windows).
model_path = '/'.join([state_path.rstrip('/'), 'senior_editor_model.joblib'])
# Dataset-qualified name of the table the recommendations are written to.
recommendation_output_table_name = '{output_dataset}.{prefix}{suffix}'.format(
    output_dataset=output_dataset,
    prefix=output_table_prefix,
    suffix='editor_recommendation'
)

In [4]:
# Load the serialized model dictionary and display the keys it provides.
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/keyword-extraction/state-dev/senior_editor_model.joblib


dict_keys(['editor_tf_idf_vectorizer', 'doc_length_weighted_editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [5]:
# Vectorizer stored with the model; used below to transform manuscript keywords.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f372e471ef0>)

In [6]:
# Editor TF-IDF matrix stored with the model
# (presumably one row per editor and one column per vectorizer feature — confirm).
doc_length_weighted_editor_tf_idf = model_dict['doc_length_weighted_editor_tf_idf']
doc_length_weighted_editor_tf_idf

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00139157, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00150745, 0.00109185, 0.        , ..., 0.        , 0.00292359,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.01753493,
         0.        ],
        [0.        , 0.0010018 , 0.        , ..., 0.00068432, 0.00076642,
         0.        ],
        [0.00065092, 0.00141439, 0.        , ..., 0.        , 0.00216414,
         0.        ]])

In [7]:
# Editor names stored with the model; later used as the labels of the similarity columns.
editor_names = model_dict['editor_names']
editor_names

0           Aleksandra Walczak
1                  Andrew King
2               Anna Akhmanova
3     Barbara Shinn-Cunningham
4                Carla Rothlin
                ...           
60                 Tamar Makin
61             Timothy Behrens
62              Utpal Banerjee
63              Vivek Malhotra
64               Wendy Garrett
Name: name, Length: 65, dtype: object

In [8]:
# Editor person ids stored with the model; zipped with editor_names below.
editor_person_ids = model_dict['editor_person_ids']
editor_person_ids

0     50904
1     14601
2      8518
3     19576
4     44396
      ...  
60    18331
61     1044
62     1042
63     1133
64    28627
Name: person_id, Length: 65, dtype: object

In [9]:
# Lookup from editor name to person id, pairing the two series by position.
# NOTE(review): if two editors ever share the same name, the later one silently
# overwrites the earlier one here — confirm names are unique in the model.
editor_person_id_by_name_map = dict(zip(editor_names, editor_person_ids))
editor_person_id_by_name_map

{'Aleksandra Walczak': '50904',
 'Andrew King': '14601',
 'Anna Akhmanova': '8518',
 'Barbara Shinn-Cunningham': '19576',
 'Carla Rothlin': '44396',
 'Catherine Dulac': '1014',
 'Chris Baker': '28129',
 'Christian Büchel': '16197',
 'Christian Hardtke': '1102',
 'Christian Rutz': '15332',
 'Clifford Rosen': '48282',
 'Cynthia Wolberger': '5005',
 'David Ron': '1174',
 'Detlef Weigel': '1030',
 'Didier Stainier': '7189',
 'Diethard Tautz': '1191',
 'Dominique Soldati-Favre': '41647',
 'Eduardo Franco': '1086',
 'Edward Morrisey': '55645',
 'Floris de Lange': '28130',
 'Gary Westbrook': '1202',
 'George Perry': '42011',
 'Gisela Storz': '1188',
 'Huda Zoghbi': '1029',
 'James Manley': '1020',
 'Jessica Tyler': '1421',
 'John Huguenard': '13947',
 'John Kuriyan': '1018',
 'Jonathan Cooper': '1062',
 'Jos van der Meer': '41027',
 'Joshua Gold': '17965',
 'José Faraldo-Gómez': '13987',
 'K VijayRaghavan': '1027',
 'Karla Kirkegaard': '3645',
 'Kate Wassum': '33127',
 'Kathryn Cheah': '91149

In [10]:
# Bind the project id once so that queries below only need to pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [11]:
# Values substituted into the SQL templates via str.format.
default_query_props = dict(project=project_id, dataset=source_dataset)

In [12]:
# Load the manuscript versions (version_id + extracted_keywords) to recommend
# senior editors for; the SQL template is filled with the project and dataset.
manuscript_version_for_recommendation_df = read_big_query(
    get_sql('manuscript-version-initial-submissions-for-senior-editor-recommendation.sql').format(
        **default_query_props
    )
)
manuscript_version_for_recommendation_df.head()

> ```sql
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     LPAD(CAST(manuscript_abstract_keywords.manuscript_id AS STRING), 5, '0') AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.prod.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.prod.mv_manuscript_version` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND ARRAY_LENGTH(version.senior_editors) = 0
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
> ```

Downloading: 100%|██████████| 135/135 [00:01<00:00, 111.25rows/s]


Unnamed: 0,version_id,extracted_keywords
0,441048/2020-03-04T16:55:14Z,"[acquisition, animal, consolidation, cortex, e..."
1,49311/2019-06-13T15:00:47Z,"[affinity, agonist, amplitude, assembly, biase..."
2,49781/2019-06-28T14:30:11Z,"[2018 meeting, 68 recommendation, action, acti..."
3,50008/2019-07-08T04:30:37Z,"[available, brand, cenforce, cenforce(generic,..."
4,50082/2019-07-10T12:00:50Z,"[15-29 age range, 2015-2017 epidemic, abnormal..."


In [13]:
# Cosine similarity between every manuscript's keyword TF-IDF vector and every
# row of the editor TF-IDF matrix (result: manuscripts x editor rows).
keyword_similarity = cosine_similarity(
    editor_tf_idf_vectorizer.transform(
        manuscript_version_for_recommendation_df
        ['extracted_keywords']
    ),
    doc_length_weighted_editor_tf_idf
)
# Highest similarity across all manuscript / editor pairs, as a quick sanity check.
print(keyword_similarity.max())
keyword_similarity

0.3355734086321363


array([[0.08405994, 0.14742345, 0.06909751, ..., 0.05710582, 0.05775535,
        0.06756911],
       [0.11588705, 0.09434875, 0.12685097, ..., 0.11780483, 0.12765585,
        0.11149259],
       [0.07161397, 0.04634709, 0.04978881, ..., 0.03773747, 0.04670254,
        0.04851046],
       ...,
       [0.0872426 , 0.05539434, 0.10876117, ..., 0.07425155, 0.10561886,
        0.07420016],
       [0.10689867, 0.07306304, 0.18234542, ..., 0.20835954, 0.16439594,
        0.12621397],
       [0.05188661, 0.0316176 , 0.03079543, ..., 0.02175103, 0.02470023,
        0.02204759]])

In [14]:
# Inspect the dimensions of the editor TF-IDF matrix.
doc_length_weighted_editor_tf_idf.shape

(65, 20373)

In [15]:
# Inspect the editor TF-IDF matrix values.
doc_length_weighted_editor_tf_idf

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00139157, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00150745, 0.00109185, 0.        , ..., 0.        , 0.00292359,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.01753493,
         0.        ],
        [0.        , 0.0010018 , 0.        , ..., 0.00068432, 0.00076642,
         0.        ],
        [0.00065092, 0.00141439, 0.        , ..., 0.        , 0.00216414,
         0.        ]])

In [16]:
# Inspect the transformed keyword vector of the first manuscript.
editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df
    ['extracted_keywords']
)[0]

<1x20373 sparse matrix of type '<class 'numpy.float64'>'
	with 43 stored elements in Compressed Sparse Row format>

In [17]:
def get_single_manuscript_matching_keywords(
        manuscript_keywords,
        editor_tf_idf=None):
    """
    List, per editor row, the keywords a manuscript shares with that editor.

    The manuscript keywords are transformed with the module-level
    `editor_tf_idf_vectorizer` and multiplied element-wise with every row of
    `editor_tf_idf`. Every non-zero product is reported as a matching keyword.

    Args:
        manuscript_keywords: the keywords of a single manuscript, in the form
            `editor_tf_idf_vectorizer.transform` accepts for one document.
        editor_tf_idf: dense 2-D matrix or array with one row per editor and
            one column per vectorizer feature. It is required; the `None`
            default only exists to keep the signature unchanged.

    Returns:
        A list with one entry per row of `editor_tf_idf`. Each entry is a list
        of `(score, keyword)` tuples in descending order (by score, then by
        keyword); the entry is empty if no keyword matches.

    Raises:
        ValueError: if `editor_tf_idf` is not provided.
    """
    if editor_tf_idf is None:
        raise ValueError('editor_tf_idf is required')
    manuscript_tf_idf = editor_tf_idf_vectorizer.transform(
        [manuscript_keywords]
    )[0].todense()
    # Note: multiply is much faster with a dense matrix.
    # np.asarray drops the np.matrix wrapper, so that the indexing below yields
    # a plain 1-D array regardless of whether a matrix or an array was passed in.
    matching_keyword_scores = np.asarray(
        np.multiply(manuscript_tf_idf, editor_tf_idf)
    )
    editor_indices, feature_indices = np.nonzero(matching_keyword_scores)
    # get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement but keep working with older versions.
    get_feature_names = (
        getattr(editor_tf_idf_vectorizer, 'get_feature_names_out', None)
        or editor_tf_idf_vectorizer.get_feature_names
    )
    matching_keywords = np.asarray(get_feature_names())[feature_indices]
    scores = matching_keyword_scores[editor_indices, feature_indices]
    # Collect the matches per editor row; rows without a match stay empty.
    matches_by_editor = [[] for _ in range(matching_keyword_scores.shape[0])]
    for editor_index, score, keyword in zip(editor_indices, scores, matching_keywords):
        matches_by_editor[editor_index].append((score, keyword))
    return [
        sorted(editor_matches, reverse=True)
        for editor_matches in matches_by_editor
    ]


# Example: matching keywords of the first manuscript against the first three editor rows.
get_single_manuscript_matching_keywords(
    manuscript_version_for_recommendation_df
    ['extracted_keywords']
    [0],
    editor_tf_idf=doc_length_weighted_editor_tf_idf[:3]
)

[[(0.043129416405413776, 'result'),
  (0.03147281737692357, 'system'),
  (0.03005213302715085, 'new'),
  (0.02447885795982944, 'time'),
  (0.020981878251282376, 'effect'),
  (0.020639405127786357, 'simple'),
  (0.020435450458462578, 'network'),
  (0.017484898542735317, 'multiple'),
  (0.017355213421823106, 'specific'),
  (0.013597332432739823, 'spatial'),
  (0.01054745744365369, 'information'),
  (0.010490939125641188, 'increase'),
  (0.009616682568688273, 'previous'),
  (0.008450134564553744, 'memory'),
  (0.007910593082740268, 'animal'),
  (0.005828299514245105, 'form'),
  (0.004974992305208271, 'experience'),
  (0.004762939644873774, 'acquisition'),
  (0.0043393035507489665, 'period'),
  (0.004222094667287006, 'task'),
  (0.004222094667287006, 'food'),
  (0.0029839055257000167, 'learn'),
  (0.002719466486547965, 'knowledge'),
  (0.002111047333643503, 'cortex'),
  (0.0019796337151708056, 'trial'),
  (0.0016379629170098615, 'performance'),
  (0.001539000676292104, 'location')],
 [(0.0

In [18]:
def get_multiple_manuscript_matching_keywords(
        manuscript_keywords_list: pd.Series,
        editor_tf_idf=None):
    """
    Apply `get_single_manuscript_matching_keywords` to every manuscript.

    Args:
        manuscript_keywords_list: series holding the keywords of one
            manuscript per element.
        editor_tf_idf: forwarded unchanged to
            `get_single_manuscript_matching_keywords` for every manuscript.

    Returns:
        A series with the same index as the input, holding the per-manuscript
        result of `get_single_manuscript_matching_keywords`.
    """
    def _get_matching_keywords(manuscript_keywords):
        return get_single_manuscript_matching_keywords(
            manuscript_keywords,
            editor_tf_idf=editor_tf_idf
        )

    return manuscript_keywords_list.apply(_get_matching_keywords)


# Example: matching keywords of the first three manuscripts against all editor rows.
get_multiple_manuscript_matching_keywords(
    manuscript_version_for_recommendation_df['extracted_keywords'][:3],
    editor_tf_idf=doc_length_weighted_editor_tf_idf
)

0    [[(0.043129416405413776, result), (0.031472817...
1    [[(0.06701479445787284, cell), (0.032492060849...
2    [[(0.04805396907999008, mechanism), (0.0435488...
Name: extracted_keywords, dtype: object

In [19]:
# Matching keywords of every loaded manuscript against all editor rows;
# consumed below when assembling the recommendations.
manuscript_matching_keywords_ser = get_multiple_manuscript_matching_keywords(
    manuscript_version_for_recommendation_df['extracted_keywords'],
    editor_tf_idf=doc_length_weighted_editor_tf_idf
)
manuscript_matching_keywords_ser.head()

0    [[(0.043129416405413776, result), (0.031472817...
1    [[(0.06701479445787284, cell), (0.032492060849...
2    [[(0.04805396907999008, mechanism), (0.0435488...
3    [[(0.05580082217245639, type), (0.039314622447...
4    [[(0.07736308391824086, model), (0.04831781836...
Name: extracted_keywords, dtype: object

In [20]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        manuscript_matching_keywords_ser: pd.Series,
        indices: List[T],
        threshold: float = 0.5
        ) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """
    Rank, for every manuscript, the editors scoring at or above a threshold.

    Args:
        proba_matrix: one row per manuscript, one score per editor.
        manuscript_matching_keywords_ser: one element per manuscript (in the
            same order as `proba_matrix`); each element holds one list of
            `(keyword_score, keyword)` tuples per editor.
        indices: the editor labels, in the same order as the columns of
            `proba_matrix`.
        threshold: minimum score (inclusive) for an editor to be listed.

    Returns:
        One list per manuscript containing
        `(score, editor label, sum of matching keyword scores, matching keywords)`
        tuples, sorted in descending tuple order (i.e. highest score first).
    """
    recommendations = []
    for row, editors_matching_keywords in zip(
            proba_matrix, manuscript_matching_keywords_ser):
        candidates = [
            (
                p,
                key,
                sum(s for s, _ in editor_matching_keywords),
                editor_matching_keywords
            )
            for p, key, editor_matching_keywords in zip(
                row,
                indices,
                editors_matching_keywords
            )
            if p >= threshold
        ]
        # sorting the whole tuple in reverse puts the highest score first
        candidates.sort(reverse=True)
        recommendations.append(candidates)
    return recommendations


# Rank the editors per manuscript using the cosine similarity as the score.
# The low threshold keeps every editor with a similarity of at least 0.001.
prediction_results_with_similarity = pd.Series(
    get_recommended_editors_with_probability(
        keyword_similarity,
        manuscript_matching_keywords_ser,
        editor_names,
        threshold=0.001
    ),
    index=manuscript_version_for_recommendation_df.index
)
# Show the ranked predictions of the first five manuscripts.
prediction_results_with_similarity[:5]

0    [(0.23447781188397485, Laura Colgin, 0.2676738...
1    [(0.17610076153423337, Satyajit Rath, 0.261375...
2    [(0.08738299955933337, Christian Büchel, 0.208...
3    [(0.0502403906192875, Clifford Rosen, 0.190444...
4    [(0.2464676806837211, Neil Ferguson, 0.6794412...
dtype: object

In [21]:
# Pair every manuscript version id with its ranked editor predictions
# (both share the index of the loaded manuscript versions).
prediction_results_df = (
    manuscript_version_for_recommendation_df[['version_id']]
    .assign(prediction=prediction_results_with_similarity)
)
print(len(prediction_results_df))
prediction_results_df.head()

135


Unnamed: 0,version_id,prediction
0,441048/2020-03-04T16:55:14Z,"[(0.23447781188397485, Laura Colgin, 0.2676738..."
1,49311/2019-06-13T15:00:47Z,"[(0.17610076153423337, Satyajit Rath, 0.261375..."
2,49781/2019-06-28T14:30:11Z,"[(0.08738299955933337, Christian Büchel, 0.208..."
3,50008/2019-07-08T04:30:37Z,"[(0.0502403906192875, Clifford Rosen, 0.190444..."
4,50082/2019-07-10T12:00:50Z,"[(0.2464676806837211, Neil Ferguson, 0.6794412..."


In [22]:
# Inspect the full ranked prediction of the first manuscript.
prediction_results_df['prediction'][0]

[(0.23447781188397485,
  'Laura Colgin',
  0.26767388220509436,
  [(0.0235380639586542, 'cortex'),
   (0.021572797850357733, 'memory'),
   (0.01704480493557718, 'task'),
   (0.015081690477659426, 'prefrontal'),
   (0.01389339300942232, 'result'),
   (0.012682330628940884, 'prefrontal cortex'),
   (0.011073998990878268, 'consolidation'),
   (0.010560387503231052, 'spatial'),
   (0.010138239815412106, 'information'),
   (0.009705755757957905, 'network'),
   (0.00959743939487418, 'navigation'),
   (0.006694763309097865, 'experience'),
   (0.006381823762714771, 'session'),
   (0.006364070346251514, 'time'),
   (0.006103382086770546, 'maze'),
   (0.006095165965423983, 'increase'),
   (0.006082943889247263, 'animal'),
   (0.005919795284542729, 'performance'),
   (0.005814823317043464, 'specific'),
   (0.005646991997378103, 'effect'),
   (0.005506822467685972, 'learn'),
   (0.005197993684957203, 'rodent'),
   (0.0046713461850322645, 'navigation task'),
   (0.004436916917923614, 'new'),
   (0.

In [23]:
# Flatten the predictions to one row per (manuscript version, recommended editor).
# Each prediction is a (score, name, matching keyword score, matching keywords) tuple.
prediction_results_flat_df = pd.DataFrame(
    [
        {
            'version_id': row.version_id,
            'score': score,
            'name': name,
            'person_id': editor_person_id_by_name_map[name],
            'matching_keyword_score': matching_keyword_score,
            'matching_keywords': [
                {'score': keyword_score, 'keyword': keyword}
                for keyword_score, keyword in matching_keywords
            ],
        }
        for row in prediction_results_df.itertuples()
        for score, name, matching_keyword_score, matching_keywords in row.prediction
    ],
    # Explicit columns keep the frame's schema stable even when there is not a
    # single recommendation (otherwise the frame would have no columns at all).
    columns=[
        'version_id',
        'score',
        'name',
        'person_id',
        'matching_keyword_score',
        'matching_keywords',
    ]
)
print(len(prediction_results_flat_df))
prediction_results_flat_df.head()

8639


Unnamed: 0,version_id,score,name,person_id,matching_keyword_score,matching_keywords
0,441048/2020-03-04T16:55:14Z,0.234478,Laura Colgin,16452,0.267674,"[{'score': 0.0235380639586542, 'keyword': 'cor..."
1,441048/2020-03-04T16:55:14Z,0.228894,Michael Frank,24218,0.196782,"[{'score': 0.019729466116625585, 'keyword': 'c..."
2,441048/2020-03-04T16:55:14Z,0.210788,Timothy Behrens,1044,0.088202,"[{'score': 0.009204529615972257, 'keyword': 'c..."
3,441048/2020-03-04T16:55:14Z,0.190183,Richard Ivry,30815,0.142855,"[{'score': 0.016977865055845157, 'keyword': 't..."
4,441048/2020-03-04T16:55:14Z,0.189951,Joshua Gold,17965,0.24714,"[{'score': 0.030679890185112378, 'keyword': 'c..."


In [24]:
# Number of distinct manuscript versions that received at least one recommendation.
prediction_results_flat_df['version_id'].nunique()

134

In [26]:
# Write the flattened recommendations to BigQuery; if_exists='replace' is
# passed so that an existing table is replaced rather than appended to.
print('writing to:', recommendation_output_table_name)
to_gbq(
    prediction_results_flat_df,
    recommendation_output_table_name,
    project_id=project_id,
    if_exists='replace'
)
print('done')

writing to: de_dev.data_science_editor_recommendation
done
