In [1]:
# Notebook parameters (the values below are the development defaults).
# Project used for running queries and for writing the results.
project_id = 'elife-data-pipeline'
# Dataset passed to the SQL templates as `dataset`.
source_dataset = 'de_dev'
# Dataset and table-name prefix used to build the output table name.
output_dataset = 'de_dev'
output_table_prefix = 'data_science_'
# NOTE(review): manuscript_min_tf / manuscript_max_tf are not referenced by any
# cell visible in this notebook — presumably kept for parity with the notebook
# that trains the model; confirm before removing.
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# Base location of the persisted state; the model file is loaded from below it.
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [2]:
import os
from functools import partial
from typing import List, Tuple, T

import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

import data_science_pipeline.configure_warnings  #pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import to_gbq
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query
)
from data_science_pipeline.peerscout.models import (
    WeightedKeywordModel
)

In [3]:
# Location of the persisted senior editor model below the state path.
model_path = os.path.join(state_path, 'senior_editor_model.joblib')

# Fully qualified output table: <dataset>.<prefix>editor_recommendation
_output_table_name_parts = dict(
    output_dataset=output_dataset,
    prefix=output_table_prefix,
    suffix='editor_recommendation'
)
recommendation_output_table_name = (
    '{output_dataset}.{prefix}{suffix}'.format(**_output_table_name_parts)
)

In [4]:
# Load the persisted model bundle and list the entries it provides.
# NOTE(review): a `.joblib` file is presumably unpickled here — only point
# `state_path` at trusted locations; confirm against `load_object_from`.
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
model_dict.keys()

loading model from: s3://ci-elife-data-pipeline/airflow-config/data-science/state-dev/senior_editor_model.joblib


dict_keys(['editor_tf_idf_vectorizer', 'editor_tf_idf', 'editor_names', 'editor_person_ids'])

In [5]:
# Vectorizer stored with the model; reused below to transform the manuscript keywords.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']
editor_tf_idf_vectorizer

TfidfVectorizer(lowercase=False, smooth_idf=False, token_pattern=None,
                tokenizer=<function identity_fn at 0x7f126b840dd0>)

In [6]:
# Editor TF-IDF matrix stored with the model; manuscripts are compared against it below.
editor_tf_idf = model_dict['editor_tf_idf']
editor_tf_idf

<65x13658 sparse matrix of type '<class 'numpy.float64'>'
	with 218399 stored elements in Compressed Sparse Row format>

In [7]:
# Editor names stored with the model; used below as the choices/labels of the recommendations.
editor_names = model_dict['editor_names']
editor_names

0           Aleksandra Walczak
1                  Andrew King
2               Anna Akhmanova
3     Barbara Shinn-Cunningham
4                Carla Rothlin
                ...           
60                 Tamar Makin
61             Timothy Behrens
62              Utpal Banerjee
63              Vivek Malhotra
64               Wendy Garrett
Name: name, Length: 65, dtype: object

In [8]:
# Editor person ids stored with the model; paired by position with editor_names below.
editor_person_ids = model_dict['editor_person_ids']
editor_person_ids

0     50904
1     14601
2      8518
3     19576
4     44396
      ...  
60    18331
61     1044
62     1042
63     1133
64    28627
Name: person_id, Length: 65, dtype: object

In [9]:
# Lookup from editor name to person id (names and ids are paired by position).
editor_person_id_by_name_map = {
    editor_name: editor_person_id
    for editor_name, editor_person_id in zip(editor_names, editor_person_ids)
}
editor_person_id_by_name_map

{'Aleksandra Walczak': '50904',
 'Andrew King': '14601',
 'Anna Akhmanova': '8518',
 'Barbara Shinn-Cunningham': '19576',
 'Carla Rothlin': '44396',
 'Catherine Dulac': '1014',
 'Chris Baker': '28129',
 'Christian Büchel': '16197',
 'Christian Hardtke': '1102',
 'Christian Rutz': '15332',
 'Clifford Rosen': '48282',
 'Cynthia Wolberger': '5005',
 'David Ron': '1174',
 'Detlef Weigel': '1030',
 'Didier Stainier': '7189',
 'Diethard Tautz': '1191',
 'Dominique Soldati-Favre': '41647',
 'Eduardo Franco': '1086',
 'Edward Morrisey': '55645',
 'Floris de Lange': '28130',
 'Gary Westbrook': '1202',
 'George Perry': '42011',
 'Gisela Storz': '1188',
 'Huda Zoghbi': '1029',
 'James Manley': '1020',
 'Jessica Tyler': '1421',
 'John Huguenard': '13947',
 'John Kuriyan': '1018',
 'Jonathan Cooper': '1062',
 'Jos van der Meer': '41027',
 'Joshua Gold': '17965',
 'José Faraldo-Gómez': '13987',
 'K VijayRaghavan': '1027',
 'Karla Kirkegaard': '3645',
 'Kate Wassum': '33127',
 'Kathryn Cheah': '91149

In [10]:
# Build a keyword model from the stored editor TF-IDF matrix; it is used
# below to retrieve the matching keywords alongside the scores.
editor_tf_idf_dense = editor_tf_idf.todense()
weighted_keyword_valid_model = WeightedKeywordModel.from_tf_matrix(
    editor_tf_idf_dense,
    vectorizer=editor_tf_idf_vectorizer,
    choices=editor_names
)
weighted_keyword_valid_model

<data_science_pipeline.peerscout.models.WeightedKeywordModel at 0x7f126af9f9d0>

In [11]:
# Bind the project id once so that the queries below only need to pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [12]:
# Values substituted into the SQL templates via `str.format`.
default_query_props = {
    'project': project_id,
    'dataset': source_dataset
}

In [13]:
# Fetch the manuscript versions (with their extracted keywords) to recommend editors for.
recommendation_sql = get_sql(
    'manuscript-version-initial-submissions-for-senior-editor-recommendation.sql'
).format(**default_query_props)
manuscript_version_for_recommendation_df = read_big_query(recommendation_sql)
print(len(manuscript_version_for_recommendation_df))
manuscript_version_for_recommendation_df.head()

> ```sql
> -- Main features:
> --    - Returns Initial Submissions for the purpose of Senior Editor recommendation
> --    - No older than a year
> --    - Not have Senior Editor assigned for more than 30 days
> 
> WITH t_manuscript_version_abstract_keywords AS (
>   SELECT
>     manuscript_abstract_keywords.manuscript_id AS manuscript_id,
>     manuscript_abstract_keywords.version_id,
>     manuscript_abstract_keywords.extracted_keywords,
>     ROW_NUMBER() OVER (
>       PARTITION BY version_id
>       ORDER BY data_hub_imported_timestamp DESC
>     ) AS version_id_row_number
>   FROM `elife-data-pipeline.de_dev.manuscript_abstract_keywords` AS manuscript_abstract_keywords
>   WHERE ARRAY_LENGTH(extracted_keywords) > 0
> ),
> 
> t_last_manuscript_version_abstract_keywords AS (
>   SELECT
>     * EXCEPT(version_id_row_number)
>   FROM t_manuscript_version_abstract_keywords
>   WHERE version_id_row_number = 1
>   ORDER BY version_id
> )
> 
> SELECT version.version_id, manuscript_version_abstract_keywords.extracted_keywords
> FROM `elife-data-pipeline.de_dev.v_manuscript_version_last_editor_assigned_timestamp` AS version
> JOIN t_last_manuscript_version_abstract_keywords AS manuscript_version_abstract_keywords
>   ON manuscript_version_abstract_keywords.version_id = version.version_id
> WHERE version.overall_stage = 'Initial Submission'
>   AND (
>     ARRAY_LENGTH(version.senior_editors) = 0
>     OR TIMESTAMP_DIFF(
>       CURRENT_TIMESTAMP,
>       (SELECT MAX(last_assigned_timestamp) FROM UNNEST(version.senior_editors)),
>       DAY
>     ) < 30
>   )
>   AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP, version.created_timestamp, DAY) < 365
>   AND NOT is_withdrawn
>   AND NOT is_deleted
> ```

Downloading: 100%|██████████| 1004/1004 [00:01<00:00, 542.23rows/s]

1004





Unnamed: 0,version_id,extracted_keywords
0,58800/2020-05-11T10:40:38Z,"[africa, amazon, area, area system, asia, avia..."
1,58813/2020-05-11T22:01:21Z,"[absence, acute, acute exosome depletion, alte..."
2,58814/2020-05-11T22:05:38Z,"[167 metabolite, abundance, activity, age, alt..."
3,58824/2020-05-12T05:10:25Z,"[accompany, accompanying gene expression profi..."
4,58836/2020-05-12T09:35:16Z,"[3', 3’ untranslated region, ability, abundant..."


In [14]:
# Cosine similarity between the TF-IDF vector of every manuscript (rows)
# and the TF-IDF vector of every editor (columns).
manuscript_tf_idf = editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
keyword_similarity = cosine_similarity(manuscript_tf_idf, editor_tf_idf)
print(keyword_similarity.max())
keyword_similarity

0.5403891148811286


array([[0.04697055, 0.03750212, 0.02328891, ..., 0.01680209, 0.02114745,
        0.0408574 ],
       [0.07382973, 0.05282344, 0.08714171, ..., 0.08619588, 0.08384541,
        0.06587759],
       [0.11656326, 0.08916004, 0.14374067, ..., 0.13773205, 0.14223228,
        0.12241562],
       ...,
       [0.11838027, 0.10295489, 0.15861449, ..., 0.2068653 , 0.15603755,
        0.12595418],
       [0.07938155, 0.03525144, 0.10785622, ..., 0.06627242, 0.10330611,
        0.05848229],
       [0.13153079, 0.0929411 , 0.12257047, ..., 0.10140502, 0.10334484,
        0.10777728]])

In [15]:
# Spot check: show the model's scores for the first manuscript only, for
# comparison with the first row of `keyword_similarity`.
first_manuscript_keywords = (
    manuscript_version_for_recommendation_df['extracted_keywords'][:1]
)
first_manuscript_ranking = weighted_keyword_valid_model.predict_ranking(
    first_manuscript_keywords
)
first_manuscript_ranking.proba_matrix

matrix([[0.04697055, 0.03750212, 0.02328891, 0.03536301, 0.01648059,
         0.03674309, 0.01235442, 0.03223446, 0.02522475, 0.12386136,
         0.01904331, 0.02162277, 0.01895693, 0.06920021, 0.02894409,
         0.07683732, 0.02187069, 0.05800008, 0.01033107, 0.03122536,
         0.02531779, 0.06864325, 0.03389681, 0.02198689, 0.01803758,
         0.02213437, 0.03323802, 0.02367386, 0.01893484, 0.01337589,
         0.03842174, 0.0143005 , 0.03216382, 0.02810269, 0.0284626 ,
         0.01515078, 0.01086461, 0.01938651, 0.03083047, 0.02618579,
         0.01259028, 0.02181141, 0.02733777, 0.03340766, 0.0285137 ,
         0.0236504 , 0.04190097, 0.08722997, 0.02397868, 0.05591717,
         0.02580026, 0.0227739 , 0.0186421 , 0.02144945, 0.04346612,
         0.0209199 , 0.03892793, 0.01914393, 0.01692726, 0.02178274,
         0.02524614, 0.03683772, 0.01680209, 0.02114745, 0.0408574 ]])

In [16]:
# For every manuscript and every editor: the weighted keywords they have in common.
manuscript_ranking = weighted_keyword_valid_model.predict_ranking(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
manuscript_matching_keywords_list = manuscript_ranking.matching_keywords_list
pd.Series(manuscript_matching_keywords_list[:5])

0    [[(0.008048785265378677, system), (0.006351496...
1    [[(0.01033776669591374, mechanism), (0.0086383...
2    [[(0.02080495625351181, cell), (0.009197099268...
3    [[(0.01681908499674384, cell), (0.011679820536...
4    [[(0.007883403813675905, mechanism), (0.006587...
dtype: object

In [17]:
def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        editors_matching_keywords_list: List[List[List[Tuple[float, str]]]],
        indices: List[T],
        threshold: float = 0.5
        ) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """
    Select and rank the recommended editors for every manuscript.

    Args:
        proba_matrix: one row per manuscript, one score per editor.
        editors_matching_keywords_list: one entry per manuscript, containing
            one list of `(keyword score, keyword)` pairs per editor.
        indices: the editor keys (e.g. names), in the same order as the
            columns of `proba_matrix`.
        threshold: the minimum score (inclusive) for an editor to be included.

    Returns:
        One list per manuscript, containing one tuple per selected editor:
        `(score, editor key, sum of the keyword scores, matching keywords)`.
        Each list is sorted in descending order; because whole tuples are
        compared, ties on the score are broken by the editor key.
    """
    return [
        sorted(
            [
                (
                    p,
                    key,
                    # total of the individual keyword scores of this editor
                    sum(s for s, _ in editor_matching_keywords),
                    editor_matching_keywords
                )
                for p, key, editor_matching_keywords in zip(
                    row,
                    indices,
                    editors_matching_keywords
                )
                if p >= threshold
            ],
            reverse=True
        )
        for row, editors_matching_keywords in zip(
            proba_matrix,
            editors_matching_keywords_list
        )
    ]


# Rank the editors per manuscript, keeping every editor whose similarity is at least 0.001.
recommended_editors_per_manuscript = get_recommended_editors_with_probability(
    keyword_similarity,
    manuscript_matching_keywords_list,
    editor_names,
    threshold=0.001
)
# Use the same index as the manuscripts so both can be joined side by side.
prediction_results_with_similarity = pd.Series(
    recommended_editors_per_manuscript,
    index=manuscript_version_for_recommendation_df.index
)
prediction_results_with_similarity[:5]

0    [(0.12386135952620322, Christian Rutz, 0.12386...
1    [(0.1935022778783016, James Manley, 0.19350227...
2    [(0.15741296725553208, Michael Marletta, 0.157...
3    [(0.22665923842273233, Satyajit Rath, 0.226659...
4    [(0.23345910990841623, James Manley, 0.2334591...
dtype: object

In [18]:
# Put every manuscript version id next to its ranked editor recommendations.
version_id_column = manuscript_version_for_recommendation_df['version_id']
prediction_frame = prediction_results_with_similarity.to_frame('prediction')
prediction_results_df = pd.concat([version_id_column, prediction_frame], axis=1)
print(len(prediction_results_df))
prediction_results_df.head()

1004


Unnamed: 0,version_id,prediction
0,58800/2020-05-11T10:40:38Z,"[(0.12386135952620322, Christian Rutz, 0.12386..."
1,58813/2020-05-11T22:01:21Z,"[(0.1935022778783016, James Manley, 0.19350227..."
2,58814/2020-05-11T22:05:38Z,"[(0.15741296725553208, Michael Marletta, 0.157..."
3,58824/2020-05-12T05:10:25Z,"[(0.22665923842273233, Satyajit Rath, 0.226659..."
4,58836/2020-05-12T09:35:16Z,"[(0.23345910990841623, James Manley, 0.2334591..."


In [19]:
# Inspect the complete recommendation list of the first manuscript.
prediction_column = prediction_results_df['prediction']
prediction_column[0]

[(0.12386135952620322,
  'Christian Rutz',
  0.12386135952620335,
  [(0.013372672652604752, 'specialist'),
   (0.01306024230055606, 'specie'),
   (0.011640104213490007, 'bird'),
   (0.010581124595948822, 'biodiversity'),
   (0.008868827536664646, 'high'),
   (0.008000238594538108, 'extinction'),
   (0.005773448569033431, 'system'),
   (0.005618507481017212, 'specialization'),
   (0.005408492726581766, 'distribution'),
   (0.005290562297974411, 'generalist'),
   (0.0047113523572990855, 'hotspot'),
   (0.004640124467704867, 'higher number'),
   (0.004499724968714192, 'scale'),
   (0.003700547197878242, 'planning'),
   (0.0035160913456824517, 'avian'),
   (0.002814206190509656, 'strategy'),
   (0.0027842940851605727, 'hemisphere'),
   (0.002016987858891522, 'percentage'),
   (0.0016235335410349886, 'conservation'),
   (0.0015920243539456638, 'area'),
   (0.0014965962253007657, 'global'),
   (0.0010894461690401508, 'exist'),
   (0.0009904411462858002, 'overall'),
   (0.0007717686503461885,

In [20]:
# Flatten the predictions: one row per (manuscript version, recommended editor) pair.
flat_prediction_records = []
for row in prediction_results_df.itertuples():
    for predicted_editor in row.prediction:
        editor_name = predicted_editor[1]
        matching_keywords = [
            {'score': keyword_score, 'keyword': keyword}
            for keyword_score, keyword in predicted_editor[3]
        ]
        flat_prediction_records.append({
            'version_id': row.version_id,
            'score': predicted_editor[0],
            'name': editor_name,
            'person_id': editor_person_id_by_name_map[editor_name],
            'matching_keyword_score': predicted_editor[2],
            'matching_keywords': matching_keywords,
        })
prediction_results_flat_df = pd.DataFrame(flat_prediction_records)
print(len(prediction_results_flat_df))
prediction_results_flat_df.head()

65257


Unnamed: 0,version_id,score,name,person_id,matching_keyword_score,matching_keywords
0,58800/2020-05-11T10:40:38Z,0.123861,Christian Rutz,15332,0.123861,"[{'score': 0.013372672652604752, 'keyword': 's..."
1,58800/2020-05-11T10:40:38Z,0.08723,Neil Ferguson,12065,0.08723,"[{'score': 0.01043147626191354, 'keyword': 'so..."
2,58800/2020-05-11T10:40:38Z,0.076837,Diethard Tautz,1191,0.076837,"[{'score': 0.011186019263284155, 'keyword': 's..."
3,58800/2020-05-11T10:40:38Z,0.0692,Detlef Weigel,1030,0.0692,"[{'score': 0.008488720473132524, 'keyword': 's..."
4,58800/2020-05-11T10:40:38Z,0.068643,George Perry,42011,0.068643,"[{'score': 0.011924732412445911, 'keyword': 'a..."


In [21]:
# Number of distinct manuscript versions that have at least one recommended editor.
prediction_results_flat_df['version_id'].nunique()

1004

In [22]:
# Column-wise maxima as a quick sanity check of the flattened results.
# 'matching_keywords' holds lists of dicts, which cannot be ordered, so it is
# excluded explicitly rather than relying on pandas dropping it silently
# (newer pandas versions raise a TypeError instead).
# Note: 'person_id' holds strings, so its maximum is lexicographic, not numeric.
prediction_results_flat_df.drop(columns=['matching_keywords'], errors='ignore').max()

version_id                59773/2020-06-08T03:01:29Z
score                                       0.540389
name                                   Wendy Garrett
person_id                                      97155
matching_keyword_score                      0.540389
dtype: object

In [23]:
# Write the flattened recommendations to the output table (if_exists='replace').
print('writing to:', recommendation_output_table_name)
gbq_write_options = dict(project_id=project_id, if_exists='replace')
to_gbq(
    prediction_results_flat_df,
    recommendation_output_table_name,
    **gbq_write_options
)
print('done')

writing to: de_dev.data_science_editor_recommendation
done
