In [None]:
# Run parameters for this notebook.
# NOTE(review): these look like values that are overridden per environment from
# outside the notebook — confirm before restructuring this cell.
# GCP project used for the BigQuery reads and for loading the results
project_id = 'elife-data-pipeline'
# dataset substituted into the source SQL
source_dataset = 'de_dev'
# dataset the recommendation table is written to
output_dataset = 'de_dev'
# prepended to the name of the output table
output_table_prefix = 'data_science_'
# passed as the limit of the manuscript query; None presumably means "no limit" — TODO confirm
max_manuscripts = None
# NOTE(review): the two `*_tf` settings below are not referenced by any cell
# visible here — confirm whether they are still needed
manuscript_min_tf = 10
manuscript_max_tf = 0.9
# base location that holds the persisted model file
state_path = 's3://ci-elife-data-pipeline/airflow-config/data-science/state-dev'

In [None]:
import os
from functools import partial
from typing import List, Tuple, TypeVar, Iterable

import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

from google.cloud.bigquery import WriteDisposition

import data_science_pipeline.configure_warnings  # pylint: disable=unused-import
import data_science_pipeline.configure_notebook_logging  # pylint: disable=unused-import

from data_science_pipeline.sql import get_sql
from data_science_pipeline.utils.bq import (
    load_file_into_bq_with_auto_schema,
    with_limit_sql
)
from data_science_pipeline.utils.io import load_object_from
from data_science_pipeline.utils.jupyter import (
    read_big_query as _read_big_query
)
from data_science_pipeline.peerscout.models import (
    WeightedKeywordModel
)
from data_science_pipeline.utils.json import (
    remove_key_with_null_value,
    json_list_as_jsonl_file
)
from data_science_pipeline.utils.editor_recommendation import (
    get_author_ids_of_given_version_of_manuscript
)

In [None]:
# Name of the BigQuery table that receives the recommendations.
recommendation_output_table_name = f'{output_table_prefix}reviewing_editor_recommendation'
# Location of the persisted reviewing editor model below the state path.
model_path = os.path.join(state_path, 'reviewing_editor_model.joblib')

In [None]:
# Load the persisted model state and display its keys as a quick check of
# what the state file contains.
print('loading model from:', model_path)
model_dict = load_object_from(model_path)
model_dict.keys()

In [None]:
# Vectorizer stored with the model; used below to transform the manuscript
# keywords and to build the weighted keyword model.
editor_tf_idf_vectorizer = model_dict['editor_tf_idf_vectorizer']

In [None]:
# Editor TF-IDF matrix from the model state. `.todense()` is called on it below,
# so it is presumably a scipy sparse matrix — TODO confirm.
editor_tf_idf = model_dict['editor_tf_idf']

In [None]:
# Editor names; used below as the `choices` of the keyword model and as the keys
# of the similarity results.
# NOTE(review): presumably ordered like the rows of `editor_tf_idf` — confirm.
editor_names = model_dict['editor_names']

In [None]:
# Editor person ids; zipped with `editor_names` below, so both sequences are
# expected to be in the same order.
editor_person_ids = model_dict['editor_person_ids']

In [None]:
# Lookup from editor name to person id (the two sequences are paired by position).
editor_person_id_by_name_map = {
    editor_name: person_id
    for editor_name, person_id in zip(editor_names, editor_person_ids)
}
editor_person_id_by_name_map

In [None]:
# Build the keyword model from the dense form of the editor TF-IDF matrix, with
# the editor names as choices. It is used below for its `matching_keywords_list`.
weighted_keyword_valid_model = WeightedKeywordModel.from_tf_matrix(
    editor_tf_idf.todense(),
    vectorizer=editor_tf_idf_vectorizer,
    choices=editor_names
)

In [None]:
# Bind the project id once so that later calls only need to pass the SQL.
read_big_query = partial(_read_big_query, project_id=project_id)

In [None]:
# Placeholder values substituted into the SQL templates.
default_query_props = {
    'project': project_id,
    'dataset': source_dataset,
}
default_query_props

In [None]:
# we are using the same manuscript list used for senior editor recommendation
# this is because we want to recommend reviewing editors to consult with (not for assignment)
manuscript_version_sql = get_sql(
    'manuscript-version-initial-submissions-for-senior-editor-recommendation.sql'
).format(**default_query_props)
manuscript_version_for_recommendation_df = read_big_query(
    with_limit_sql(manuscript_version_sql, limit=max_manuscripts)
)
manuscript_version_for_recommendation_df.head(3)

In [None]:
# Vectorize the keywords of every manuscript with the editor vectorizer, then
# compare each manuscript with each editor.
manuscript_keywords_tf_idf = editor_tf_idf_vectorizer.transform(
    manuscript_version_for_recommendation_df['extracted_keywords']
)
keyword_similarity = cosine_similarity(manuscript_keywords_tf_idf, editor_tf_idf)
print("max keyword_similarity: ", keyword_similarity.max())


In [None]:
# weighted_keyword_valid_model.predict_ranking(
#     manuscript_version_for_recommendation_df['extracted_keywords'][:1],
# ).proba_matrix

In [None]:
# Matching keywords per manuscript, taken from the keyword model's ranking.
manuscript_keywords = manuscript_version_for_recommendation_df['extracted_keywords']
manuscript_matching_keywords_list = (
    weighted_keyword_valid_model
    .predict_ranking(manuscript_keywords)
    .matching_keywords_list
)
pd.Series(manuscript_matching_keywords_list[:5])

In [None]:
T = TypeVar('T')


def get_recommended_editors_with_probability(
        proba_matrix: List[List[float]],
        editors_matching_keywords_list: List[List[List[Tuple[float, str]]]],
        indices: List[T],
        threshold: float = 0.5) -> List[List[Tuple[float, T, float, List[Tuple[float, str]]]]]:
    """
    Select, for every row of `proba_matrix`, the entries that reach `threshold`.

    Args:
        proba_matrix: one row of scores per item, one score per entry of `indices`.
        editors_matching_keywords_list: for every row, one list of
            `(keyword score, keyword)` tuples per entry of `indices`.
        indices: the key reported for each column of `proba_matrix`.
        threshold: minimum score (inclusive) for an entry to be kept.

    Returns:
        One list per row containing
        `(score, key, sum of keyword scores, matching keywords)` tuples,
        sorted in descending tuple order (highest score first).
    """
    recommendations = []
    for row_scores, row_matching_keywords in zip(
            proba_matrix, editors_matching_keywords_list):
        row_recommendations = []
        for score, key, matching_keywords in zip(
                row_scores, indices, row_matching_keywords):
            if score >= threshold:
                keyword_score_total = sum(
                    keyword_score for keyword_score, _ in matching_keywords
                )
                row_recommendations.append(
                    (score, key, keyword_score_total, matching_keywords)
                )
        # whole tuples are compared, so equal scores are ordered by the key next
        row_recommendations.sort(reverse=True)
        recommendations.append(row_recommendations)
    return recommendations


# Keep every editor whose keyword similarity reaches the cut-off, one list per
# manuscript, aligned with the manuscript data frame via its index.
recommended_editors_per_manuscript = get_recommended_editors_with_probability(
    proba_matrix=keyword_similarity,
    editors_matching_keywords_list=manuscript_matching_keywords_list,
    indices=editor_names,
    threshold=0.001
)
prediction_results_with_similarity = pd.Series(
    recommended_editors_per_manuscript,
    index=manuscript_version_for_recommendation_df.index
)
prediction_results_with_similarity[0:5]

In [None]:
# Pair every manuscript version id with its list of recommended editors.
prediction_results_df = pd.concat(
    [
        manuscript_version_for_recommendation_df['version_id'],
        prediction_results_with_similarity.rename('prediction'),
    ],
    axis=1
)
print("len of prediction_results_df: ", len(prediction_results_df))
prediction_results_df.head(3)

In [None]:
# prediction_results_df['prediction'][0]

In [None]:
# Report the memory footprint of the results; 'deep' asks pandas to also
# introspect the contents of object columns.
print("'prediction_results_df' memory usage:")
prediction_results_df.info(memory_usage='deep')

In [None]:
# Show one manuscript row as a reminder of the columns available to the next cell.
manuscript_version_for_recommendation_df.head(1)

In [None]:
def iter_prediction_results(
    prediction_df: pd.DataFrame,
) -> Iterable[dict]:
    """
    Yield one output record per manuscript version and recommended editor.

    Editors who are among the authors of the manuscript version are skipped
    (a message is printed for each of them).

    Relies on the notebook-level `editor_person_id_by_name_map` and
    `manuscript_version_for_recommendation_df`.

    Args:
        prediction_df: data frame with a `version_id` column and a `prediction`
            column holding, per row, a list of
            `(score, editor name, matching keyword score, matching keywords)`
            tuples, where matching keywords are `(keyword score, keyword)` tuples.

    Yields:
        The record dict for each remaining recommendation, passed through
        `remove_key_with_null_value`.

    Raises:
        KeyError: if an editor name is missing from `editor_person_id_by_name_map`.
    """
    for row in prediction_df.itertuples():
        # The author ids only depend on the manuscript version, so they are looked
        # up at most once per row rather than once per recommended editor. The
        # lookup stays lazy so rows without any prediction do not trigger it.
        author_ids = None
        for score, name, matching_keyword_score, matching_keywords in row.prediction:
            person_id = editor_person_id_by_name_map[name]
            if author_ids is None:
                author_ids = get_author_ids_of_given_version_of_manuscript(
                    manuscript_version_for_recommendation_df,
                    row.version_id
                )
            # NOTE(review): assumes `in` tests the author id values themselves
            # (e.g. a list or set) — confirm the helper's return type.
            if person_id in author_ids:
                print(f"Excluding person_id: {person_id} who is an author of the paper {row.version_id}.")
                continue
            yield remove_key_with_null_value({
                'version_id': row.version_id,
                'score': score,
                'name': name,
                'person_id': person_id,
                'matching_keyword_score': matching_keyword_score,
                'matching_keywords': [{
                    'score': keyword_score,
                    'keyword': keyword
                } for keyword_score, keyword in matching_keywords],
            })

In [None]:
# Write the recommendation records to a temporary JSONL file and load it into
# BigQuery, replacing the previous content of the output table.
with json_list_as_jsonl_file(
    iter_prediction_results(prediction_results_df)
) as jsonl_file:
    load_file_into_bq_with_auto_schema(
        jsonl_file=jsonl_file,
        project_id=project_id,
        write_mode=WriteDisposition.WRITE_TRUNCATE,
        dataset_name=output_dataset,
        table_name=recommendation_output_table_name,
    )
