In [94]:
import requests
import pandas as pd
from urllib.parse import urlencode
import math
import concurrent.futures
import itertools
import time
import os
import progressbar
import numpy as np
from top2vec import Top2Vec
import umap.umap_ as umap
import hvplot.pandas
import openai
from thefuzz import fuzz
from contextlib import suppress

# Import documents

There are several limitations of the EV API, all related to the fact that it mirrors the web interface:
1. Maximum page size is 100 records
1. The maximum number of records returned is 5000 (e.g., max offset is 4900 with 100 records per page)

To work around the maximum record restriction, we can break the search results up by year (startYear and endYear). This works as long as the number of records in any single year is 5000 or fewer. Since many years have more than 5000 records, we further break slice by article type (conference articles and everything but. We use EV's controlled vocabulary to search, with the following search terms:

1. ((({reliability} WN CV) AND ({english} WN LA)) NOT ({ca} WN DT)) -- 109,731 records
1. ((({reliability} WN CV) AND ({english} WN LA)) AND ({ca} WN DT)) -- 84,674 records

The earliest year for these results is 1907, so we check all years from 1907 to 2023.

In [95]:
API_KEY = os.environ['ELSEVIER_API_KEY']
BASE_URL = 'https://api.elsevier.com/content/ev/results?'
QUERIES = [
    r'(((("reliability engineering") WN ALL)) AND ({english} WN LA))'
]

year_range = range(2023, 1907, -1)
results = []

def get_results_by_year(query_year, base_url=BASE_URL, api_key=API_KEY):
    """
    Get results for a given query and year (as a tuple). 
    Returns a list of results.
    """
    params = {
        'apiKey': api_key,
        'pageSize': 100,
        'query': query_year[0],
        'offset': 0,
        'startYear': query_year[1],
        'endYear': query_year[1]
    }

    url = base_url + urlencode(params)

    # Permit retry 3 times after a 1 second delay
    try_count = 0
    while try_count < 3:
        try:
            r = requests.get(url)
            n_results = r.json()['PAGE']['RESULTS-COUNT']
            print('.', end='')
            
            # EV API has a limit of 5000 results per query
            if n_results > 5000:
                raise('Too many results: ' + str(n_results))
            
            if n_results == 0:
                print('0', end='')
                break
            
            first_offset = params['pageSize']
            last_offset = (
                math.floor(n_results / params['pageSize']) 
                * params['pageSize'] + first_offset
                )

            # Loop through each page
            for doc in r.json()['PAGE']['PAGE-RESULTS']['PAGE-ENTRY']:
                results.append(doc['EI-DOCUMENT']['DOC']['DOC-ID'])
            break
        except:
            print('e', end='')
            time.sleep(1)
            try_count = try_count + 1

    if n_results > 0:
        for offset in range(first_offset, last_offset, first_offset):
            params['offset'] = offset
            url = base_url + urlencode(params)

            # Permit retry 3 times after a 1 second delay
            try_count = 0
            while try_count < 3:
                try:
                    r = requests.get(url)
                    for doc in r.json()['PAGE']['PAGE-RESULTS']['PAGE-ENTRY']:
                        results.append(doc['EI-DOCUMENT']['DOC']['DOC-ID'])
                    print('.', end='')
                    break
                except:
                    print('e', end='')
                    time.sleep(1)
                    try_count = try_count + 1


# Only run if the file doesn't already exist
if not os.path.exists('data/search_results.csv'):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(
            get_results_by_year, 
            itertools.product(QUERIES, year_range)
            )

    search_df = pd.DataFrame({'doc_id': results})
    search_df.to_csv('data/search_results.csv', index=False)

search_df = pd.read_csv('data/search_results.csv')
search_df

Unnamed: 0,doc_id
0,inspec_69a35fd157e37d21c3M7fe010178163171
1,cpx_5b25cedf167c1c49bafM7aab1017816339
2,cpx_11e9ecb114f4c825fb8M77ea10178163171
3,inspec_41ae3d3c15872bf4f78M54e710178163171
4,inspec_b7bc0be15903b6abb3M5a1210178163171
...,...
31784,cpx_2aa92d8d15f6db3b4f6M736b10178163176
31785,c84_1a5ae9afd8076e3a3M786019817173212
31786,c84_12aa9a2fa58cb27b3M614e19817173212
31787,c84_8676a8fa39e739edM701b19817173212


We can collect actual records in blocks of ~50. This is because the URL becomes too long if we include more than that number of docIds in the GET request.

In [96]:
# Get the result for each document
BASE_URL = 'https://api.elsevier.com/content/ev/records?'

records_dict = {
    'doc_id': [],
    'doi': [],
    'title': [],
    'abstract': [],
    'doc_type': [],
    'year': [],
    'publisher': [],
    'source_title': [],
    'authors': [],
    'author_affiliations': [],
    'country_of_origin': []
}

res_len = search_df.shape[0]
docids_chunked = [search_df['doc_id'][i:i+50] for i in range(0,res_len,50)]
result_count = 0

def get_records_by_chunk(docids, base_url=BASE_URL, api_key=API_KEY):
    """
    Get records for a given chunk of docids. Appends to the global records_df.
    """
    global bar
    global result_count

    params = {
        'docId': ','.join(docids),
        'apiKey': api_key
    }
    url = base_url + urlencode(params)

    # Permit retry 3 times after a 1 second delay
    try_count = 0
    while try_count < 3:
        try:
            r = requests.get(url)
            results = r.json()['PAGE']['PAGE-RESULTS']['PAGE-ENTRY']

            for document in results:
                with suppress(KeyError):
                    # Account for missing data (e.g. missing JSON keys)
                    doi = None
                    doi = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['DO']
                    doi = pd.NA if not doi

                    title = None
                    title = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['TI']
                    title = pd.NA if not title

                    abstract = None
                    abstract = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['AB']
                    abstract = pd.NA if not abstract

                    doc_type = None
                    doc_type = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['DT']
                    doc_type = pd.NA if not doc_type

                    # Publication year has several options
                    year = None
                    year = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['PY']
                    if not year:
                        year = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['YR']
                    if not year:
                        year = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['SD']
                    if not year:
                        year = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['PD_YR']
                    if not year:
                        year = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['CPR']
                    year = pd.NA if not year

                    publisher = None
                    publisher = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['PF']
                    if not publisher:
                        publisher = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['PN']
                    publisher = pd.NA if not publisher

                    source = None
                    source = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['MT']
                    if not source:
                        source = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['CF']
                    if not source:
                        source = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['RIL']
                    source = pd.NA if not source_title

                    authors = None
                    authors = document['EI-DOCUMENT']['AUS']
                    authors = pd.NA if not authors

                    author_affiliations = None
                    author_affiliations = document['EI-DOCUMENT']['AFS']
                    author_affiliations = pd.NA if not author_affiliations

                    country = None
                    country = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['CO']
                    if not country_of_origin:
                        country = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['ML']
                    if not country_of_origin:
                        country = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['PL']
                    if not country_of_origin:
                        country = document['EI-DOCUMENT']['DOCUMENTPROPERTIES']['PLA']
                    country = pd.NA if not country_of_origin

                (records_dict['doc_id']
                    .append(document['EI-DOCUMENT']['DOC']['DOC-ID']))
                records_dict['doi'].append(doi)
                records_dict['title'].append(title)
                records_dict['abstract'].append(abstract)
                records_dict['doc_type'].append(doc_type)
                records_dict['year'].append(year)
                records_dict['publisher'].append(publisher)
                records_dict['source_title'].append(source)
                records_dict['authors'].append(authors)
                records_dict['author_affiliations'].append(author_affiliations)
                records_dict['country_of_origin'].append(country)

            bar.update(result_count + 1)
            result_count = result_count + 1
            break
        except Exception as e:
            time.sleep(1)
            try_count = try_count + 1

if not os.path.exists('data/records.csv'):
    widgets = [
        progressbar.Percentage(), 
        progressbar.GranularBar(markers=' ▁▂▃▄▅▆▇█'), 
        ' Chunk ', progressbar.widgets.Counter(), ' of ', str(len(docids_chunked)), 
        ' | ', progressbar.ETA(), 
    ]
    bar = progressbar.ProgressBar(widgets=widgets, max_value=len(docids_chunked)).start()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(get_records_by_chunk, docids_chunked)
    bar.finish()
    
    records_df = pd.DataFrame(records_dict)

    # Clean up the year column
    records_df['year'] = records_df['year'].str.extract(r'(\d{4})')

    # Filter out records with missing abstracts and drop dups
    records_df = records_df.query('@pd.notna(abstract)').drop_duplicates(subset='doc_id')
    records_df.to_csv('data/records.csv', index=False)

records_df = pd.read_csv('data/records.csv')
records_df

Unnamed: 0,doc_id,doi,title,abstract,doc_type,year,publisher,source_title,authors,author_affiliations,country_of_origin
0,cpx_2a67e3b418670970bc9M66a31017816363,10.1002/qre.3284,The following article for this Special Issue w...,"<div data-language=""eng"" data-ev-field=""abstra...",Journal article (JA),2023,John Wiley and Sons Ltd,Quality and Reliability Engineering International,,,
1,cpx_M65fcd0f4185e0617d4dM7f841017816363,10.1002/qre.3248,A bibliography of the literature on process ca...,"<div data-language=""eng"" data-ev-field=""abstra...",Article in Press,2023,John Wiley and Sons Ltd,Quality and Reliability Engineering International,"{'AU': [{'ID': '1', 'EMAIL': 'yum3116@kaist.ac...","{'AF': [{'ID': '1', 'NAME': 'Emeritus, Departm...",
2,cpx_M145e5f651850816b0fbM7db61017816355,10.1002/qre.3240,A misuse of the EWMA-type statistic in accepta...,"<div data-language=""eng"" data-ev-field=""abstra...",Journal article (JA),2023,John Wiley and Sons Ltd,Quality and Reliability Engineering International,"{'AU': [{'ID': '1', 'EMAIL': 'aaabdulhaq@yahoo...","{'AF': [{'ID': '1', 'NAME': 'Department of Sta...",
3,cpx_M5b332a91849c00712fM59f61017816355,10.1016/j.ress.2022.108890,Deep imbalanced domain adaptation for transfer...,"<div data-language=""eng"" data-ev-field=""abstra...",Journal article (JA),2023,Elsevier Ltd,Reliability Engineering and System Safety,"{'AU': [{'ID': '1', 'AFS': {'AFID': [1, 3]}, '...","{'AF': [{'ID': '1', 'NAME': 'School of Mechani...",
4,cpx_M49bad31849c02f4ebM618c1017816355,10.1016/j.ress.2022.108953,QB-II for evaluating the reliability of binary...,"<div data-language=""eng"" data-ev-field=""abstra...",Journal article (JA),2023,Elsevier Ltd,Reliability Engineering and System Safety,"{'AU': [{'ID': '1', 'EMAIL': 'yeh@ieee.org', '...","{'AF': [{'ID': '1', 'NAME': 'Department of Ind...",
...,...,...,...,...,...,...,...,...,...,...,...
30486,c84_bae4fef9c2b7fc84M60ab19817173212,,Elements of practical reliability-engineering ...,"Primary reliability concepts and terms, such a...",Journal article (JA),1963,,Electro-Technology,"{'AU': [{'NAME': 'Royal, E.L.'}]}",,
30487,c84_7bd950f98fe10199M513519817173212,,Curriculum for reliability engineering,Proposal for educational program is basically ...,Journal article (JA),1963,,Environmental Quarterly,"{'AU': [{'NAME': 'Jones, H.C.'}]}",,
30488,c84_1a72683fa3433e71aM7e9019817173212,,Reliability engineering -- Its application to ...,Engineering reliability program establishes st...,Journal article (JA),1963,,Agricultural Engineering,"{'AU': [{'NAME': 'Archer, R.C.'}]}",,
30489,c84_182d8dffa345009a8M750919817173212,,Reliability engineering review -- Effective ma...,In Space Systems at Philco Western Development...,Conference article (CA),1963,Institute of Electrical and Electronics Engine...,Proceedings of the 7th IEEE -- National Conven...,"{'AU': [{'NAME': 'DeVille, W.W.'}]}",,


# Create top2vec model

In [228]:
corpus = records_df['abstract'].tolist()
document_ids = records_df['doc_id'].tolist()

if not os.path.exists('data/top2vec_model.mdl'):
    model = Top2Vec(corpus, speed='learn', document_ids=document_ids, workers=10)
    _ = model.hierarchical_topic_reduction(num_topics=11) # 11 topics is the sweet spot
    model.save('data/top2vec_model.mdl')
model = Top2Vec.load('data/top2vec_model.mdl')
_ = model.hierarchical_topic_reduction(num_topics=11) # reduction isn't saved

# Apply labels

In [98]:
# Make labels
model_words = model.topic_words_reduced[model.doc_top_reduced,0:3]
labels = np.array([', '.join(x) for x in model_words])
topic_ids = model.doc_top_reduced

reduced_df = pd.DataFrame({
    'label': labels,
    'topic_id': topic_ids,
    'doc_id': model.document_ids
})
reduced_df.label = reduced_df.label.astype('category')

# Join reduced_df with records_df to get full dataset
records_df = records_df.join(reduced_df.set_index('doc_id'), on='doc_id')

# Visualize model

In [99]:
# Create UMAP model for visualization. This will take ~1 min to run.
vectors = model.document_vectors
mapping = umap.UMAP(
    n_neighbors=100, 
    min_dist = 0.0, 
    n_components=2, 
    metric='cosine', 
    verbose=True, 
    n_epochs=1000
    )
reduced_fit_transform = mapping.fit_transform(vectors)
transform_df = pd.DataFrame(reduced_fit_transform, columns=['x', 'y'])
transform_df['doc_id'] = model.document_ids
transform_df.to_csv('data/transform_df.csv', index=False)

# Plot the reduced dimensionality data
records_df.join(transform_df.set_index('doc_id'), on='doc_id').hvplot(
    'x',
    'y',
    by='label',
    kind='scatter',
    width=1500,
    height=1000,
    size=2,
    alpha=0.2,
    legend=False,
    yaxis=False,
    xaxis=False,
    hover_cols=['doc_id', 'title']
).opts(bgcolor='#111111')

UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_epochs=1000, n_neighbors=100, verbose=True)
Fri Apr 14 20:32:36 2023 Construct fuzzy simplicial set
Fri Apr 14 20:32:36 2023 Finding Nearest Neighbors
Fri Apr 14 20:32:36 2023 Building RP forest with 14 trees
Fri Apr 14 20:32:36 2023 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	 5  /  15
	Stopping threshold met -- exiting after 5 iterations
Fri Apr 14 20:32:43 2023 Finished Nearest Neighbor Search
Fri Apr 14 20:32:44 2023 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

Fri Apr 14 20:33:11 2023 Finished embedding


# Generate topic list

In [220]:
# Get most relevant documents for each topic
rep_docs = pd.DataFrame()
for topic_num in model.get_topics(reduced=False)[2]:
    result = model.search_documents_by_topic(topic_num=topic_num, num_docs=1)
    row = {
        'topic_num': topic_num,
        'topic_words': ', '.join(model.topic_words[topic_num,0:3]),
        'doc_count': model.topic_sizes[topic_num],
        'title': records_df.query(f'doc_id == "{result[2][0]}"')['title'].values[0]
    }
    rep_docs = pd.concat([rep_docs, pd.DataFrame(row, index=[0])])

rep_docs.to_csv('data/topic_list.csv', index=False)
representative_docs

Unnamed: 0,topic_num,topic_words,doc_count,title
0,0,"weibull, censoring, censored",824,E-Bayesian estimation of reliability character...
0,1,"engineering, assurance, disciplines",806,Reliability engineering
0,2,"replacement, preventive, maintenance",777,A condition-based maintenance policy for stoch...
0,3,"repair, repairable, markov",439,Transient analysis of reliability with and wit...
0,4,"political, risk, perceptions",434,Meaning and contextualisation in risk assessment
...,...,...,...,...
0,292,"go, repairable, logic",24,A new reliability analysis method for repairab...
0,293,"dsfs, detectability, blade",23,Sequential projection pursuit for optimised vi...
0,294,"sr, recommended, signed",21,IEEE Recommended Practice on Software Reliability
0,295,"cpn, timed, petri",21,Backward reachability of Colored Petri Nets fo...


# Practicality dimension

In [101]:
# Randomly sample 1000 documents from the full dataset
if not os.path.exists('data/sample.csv'):
    sample = records_df.sample(1000)[['doc_id', 'abstract']]
    sample.to_csv('data/sample.csv', index=False)

# Load sample from csv file with annotations
sample = pd.read_csv('data/sample.csv')
sample['real_world'] = sample['real_world'].astype('int')
sample

Unnamed: 0,doc_id,abstract,real_world
0,cpx_M24b5f0df16b6c26c8acM7d0310178163167,This is the first in a series of three papers ...,2
1,cpx_M5d85fe5a1660ca4f943M66311017816339,This article develops reliability models for s...,2
2,ntis_765291f78cb79645663119817173108,The required satellite mission durations and l...,2
3,cpx_1893600,A safety and reliability study of a H//2S stor...,2
4,cpx_b753f811bfbca5090M5eb82061377553,In the design of complex systems there is a gr...,1
...,...,...,...
181,cpx_217f9cf317fae433283M77b51017816328,"<div data-language=""eng"" data-ev-field=""abstra...",0
182,inspec_4dff6ff915a24abea77M561710178163171,The choice of maintenance strategy is a common...,0
183,cpx_M7366d7941776381ccd9M7bc010178163190,Modern distributed systems are supposed to be ...,0
184,inspec_13f304510b145d53c2M6b312061377553,"In this paper, we introduce a methodology for ...",2


In [102]:
# Create a jsonl file with one line per row in sample. Each line is a json object with key 'prompt' and value the abstract and key 'completion' and value real_world
if not os.path.exists('data/sample.jsonl'):
    sample_jsonl = sample.apply(lambda x: {'prompt': x['abstract'], 'completion': x['real_world']}, axis=1).to_json(orient='records', lines=True)
    with open('data/sample.jsonl', 'w') as f:
        f.write(sample_jsonl)

In [103]:
openai.api_key = os.environ['OPENAI_API_KEY']

def classify_abstract(abstract):


    delay = 1 / (3500 * 60) * 10 # (s) 3500 requests per minute times 10 workers
    time.sleep(delay)
    system_prompt = (
        'Provided academic abstracts as prompts, classify them as one of the following: '
        '0: no explicit mention of an example, '
        '1: mentions an illustrative example or demonstration, or '
        '2: explicitly mentions a case study. '
        'If an abstract mentions a case study and an example, classify it as 2. '
        'Respond only with the classification ID number (0, 1, or 2).'
    )

    prompt = abstract

    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                max_tokens=1,
                temperature=1,
                messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt}
                    ]
                )
            break
        except:
            time.sleep(60) # wait 1 minute and try again
            pass
    try:
        classification = response['choices'][0]['message']['content']
    except:
        classification = '-1'

    return classification
    
import textwrap
example_abstract = records_df['abstract'].sample(1).values[0]
print('\n'.join(textwrap.wrap(example_abstract, 100)))
print('Classification: ' + classify_abstract(example_abstract))

Dynamically programmable data probes assist in solving verification issues of autonomous systems.
Data probes can be programmed to monitor timed sequences of system data, to check system properties,
and to stimulate and control autonomous systems. We present several probe programs to inspect and
check various aspects of a safety invariant during a mobile robot's task execution. We report on
experience and identify open issues.<br/> &copy; 2017 IEEE.
Classification: 1


In [104]:
# Map the abstracts to the classification. This will cost $.
if not os.path.exists('data/sample_classified.csv'):
    sample['classification'] = sample['abstract'].apply(classify_abstract)
    sample.to_csv('data/sample_classified.csv', index=False)

sample = pd.read_csv('data/sample_classified.csv')
sample

Unnamed: 0,doc_id,abstract,real_world,classification
0,cpx_M24b5f0df16b6c26c8acM7d0310178163167,This is the first in a series of three papers ...,2,2
1,cpx_M5d85fe5a1660ca4f943M66311017816339,This article develops reliability models for s...,2,2
2,ntis_765291f78cb79645663119817173108,The required satellite mission durations and l...,2,1
3,cpx_1893600,A safety and reliability study of a H//2S stor...,2,1
4,cpx_b753f811bfbca5090M5eb82061377553,In the design of complex systems there is a gr...,1,1
...,...,...,...,...
181,cpx_217f9cf317fae433283M77b51017816328,"<div data-language=""eng"" data-ev-field=""abstra...",0,1
182,inspec_4dff6ff915a24abea77M561710178163171,The choice of maintenance strategy is a common...,0,0
183,cpx_M7366d7941776381ccd9M7bc010178163190,Modern distributed systems are supposed to be ...,0,1
184,inspec_13f304510b145d53c2M6b312061377553,"In this paper, we introduce a methodology for ...",2,2


In [105]:
# Check our accuracy
tf = 0 == sample['real_world'] - sample['classification']
tf.apply(int).sum()/len(tf)

0.7043010752688172

In [145]:
if not os.path.exists('data/records_df_examples.csv'):
    # This will cost $$$.
    from pandarallel import pandarallel
    pandarallel.initialize(
        progress_bar=True,
        verbose=0,
        nb_workers=10
        )
    
    records_df['examples'] = records_df['abstract'].parallel_apply(classify_abstract)
    records_df.to_csv('data/records_df_examples.csv', index=False)

records_df = pd.read_csv('data/records_df_examples.csv')
records_df.label = records_df.label.astype('category')

# Filtering out some categories

In [113]:
records_df.groupby(['topic_id', 'label']).count()['doc_id'].sort_values(ascending=False)[0:11]

topic_id  label                                      
0         bug, developers, software                      4211
1         engineering, organizations, development        4064
2         weibull, estimation, estimators                3737
3         cut, minimal, binary                           2841
4         electron, silicon, oxide                       2542
5         human, experts, hra                            2479
6         preventive, replacement, maintenance           2414
7         charts, chart, shewhart                        2333
8         infrastructure, transportation, disruptions    2204
9         rul, prediction, prognostic                    1865
10        nuclear, plants, reactor                       1853
Name: doc_id, dtype: int64

In [115]:
# Filled chart with non-relevant categories filtered out
pd.options.mode.chained_assignment = None
filtered_labels = [
    'bug, developers, software', 
    'infrastructure, transportation, disruptions', 
    'nuclear, plants, reactor'
    ]
records_df_filtered = records_df.query('label not in @filtered_labels')
records_df_filtered['label'] = records_df_filtered['label'].cat.remove_unused_categories()

In [118]:
# Establish names for each label
label_names = {
    'engineering, organizations, development': 'Management',
    'weibull, estimation, estimators': 'Statistics',
    'cut, minimal, binary': 'Modeling',
    'human, experts, hra': 'Risk Assessment',
    'preventive, replacement, maintenance': 'Maintenance',
    'charts, chart, shewhart': 'Quality Control',
    'rul, prediction, prognostic': 'Prognostics',
    'electron, silicon, oxide': 'Physics of Failure',
}

# Establish timing for each name
name_timing = {
    'Management': 1, # Applies from start of project
    'Statistics': 4, # Can only be used once testing starts
    'Modeling': 3, # Can be used effectively at PDR+
    'Risk Assessment': 2, # Can be used in concept +
    'Maintenance': 6, # Applies once product is in field
    'Quality Control': 5, # Applies after product is in production
    'Prognostics': 6, # Applies after product is in field
    'Physics of Failure:': 3 # Can be used in PDR+
}

# Apply those names and timings to the dataframe
records_df_filtered['label_name'] = records_df_filtered['label'].map(label_names)
records_df_filtered['timing'] = records_df_filtered['label_name'].map(name_timing)
records_df_filtered.to_csv('data/records_df_filtered.csv', index=False)
records_df_filtered

Unnamed: 0,doc_id,doi,title,abstract,doc_type,year,publisher,source_title,authors,author_affiliations,country_of_origin,label,topic_id,examples,label_name,timing
0,cpx_2a67e3b418670970bc9M66a31017816363,10.1002/qre.3284,The following article for this Special Issue w...,"<div data-language=""eng"" data-ev-field=""abstra...",Journal article (JA),2023,John Wiley and Sons Ltd,Quality and Reliability Engineering International,,,,"engineering, organizations, development",1,0,Management,1.0
1,cpx_M65fcd0f4185e0617d4dM7f841017816363,10.1002/qre.3248,A bibliography of the literature on process ca...,"<div data-language=""eng"" data-ev-field=""abstra...",Article in Press,2023,John Wiley and Sons Ltd,Quality and Reliability Engineering International,"{'AU': [{'ID': '1', 'EMAIL': 'yum3116@kaist.ac...","{'AF': [{'ID': '1', 'NAME': 'Emeritus, Departm...",,"electron, silicon, oxide",4,0,Physics of Failure,
2,cpx_M145e5f651850816b0fbM7db61017816355,10.1002/qre.3240,A misuse of the EWMA-type statistic in accepta...,"<div data-language=""eng"" data-ev-field=""abstra...",Journal article (JA),2023,John Wiley and Sons Ltd,Quality and Reliability Engineering International,"{'AU': [{'ID': '1', 'EMAIL': 'aaabdulhaq@yahoo...","{'AF': [{'ID': '1', 'NAME': 'Department of Sta...",,"charts, chart, shewhart",7,0,Quality Control,5.0
3,cpx_M5b332a91849c00712fM59f61017816355,10.1016/j.ress.2022.108890,Deep imbalanced domain adaptation for transfer...,"<div data-language=""eng"" data-ev-field=""abstra...",Journal article (JA),2023,Elsevier Ltd,Reliability Engineering and System Safety,"{'AU': [{'ID': '1', 'AFS': {'AFID': [1, 3]}, '...","{'AF': [{'ID': '1', 'NAME': 'School of Mechani...",,"rul, prediction, prognostic",9,2,Prognostics,6.0
4,cpx_M49bad31849c02f4ebM618c1017816355,10.1016/j.ress.2022.108953,QB-II for evaluating the reliability of binary...,"<div data-language=""eng"" data-ev-field=""abstra...",Journal article (JA),2023,Elsevier Ltd,Reliability Engineering and System Safety,"{'AU': [{'ID': '1', 'EMAIL': 'yeh@ieee.org', '...","{'AF': [{'ID': '1', 'NAME': 'Department of Ind...",,"cut, minimal, binary",3,1,Modeling,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30538,c84_bae4fef9c2b7fc84M60ab19817173212,,Elements of practical reliability-engineering ...,"Primary reliability concepts and terms, such a...",Journal article (JA),1963,,Electro-Technology,"{'AU': [{'NAME': 'Royal, E.L.'}]}",,,"engineering, organizations, development",1,1,Management,1.0
30539,c84_7bd950f98fe10199M513519817173212,,Curriculum for reliability engineering,Proposal for educational program is basically ...,Journal article (JA),1963,,Environmental Quarterly,"{'AU': [{'NAME': 'Jones, H.C.'}]}",,,"engineering, organizations, development",1,0,Management,1.0
30540,c84_1a72683fa3433e71aM7e9019817173212,,Reliability engineering -- Its application to ...,Engineering reliability program establishes st...,Journal article (JA),1963,,Agricultural Engineering,"{'AU': [{'NAME': 'Archer, R.C.'}]}",,,"engineering, organizations, development",1,2,Management,1.0
30541,c84_182d8dffa345009a8M750919817173212,,Reliability engineering review -- Effective ma...,In Space Systems at Philco Western Development...,Conference article (CA),1963,Institute of Electrical and Electronics Engine...,Proceedings of the 7th IEEE -- National Conven...,"{'AU': [{'NAME': 'DeVille, W.W.'}]}",,,"engineering, organizations, development",1,1,Management,1.0


In [119]:
def get_topics_reduced(model, n_reduced_topics):
    _ = model.hierarchical_topic_reduction(n_reduced_topics)
    topics = model.get_topics(reduced=True)
    topic_size = model.get_topic_sizes(reduced=True)
    topic_hierarchy = model.get_topic_hierarchy()
    topic_number = topics[2]
    topic_words = [', '.join(topics[0][x][0:3]) for x in range(len(topics[0]))]
    topic_sizes = topic_size[0]

    return pd.DataFrame({'n_reduced_topics': n_reduced_topics, 'topic_number': topic_number, 'topic_words': topic_words, 'topic_sizes': topic_sizes, 'children': topic_hierarchy}).sort_values('topic_sizes', ascending=False)

In [131]:
combined_df = pd.DataFrame()
for i in range(1, 12):
    combined_df = combined_df.append(get_topics_reduced(model, i))

combined_df.to_csv('topics.csv', index=False)
combined_df


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

Unnamed: 0,n_reduced_topics,topic_number,topic_words,topic_sizes,children
0,1,0,"virtualization, increasingly, decades",30491,"[141, 287, 271, 202, 10, 295, 42, 129, 139, 18..."
0,2,0,"hwma, censoring, lifetime",15262,"[210, 33, 169, 55, 148, 184, 199, 2, 56, 108, ..."
1,2,1,"software, virtualization, disasters",15229,"[141, 287, 271, 202, 10, 295, 42, 129, 139, 18..."
0,3,0,"software, virtualization, developers",13633,"[141, 287, 271, 202, 10, 295, 42, 129, 139, 18..."
1,3,1,"charts, estimators, multivariate",9842,"[73, 6, 164, 270, 18, 178, 132, 102, 173, 224,..."
...,...,...,...,...,...
6,11,6,"preventive, replacement, maintenance",2407,"[210, 33, 169, 55, 148, 184, 199, 2]"
7,11,7,"charts, chart, shewhart",2322,"[150, 58, 16, 89, 296, 59, 180, 234, 265, 81, ..."
8,11,8,"infrastructure, transportation, disruptions",2204,"[53, 228, 268, 34, 207, 126, 249, 61, 183, 146..."
9,11,9,"rul, prediction, prognostic",1854,"[105, 232, 144, 191, 83, 293, 167, 95, 124, 28..."


# How do deal with excluded top-level topics

In [None]:
model = Top2Vec.load('data/top2vec_model.mdl')
_ = model.hierarchical_topic_reduction(num_topics=11)

In [139]:
def get_sub_topics(topic_num, filename, target_num_topics=5, model=model, records_df=records_df):

    def get_topic_documents(model, records_df, topic_num, reduced=True):
        global corus
        n_docs = model.get_topic_sizes(reduced=reduced)[0][topic_num]

        topic_docs = model.search_documents_by_topic(
            topic_num, 
            n_docs, 
            return_documents=False, 
            reduced=reduced
            )
        return records_df[records_df['doc_id'].isin(list(topic_docs[1]))]

    recs = get_topic_documents(model, records_df, topic_num)
    corpus = recs['abstract'].tolist()
    document_ids = recs['doc_id'].tolist()
    sub_model = Top2Vec(corpus, speed='learn', document_ids=document_ids, workers=10, verbose=False)
    if sub_model.get_num_topics() > target_num_topics:
        sub_model.hierarchical_topic_reduction(num_topics=target_num_topics)

        reduced_df = pd.DataFrame({
            'sub_label': np.array([', '.join(x) for x in sub_model.topic_words_reduced[sub_model.doc_top_reduced,0:5]]),
            'sub_topic_id': sub_model.doc_top_reduced,
            'doc_id': sub_model.document_ids
        })
    else:
        reduced_df = pd.DataFrame({
            'sub_label': np.array([', '.join(x) for x in sub_model.topic_words[sub_model.doc_top,0:5]]),
            'sub_topic_id': sub_model.doc_top,
            'doc_id': sub_model.document_ids
        })

    reduced_df.sub_label = reduced_df.sub_label.astype('category')
    output_df = reduced_df.merge(records_df.set_index('doc_id'), on='doc_id')
    output_df.to_csv(filename, index=False)
    return (output_df.groupby(['sub_label'])
        .count()['doc_id']
        .sort_values(ascending=False)[0:target_num_topics]
        )

In [140]:
# Software

get_sub_topics(0, 'data/subtopics_software.csv', target_num_topics=5)

sub_label
formal, safety, elsevier, engineering, design            1102
apps, false, positives, attack, location                  803
test, suite, coverage, suites, cases                      782
releases, defect, predictions, proneness, release         772
service, rejuvenation, cloud, availability, computing     752
Name: doc_id, dtype: int64

In [141]:
# Infrastructure

get_sub_topics(8, 'data/subtopics_infrastructure.csv', target_num_topics=8)

sub_label
concrete, corrosion, reinforced, finite, seismic                   439
ieee, problem, scheme, problems, constraints                       285
resilience, recovery, functionality, infrastructure, disruptive    285
sea, accident, accidents, collision, ships                         274
power, energy, outage, electricity, wind                           272
risk, pipeline, safety, domino, risks                              240
cascading, node, network, robustness, rail                         236
attacker, defender, contest, attack, defense                       173
Name: doc_id, dtype: int64

In [155]:
# Nuclear

get_sub_topics(10, 'data/subtopics_nuclear.csv', target_num_topics=8)

sub_label
maintenance, paper, is, operational, test              431
limitations, risk, ensure, probabilistic, informed     281
simulation, monte, carlo, using, complex               257
escalation, domino, industrial, accidents, chemical    226
waste, geologic, repository, pa, radionuclide          179
real, tree, synthesis, fault, loops                    173
operator, errors, operators, human, error              171
bwr, core, pwr, damage, using                          135
Name: doc_id, dtype: int64

# Document pool validation

In [None]:
def get_reference_coverage(filename, title=1, records_df=records_df):
    with open('data/refs_zio.txt', 'r') as f:
        references = f.read().splitlines()

    titles = []
    for item in references:
        titles.append(item.split('.')[title].strip())

    matches = []

    def get_matches(titles, records_df=records_df):

        global matches # Shared across threads

        for title in titles:
            for record in records_df.title:
                if fuzz.ratio(title, record) > 90:
                    matches.append(title)
                    break

    titles_chunked = [titles[i:i+10] for i in range(0,len(titles),10)]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(get_matches, titles_chunked)

    print(len(matches))
    print(len(matches)/len(references))
        

In [181]:
# Load the reference text file as a list, one line per list item
with open('data/refs_zio.txt', 'r') as f:
    references = f.read().splitlines()

# Extract the text after the first period and before the second period in each item in references
titles = []
for item in references:
    titles.append(item.split('.')[1].strip())

matches = []

def get_matches(titles, records_df=records_df):

    global matches # Shared across threads

    for title in titles:
        for record in records_df.title:
            if fuzz.ratio(title, record) > 90:
                matches.append(title)
                break

titles_chunked = [titles[i:i+10] for i in range(0,len(titles),10)]
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(get_matches, titles_chunked)

print(len(matches))
print(len(matches)/170)

In [185]:
# Load the reference text file as a list, one line per list item
with open('data/refs_forcina.txt', 'r') as f:
    references = f.read().splitlines()

# Extract the text after the first period and before the second period in each item in references
titles = []
for item in references:
    titles.append(item.split('.')[1].strip())

# For each item in titles, check if there is at least one item in records_df.title that has a fuzz.ratio of at least 90.
matches = []

def get_matches(titles, records_df=records_df):

    global matches # Shared across threads

    for title in titles:
        for record in records_df.title:
            if fuzz.ratio(title, record) > 90:
                matches.append(title)
                break

titles_chunked = [titles[i:i+10] for i in range(0,len(titles),10)]
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(get_matches, titles_chunked)

print(len(matches))
print(len(matches)/119)

0
0.0


In [213]:
# Load the reference text file as a list, one line per list item
with open('data/refs_maurya.txt', 'r') as f:
    references = f.read().splitlines()

# Extract the text after the first period and before the second period in each item in references (second and third for wiley)
titles = []
for item in references:
    titles.append(item.split('.')[2].strip())

# For each item in titles, check if there is at least one item in records_df.title that has a fuzz.ratio of at least 90.
matches = []

def get_matches(titles, records_df=records_df):

    global matches # Shared across threads

    for title in titles:
        for record in records_df.title:
            if fuzz.ratio(title, record) > 90:
                matches.append(title)
                break

titles_chunked = [titles[i:i+10] for i in range(0,len(titles),10)]
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(get_matches, titles_chunked)

print(len(matches))
print(len(matches)/len(references))

22
0.22916666666666666


# Summary statistics

In [188]:
# Total number of papers
len(records_df)

30543

In [194]:
# Count number of records in records_df for each document type
records_df.groupby(['doc_type']).count()['doc_id'].sort_values(ascending=False)

doc_type
Journal article (JA)                   20634
Conference article (CA)                 7764
Book chapter (CH)                        772
Conference proceeding (CP)               161
Article in Press                         128
                                       ...  
Final Report, Nov. 1970 - Sep. 1974        1
Final Report, May 1971 - Mar. 1972         1
Final Report, Jun. 1973 - Aug. 1975        1
Final Report, Jun. 1965 - Jun. 1969        1
Thesis (Dr. Scient)                        1
Name: doc_id, Length: 130, dtype: int64

In [196]:
records_df.groupby(['year']).count()['doc_id']

year
1955       2
1957       2
1958       1
1961       9
1962       5
        ... 
2019    1498
2020    1709
2021    1705
2022    2141
2023     539
Name: doc_id, Length: 66, dtype: int64

In [197]:
records_df.groupby(['year']).count()['doc_id'].sort_values(ascending=False)

year
2022    2141
2020    1709
2021    1705
2016    1683
2017    1529
        ... 
1962       5
1965       3
1957       2
1955       2
1958       1
Name: doc_id, Length: 66, dtype: int64

In [199]:
records_df.groupby(['source_title']).count()['doc_id'].sort_values(ascending=False)

source_title
Reliability Engineering and System Safety                                                                                                                     6871
Quality and Reliability Engineering International                                                                                                             5829
Reliability Engineering &amp; System Safety                                                                                                                   4918
Springer Series in Reliability Engineering                                                                                                                     703
Reliability Engineering                                                                                                                                        296
                                                                                                                                                              ... 
IET Chenn

In [208]:
_ = model.hierarchical_topic_reduction(num_topics=20)

In [209]:
set(np.array([', '.join(x) for x in model.topic_words_reduced[model.doc_top_reduced,0:3]]))

{'bug, developers, bugs',
 'charts, chart, shewhart',
 'companies, customer, market',
 'disruptions, disruption, infrastructure',
 'electron, oxide, silicon',
 'engineering, book, topic',
 'experts, linguistic, opinions',
 'human, hra, cognitive',
 'nuclear, plants, reactor',
 'pipelines, ship, corrosion',
 'preventive, replacement, maintenance',
 'redundancy, rrap, solve',
 'repairable, markov, repair',
 'rul, prediction, prognostics',
 'surrogate, sobol, kriging',
 'trees, tree, boolean',
 'uml, language, checking',
 'vibration, rotor, rotating',
 'virtualization, virtualized, vm',
 'weibull, censoring, censored'}