# Import libraries

In [1]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import torch
import re

# Append all the web-scraped files into a single dataframe

In [2]:
# Build the list of scrape periods (newest first) and, from it, the path of
# every csv and txt file to load.
# 2021 has two periods; each of the years 2020 down to 2011 has the same three.
timeperiods = ['21_05to09', '21_01to04'] + [
    str(year) + '_' + span
    for year in range(20, 10, -1)
    for span in ('08to12', '04to7', '01to03')
]

# Both kinds of file sit in the same directory and differ only in their suffix.
scrape_dir = r'../Graphical Search Method/webscraped_data/'
csvfiles = [scrape_dir + period + '_pubmed_scrape.csv' for period in timeperiods]
txtfiles = [scrape_dir + period + '_doilist_pubmed_scraped.txt' for period in timeperiods]

In [3]:
# Read every csv file and stack the parts into a single df (csvdf).
# The parts are collected first and concatenated once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and calling it inside the loop
# re-copied all previously accumulated rows on every iteration.
# Like append, concat keeps each part's own row labels (no ignore_index).
csv_parts = [pd.read_csv(csvfile) for csvfile in csvfiles]
# pd.concat raises on an empty list, so keep the old "empty frame" result then.
csvdf = pd.concat(csv_parts) if csv_parts else pd.DataFrame()
csvdf.shape[0]

228498

In [4]:
# Keep only the first row seen for each title in csvdf
repeated_title = csvdf.duplicated(subset=['title'], keep='first')
csvdf = csvdf[~repeated_title]
len(csvdf)

166885

In [5]:
# Read every txt file (fields separated by ';;') and stack the parts into a
# single df (txtdf).
# The parts are collected first and concatenated once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and calling it inside the loop
# re-copied all previously accumulated rows on every iteration.
# Like append, concat keeps each part's own row labels (no ignore_index).
txt_parts = [pd.read_table(txtfile, sep=';;', engine='python') for txtfile in txtfiles]
# pd.concat raises on an empty list, so keep the old "empty frame" result then.
txtdf = pd.concat(txt_parts) if txt_parts else pd.DataFrame()
txtdf.shape[0]

228475

In [6]:
# Keep only the first row seen for each title in txtdf
repeated_title = txtdf.duplicated(subset=['title'], keep='first')
txtdf = txtdf[~repeated_title]
len(txtdf)

166846

In [7]:
# Inner-join csvdf with txtdf on their shared 'title' column, so only titles
# present in both frames are kept.
df = csvdf.merge(txtdf, on='title', how='inner')

In [8]:
# save files just in case
# All three writes are currently disabled. Uncomment them to (re)create the
# files; the last one writes 'fulldf.pkl' from the merged frame df.
#np.savetxt('fulldoilist.txt', txtdf.values, delimiter=';;', fmt='%s', encoding='utf-8')
#csvdf.to_csv('fullscrapelist.csv', encoding='utf-8', index=None)
#df.to_pickle('fulldf.pkl')

In [9]:
# Load the saved full frame from 'fulldf.pkl' (the file must already exist).
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# that was produced by a trusted run.
fulldf = pd.read_pickle('fulldf.pkl')

In [10]:
# Display the loaded frame to inspect its columns and raw values.
fulldf

Unnamed: 0,journal,title,date,authorslist,free,publicationtype,abstract,citednumber,doi,link,citeddoilist
0,Cancer Immunol Immunother.,Immunotherapy in nonsmall-cell lung cancer: cu...,2021 May,"['Elena María Brozos-Vázquez#', 'Roberto Díaz-...",\n \n Springer\n ...,Review,Immunotherapy has been one of the great advanc...,2,10.1007/s00262-020-02752-z.,https://pubmed.ncbi.nlm.nih.gov/33113004/,[' 10.1007/s00262-020-02752-z. Epub 2020 Oct 2...
1,Lancet.,Lung cancer,2021 Aug 7,"['Alesha A Thai', 'Benjamin J Solomon', 'Lecia...",\n \n Elsevier Science...,Review,Lung cancer is one of the most frequently diag...,0,10.1016/S0140-6736(21)00312-3.,https://pubmed.ncbi.nlm.nih.gov/34273294/,[' 10.1177/0218492319881036. Epub 2019 Sep 30....
2,Cancer Immunol Immunother.,Comparative analysis of the tumor immune-micro...,2021 Jul,"['Seung Geun Song', 'Sehui Kim', 'Jaemoon Koh'...",\n \n Springer\n ...,Comparative Study,To evaluate the characteristics of the tumor i...,1,10.1007/s00262-020-02840-0.,https://pubmed.ncbi.nlm.nih.gov/33420630/,[' 10.1007/s00262-020-02840-0. Epub 2021 Jan 9...
3,MMW Fortschr Med.,[Lung cancer reduction strategies],2021 Jun,['Felix J F Herth'],\n \n Springer\n ...,Review,,0,10.1007/s15006-021-9958-8.,https://pubmed.ncbi.nlm.nih.gov/34086233/,"[' 10.3779/j.issn.1009-3419.2016.05.12.', ' 10..."
4,Expert Rev Pharmacoecon Outcomes Res.,Economic analyses of immune-checkpoint inhibit...,2021 Jun,"['Alain Vergnenegre', 'Christos Chouaid']",\n \n Taylor & Francis...,Review,Total lung-cancer-management costs are increas...,0,10.1080/14737167.2021.1863790.,https://pubmed.ncbi.nlm.nih.gov/33306411/,[' 10.1371/journal.pone.0238536. eCollection 2...
...,...,...,...,...,...,...,...,...,...,...,...
166408,Prog Mol Biol Transl Sci.,Cell delivery of therapeutic nanoparticles,2011,"['JoEllyn McMillan', 'Elena Batrakova', 'Howar...",Free PMC article,,Nanomedicine seeks to manufacture drugs and ot...,29,10.1016/B978-0-12-416020-0.00014-0.,https://pubmed.ncbi.nlm.nih.gov/22093229/,"[' 10.1016/B978-0-12-416020-0.00014-0.', ' 10...."
166409,Med Phys.,Comparison of scatter rejection and low-contra...,2011 Jan,"['Xinming Liu', 'Chris C Shaw', 'Chao-Jen Lai'...",Free PMC article,Comparative Study,To investigate and compare the scatter rejecti...,2,10.1118/1.3519903.,https://pubmed.ncbi.nlm.nih.gov/21361171/,"[' 10.1118/1.3519903.', ' 10.1088/0031-9155/59..."
166410,Chem Res Toxicol.,"Analysis of acrolein-derived 1,N2-propanodeoxy...",2011 Jan 14,"['Siyi Zhang', 'Silvia Balbo', 'Mingyao Wang',...",Free PMC article,,"(6R/S)-3-(2'-deoxyribos-1'-yl)-5,6,7,8-tetrahy...",21,10.1021/tx100321y.,https://pubmed.ncbi.nlm.nih.gov/21090699/,"[' 10.1021/tx100321y. Epub 2010 Nov 22.', ' 10..."
166411,Circulation.,Heart disease and stroke statistics--2011 upda...,2011 Feb 1,"['Véronique L Roger', 'Alan S Go', 'Donald M L...",Free PMC article,,"Each year, the American Heart Association (AHA...",1808,10.1161/CIR.0b013e3182009701.,https://pubmed.ncbi.nlm.nih.gov/21160056/,"[' 10.1161/CIR.0b013e3182456d46.', ' 10.1161/C..."


# Data Cleaning

In [11]:
# Create another copy for cleaning, so the cleaning steps never modify fulldf
cleandf = fulldf.copy()

In [12]:
### journal ###

# Print how many journal values are missing and how many distinct values the
# column holds, then keep only the rows that have a journal.
journal_missing = cleandf['journal'].isna()
print(journal_missing.sum())
print(cleandf['journal'].unique().size)

cleandf = cleandf[~journal_missing]

262
5451


In [13]:
### date ###

# Report the number of missing dates and keep only the rows that have one.
date_missing = cleandf['date'].isna()
print('na sum: ', date_missing.sum())
cleandf = cleandf[~date_missing]


# The scraped dates are not in one format, so the year and the month are
# picked out of the text separately and the date column is rebuilt from them.
# Year: the first entry of `years` found in the date text, or '' if none is.
years = [str(year) for year in range(2010, 2022)]
year_found = [cleandf['date'].str.contains(year) for year in years]
cleandf['year'] = np.select(year_found, years, '')

# Month: the first abbreviation of `months` found in the date text, or ''.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_found = [cleandf['date'].str.contains(month) for month in months]
cleandf['month'] = np.select(month_found, months, '')

# Rows without a recognisable month are counted and then removed.
month_absent = cleandf['month'] == ''
print('no months: ', cleandf[month_absent].shape[0])
cleandf = cleandf[~month_absent]

# Rebuild date as "<year> <month>" text and parse it into datetime format.
cleandf['date'] = pd.to_datetime(cleandf['year'] + ' ' + cleandf['month'])

# Remove the months that are not fully webscraped.
partial_months = pd.to_datetime(
    ['2021-09-01', '2021-10-01', '2021-11-01', '2021-12-01', '2010-12-01']
)
cleandf = cleandf[~cleandf['date'].isin(partial_months)]

# The helper year and month columns are no longer needed.
cleandf = cleandf.drop(columns=['year', 'month'])

na sum:  0
no months:  9976


In [14]:
# List the distinct years that remain in the rebuilt date column.
cleandf['date'].dt.year.unique()

array([2021, 2020, 2019, 2017, 2018, 2016, 2015, 2014, 2013, 2012, 2011],
      dtype=int64)

In [15]:
### authors ###

# Strip the list brackets from the scraped author text, keeping the quoted,
# comma-separated names.
# The pattern is a raw-string character class and regex=True is passed
# explicitly: the previous '\[' / '\]' patterns were invalid escape sequences
# in a normal string and relied on str.replace treating its pattern as a
# regex by default, which pandas warned about and later changed to literal
# matching (under which nothing would be removed).
cleandf['authorslist'] = cleandf['authorslist'].str.replace(r'[\[\]]', '', regex=True)

  cleandf['authorslist'] = cleandf['authorslist'].str.replace('\[', '').str.replace('\]', '')


In [16]:
### Free ###

# Print the number of rows and how many 'free' values are missing.
print(len(cleandf['free']))
print(cleandf['free'].isna().sum())

# Missing values are filled with lowercase 'free'; every row is then labelled
# 'Free' when its text contains 'Free' (case-sensitive) and 'Paid' otherwise.
# NOTE(review): the lowercase filler never matches 'Free', so rows whose value
# was missing end up labelled 'Paid' - confirm that this is the intended result.
free_text = cleandf['free'].fillna('free')
cleandf['free'] = np.where(free_text.str.contains('Free'), 'Free', 'Paid')

154916
4493


In [17]:
### Publication Type ###

# Show the distinct publication types present before filling the gaps.
pubtype = cleandf['publicationtype']
print(pubtype.unique())

# A missing type is labelled 'Article' (pubmed webscrape format)
cleandf['publicationtype'] = pubtype.where(pubtype.notna(), 'Article')

['Review' 'Comparative Study' nan 'Multicenter Study'
 'Observational Study' 'Comment' 'Randomized Controlled Trial'
 'Case Reports' 'Meta-Analysis' 'Clinical Trial'
 'Retraction of Publication' 'Published Erratum' 'Classical Article'
 'Practice Guideline' 'Editorial' 'Guideline' 'Controlled Clinical Trial']


In [18]:
### Abstract ###

# Report how many abstracts are missing, then drop those rows.
# The count used to be a bare expression in the middle of the cell, which a
# notebook never displays; it is now printed like in the other cleaning cells.
abstract_missing = cleandf['abstract'].isna()
print(abstract_missing.sum())
cleandf = cleandf[~abstract_missing]

In [19]:
### citednumber ###

# Number of missing values
print(cleandf['citednumber'].isnull().sum())

# dtype of the column before the conversion
print(cleandf['citednumber'].dtype)

# Convert every value to int; commas are removed from the text form first
# because int() cannot parse them.
cleandf['citednumber'] = [
    int(str(raw_count).replace(',', '')) for raw_count in cleandf['citednumber']
]

0
object


In [20]:
### doi ###

# remove na values
print(cleandf['doi'].isna().sum())
cleandf = cleandf[cleandf['doi'].notna()]

# Remove a single trailing period from each doi.
# endswith() replaces indexing the last character, so an empty string is left
# as is instead of raising IndexError. assign() returns a new frame rather
# than writing into the filtered slice above.
cleandf = cleandf.assign(
    doi=[doi[:-1] if doi.endswith('.') else doi for doi in cleandf['doi']]
)

# Drop duplicate dois, keeping the first occurrence. This now runs AFTER the
# period is stripped: deduplicating first treated '<doi>.' and '<doi>' as two
# different values and left both rows in the frame.
cleandf = cleandf.drop_duplicates(subset=['doi'], keep='first')

7041


In [21]:
### citeddoilist ###

def _parse_cited_dois(raw_entries, own_doi):
    """Return the list of cited DOIs found in one scraped citeddoilist text.

    raw_entries: the scraped text with brackets and quotes already removed;
        it is split on ',' into entries.
    own_doi: the DOI of the article the text belongs to.

    For each entry, only the text before the first '. ' is kept (this drops
    trailing text such as ' Epub 2020 Oct 2...') and one trailing period is
    removed. The result is kept when it is non-empty, starts with a digit,
    contains '/', and does not contain own_doi.
    """
    cited = []
    for entry in raw_entries.split(','):
        doi = entry.strip().split('. ')[0]
        # remove period at end if present
        if doi.endswith('.'):
            doi = doi[:-1]
        # An empty entry (no cited doi) is skipped explicitly; this replaces
        # the bare `except:` blocks that used to swallow the IndexError from
        # indexing an empty string, along with any other error.
        if doi and doi[0].isdigit() and '/' in doi and own_doi not in doi:
            cited.append(doi)
    return cited


# reformat citeddoilist (webscraping errors)
# Brackets and single quotes are stripped in one pass with an explicit
# raw-string regex: the previous '\[' / '\]' patterns were invalid escape
# sequences and relied on str.replace treating its pattern as a regex by
# default, which pandas warned about and later changed to literal matching.
cited_text = cleandf['citeddoilist'].str.replace(r"[\[\]']", '', regex=True)
cleandf['citeddoilist'] = [
    _parse_cited_dois(raw_entries, own_doi)
    for raw_entries, own_doi in zip(cited_text, cleandf['doi'])
]

  for i, b in zip(cleandf['citeddoilist'].str.replace('\[', '').str.replace('\]', '').str.replace('\'', ''), cleandf['doi']):


### Final cleaning check

In [22]:
# Count the missing values left in each column after cleaning.
cleandf.isna().sum()

journal            0
title              0
date               0
authorslist        0
free               0
publicationtype    0
abstract           0
citednumber        0
doi                0
link               0
citeddoilist       0
dtype: int64

In [23]:
# Show the dtype of each column after cleaning.
cleandf.dtypes

journal                    object
title                      object
date               datetime64[ns]
authorslist                object
free                       object
publicationtype            object
abstract                   object
citednumber                 int64
doi                        object
link                       object
citeddoilist               object
dtype: object

In [24]:
# Renumber the rows from 0, discarding the old row labels.
cleandf = cleandf.reset_index(drop=True)

In [25]:
# Display the cleaned frame.
cleandf

Unnamed: 0,journal,title,date,authorslist,free,publicationtype,abstract,citednumber,doi,link,citeddoilist
0,Cancer Immunol Immunother.,Immunotherapy in nonsmall-cell lung cancer: cu...,2021-05-01,"'Elena María Brozos-Vázquez#', 'Roberto Díaz-P...",Paid,Review,Immunotherapy has been one of the great advanc...,2,10.1007/s00262-020-02752-z,https://pubmed.ncbi.nlm.nih.gov/33113004/,"[10.3390/cancers13071675, 10.1177/175883592199..."
1,Lancet.,Lung cancer,2021-08-01,"'Alesha A Thai', 'Benjamin J Solomon', 'Lecia ...",Paid,Review,Lung cancer is one of the most frequently diag...,0,10.1016/S0140-6736(21)00312-3,https://pubmed.ncbi.nlm.nih.gov/34273294/,"[10.1177/0218492319881036, 10.1007/s12094-009-..."
2,Cancer Immunol Immunother.,Comparative analysis of the tumor immune-micro...,2021-07-01,"'Seung Geun Song', 'Sehui Kim', 'Jaemoon Koh',...",Paid,Comparative Study,To evaluate the characteristics of the tumor i...,1,10.1007/s00262-020-02840-0,https://pubmed.ncbi.nlm.nih.gov/33420630/,[10.3389/fimmu.2021.693709]
3,Expert Rev Pharmacoecon Outcomes Res.,Economic analyses of immune-checkpoint inhibit...,2021-06-01,"'Alain Vergnenegre', 'Christos Chouaid'",Paid,Review,Total lung-cancer-management costs are increas...,0,10.1080/14737167.2021.1863790,https://pubmed.ncbi.nlm.nih.gov/33306411/,"[10.1371/journal.pone.0238536, 10.1080/1472821..."
4,Ann Thorac Surg.,Organized Lung Cancer Screening Pilot: Informi...,2021-06-01,"'Gail E Darling', 'Martin C Tammemägi', 'Heidi...",Paid,Article,Lung cancer is the leading cause of cancer dea...,3,10.1016/j.athoracsur.2020.07.051,https://pubmed.ncbi.nlm.nih.gov/33039364/,"[10.3390/curroncol28030181, 10.1371/journal.po..."
...,...,...,...,...,...,...,...,...,...,...,...
141342,Bioorg Med Chem.,Synthesis of a covalent gemcitabine-(carbamate...,2011-01-01,"'C P Coyne', 'Toni Jones', 'Todd Pharr'",Paid,Article,Gemcitabine is a potent chemotherapeutic that ...,13,10.1016/j.bmc.2010.11.046,https://pubmed.ncbi.nlm.nih.gov/21169024/,"[10.1128/JVI.01360-16, 10.2147/DDDT.S102075, 1..."
141343,Med Phys.,Comparison of scatter rejection and low-contra...,2011-01-01,"'Xinming Liu', 'Chris C Shaw', 'Chao-Jen Lai',...",Free,Comparative Study,To investigate and compare the scatter rejecti...,2,10.1118/1.3519903,https://pubmed.ncbi.nlm.nih.gov/21361171/,"[10.1088/0031-9155/59/5/1305, 10.1118/1.3659709]"
141344,Chem Res Toxicol.,"Analysis of acrolein-derived 1,N2-propanodeoxy...",2011-01-01,"'Siyi Zhang', 'Silvia Balbo', 'Mingyao Wang', ...",Free,Article,"(6R/S)-3-(2'-deoxyribos-1'-yl)-5,6,7,8-tetrahy...",21,10.1021/tx100321y,https://pubmed.ncbi.nlm.nih.gov/21090699/,"[10.1021/acs.chemrestox.0c00265, 10.1021/acs.c..."
141345,Circulation.,Heart disease and stroke statistics--2011 upda...,2011-02-01,"'Véronique L Roger', 'Alan S Go', 'Donald M Ll...",Free,Article,"Each year, the American Heart Association (AHA...",1808,10.1161/CIR.0b013e3182009701,https://pubmed.ncbi.nlm.nih.gov/21160056/,"[10.1161/CIR.0b013e3182456d46, 10.1161/CIRCULA..."


In [26]:
# Save cleaned df version
# The write is currently disabled; uncomment it to (re)create 'clean2df.pkl'
# from cleandf.
#cleandf.to_pickle('clean2df.pkl')

### Final DataFrame for Plotly Dash

In [27]:
# Note: title and abstract aren't NLP cleaned here.
# Instead, they will be cleaned with the BERT tokenizer.
# The frame is loaded from 'clean2df.pkl', which must already exist on disk.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# that was produced by a trusted run.
plotlydash = pd.read_pickle('clean2df.pkl')

In [28]:
# Renumber the rows from 0 and leave out the two columns that the
# visualization does not use, to save memory.
plotlydash = (
    plotlydash
    .reset_index(drop=True)
    .drop(columns=['citeddoilist', 'authorslist'])
)

In [29]:
# Get the 'all-mpnet-base-v2' sentence transformer from SBERT; it is used
# below to encode text into embedding vectors.
model = SentenceTransformer('all-mpnet-base-v2')

In [31]:
# Sentence Embedding # Warning: this takes a long time
# The code below is disabled by wrapping it in a string literal, so none of it
# runs; the bare string is simply the value of the cell. When enabled, it
# splits the titles into two slices and encodes the first slice one title at
# a time, appending each embedding to sentence_embeddings.
'''

sents1 = plotlydash['title'].iloc[0:70673]
sents2 = plotlydash['title'].iloc[70673:]

sentence_embeddings = np.array([]).reshape(0, 768) # BERT vectorizes to 768 dimensions
for i in sents1:
    embedding = model.encode([i])
    sentence_embeddings = np.concatenate([sentence_embeddings, embedding])
'''

"\n\nsents1 = plotlydash['title'].iloc[0:70673]\nsents2 = plotlydash['title'].iloc[70673:]\n\nsentence_embeddings = np.array([]).reshape(0, 768) # BERT vectorizes to 768 dimensions\nfor i in sents1:\n    embedding = model.encode([i])\n    sentence_embeddings = np.concatenate([sentence_embeddings, embedding])\n"

In [32]:
# Disabled (kept as a string literal, so it does not run): encodes the second
# slice of titles, sents2, one at a time and appends each embedding to
# sentence_embeddings.
'''for i in sents2:
    embedding = model.encode([i])
    sentence_embeddings = np.concatenate([sentence_embeddings, embedding])
'''

'for i in sents2:\n    embedding = model.encode([i])\n    sentence_embeddings = np.concatenate([sentence_embeddings, embedding])\n'

In [33]:
# Disabled inspection of the shape of the embedding array:
#sentence_embeddings.shape

In [34]:
# Save sentence embeddings to file
# Disabled (kept as a string literal, so it does not run). np.save appends
# '.npy' to the name, so enabling it writes 'sentence_embeddings.npy'.
'''
np.save('sentence_embeddings', sentence_embeddings)
'''

"\nnp.save('sentence_embeddings', sentence_embeddings)\n"

In [35]:
# Load the cached title embeddings. If the cache file does not exist yet,
# build it instead of failing, so the notebook also runs from a fresh kernel
# without manually re-enabling the disabled embedding cells.
try:
    sentence_embeddings = np.load('sentence_embeddings.npy')
except FileNotFoundError:
    # Warning: this takes a long time.
    # All titles are passed to model.encode as one list instead of being
    # encoded one by one and concatenated onto a growing array.
    # float64 matches the array that the disabled embedding code accumulates.
    sentence_embeddings = np.asarray(
        model.encode(plotlydash['title'].tolist()), dtype=np.float64
    )
    # np.save appends '.npy', producing the file that is loaded above.
    np.save('sentence_embeddings', sentence_embeddings)

In [36]:
# calc similarities by finding cosine angle between two vectorized sentences
# the smaller the angle, the more relevant, the higher the score
def sbert(articlenumber, sbertterm):
    """Return the cosine similarities of one article embedding.

    articlenumber: row position of the article in sentence_embeddings.
    sbertterm: 1-D embedding vector of the term to compare against.

    Returns the first row of the 2x2 similarity matrix of [article, term]:
    element 0 is the article compared with itself and element 1 is the
    article compared with the term.
    """
    score = cosine_similarity([sentence_embeddings[articlenumber], sbertterm])[0]
    return score


# get similarity scores between one sentence/article/term and ALL the articles
def sbertscorer(userinput):
    """Score every article against userinput and store the result.

    userinput: a single text string; it is encoded with the global model.

    Writes the cosine similarity between each row of sentence_embeddings and
    the encoded input into plotlydash['scores']. Returns None.
    """
    # One row vector, so that a single cosine_similarity call can compare it
    # with all article embeddings at once; calling sbert() once per article
    # cost one sklearn call per row.
    term = np.asarray(model.encode(userinput)).reshape(1, -1)
    plotlydash['scores'] = cosine_similarity(sentence_embeddings, term)[:, 0]


# Set default scores to 'lung cancer'
userinputterm = 'lung cancer'

In [37]:
# Score every article against the default term; this fills plotlydash['scores'].
sbertscorer(userinputterm)

In [38]:
# Keep only the listed columns, in this order, for the final frame.
plotlydash = plotlydash[['title', 'abstract', 'date', 'publicationtype', 'free', 'citednumber', 'scores', 'link']]

In [39]:
# Save final df for plotly dash
# The row index is written as well, since index=False is not passed.
plotlydash.to_csv('plotlydash.csv')

In [40]:
# Display the final frame, including the new scores column.
plotlydash

Unnamed: 0,title,abstract,date,publicationtype,free,citednumber,scores,link
0,Immunotherapy in nonsmall-cell lung cancer: cu...,Immunotherapy has been one of the great advanc...,2021-05-01,Review,Paid,2,0.378918,https://pubmed.ncbi.nlm.nih.gov/33113004/
1,Lung cancer,Lung cancer is one of the most frequently diag...,2021-08-01,Review,Paid,0,1.000000,https://pubmed.ncbi.nlm.nih.gov/34273294/
2,Comparative analysis of the tumor immune-micro...,To evaluate the characteristics of the tumor i...,2021-07-01,Comparative Study,Paid,1,0.301354,https://pubmed.ncbi.nlm.nih.gov/33420630/
3,Economic analyses of immune-checkpoint inhibit...,Total lung-cancer-management costs are increas...,2021-06-01,Review,Paid,0,0.361602,https://pubmed.ncbi.nlm.nih.gov/33306411/
4,Organized Lung Cancer Screening Pilot: Informi...,Lung cancer is the leading cause of cancer dea...,2021-06-01,Article,Paid,3,0.501801,https://pubmed.ncbi.nlm.nih.gov/33039364/
...,...,...,...,...,...,...,...,...
141342,Synthesis of a covalent gemcitabine-(carbamate...,Gemcitabine is a potent chemotherapeutic that ...,2011-01-01,Article,Paid,13,0.122529,https://pubmed.ncbi.nlm.nih.gov/21169024/
141343,Comparison of scatter rejection and low-contra...,To investigate and compare the scatter rejecti...,2011-01-01,Comparative Study,Free,2,0.142462,https://pubmed.ncbi.nlm.nih.gov/21361171/
141344,"Analysis of acrolein-derived 1,N2-propanodeoxy...","(6R/S)-3-(2'-deoxyribos-1'-yl)-5,6,7,8-tetrahy...",2011-01-01,Article,Free,21,0.275874,https://pubmed.ncbi.nlm.nih.gov/21090699/
141345,Heart disease and stroke statistics--2011 upda...,"Each year, the American Heart Association (AHA...",2011-02-01,Article,Free,1808,0.260897,https://pubmed.ncbi.nlm.nih.gov/21160056/
