#### Cleaning and Processing Abstracts for Topic Modeling

In [1]:
import sys
import os
import pandas as pd
import pickle

import warnings
warnings.filterwarnings('ignore')

from gensim.corpora import Dictionary
from gensim.models import Phrases

import gensim
import nltk

from nltk import word_tokenize
from nltk.corpus import stopwords
import re

import spacy


In [106]:
# Read the pre-cleaned article table and preview its first rows.
cleaned_csv = "../Data/data_cleaned.csv"
df = pd.read_csv(cleaned_csv)
df.head()

Unnamed: 0,title,abstract,title_abstract,journal,year
0,Psychiatric-Related Revisits to the Emergency ...,OBJECTIVES: Repeat visits (revisits) to Emerge...,Psychiatric-Related Revisits to the Emergency ...,Academic Emergency Medicine,2019
1,"Synergistic Bradycardia from Beta Blockers, Hy...",BACKGROUND: Bradycardia is a common vital sign...,"Synergistic Bradycardia from Beta Blockers, Hy...",Journal of Emergency Medicine,2019
2,Lack of fetal effect from adenosine administra...,Supraventricular Tachycardias are the most com...,Lack of fetal effect from adenosine administra...,American Journal of Emergency Medicine,2019
3,Comparing intranasal ketamine with intravenous...,BACKGROUND: Kidney stones are a fairly common ...,Comparing intranasal ketamine with intravenous...,American Journal of Emergency Medicine,2019
4,Involuntary patient length-of-stay at a suburb...,BACKGROUND: Patients who may be a danger to th...,Involuntary patient length-of-stay at a suburb...,American Journal of Emergency Medicine,2019


In [107]:
# function to clean text

def initial_cleanText(text, stop_words=None):
    """Lower-case a title/abstract, keep the part before the methods section,
    and drop stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Base stop-word list. When omitted, NLTK's English stop words are used.
        The publisher boilerplate terms listed below are always added on top;
        the caller's iterable is copied, never mutated.

    Returns
    -------
    str
        The lower-cased text preceding the first "methods:" / "methodology:"
        heading, with stop words removed, whitespace collapsed to single
        spaces, and hyphens replaced by spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    # Each term needs its own comma: adjacent string literals are concatenated,
    # so 'rights' 'reserved' would silently become the single word 'rightsreserved'.
    stop_words.update(['elsevier', 'copyright', 'rights', 'reserved', 'ed'])
    text = text.lower()

    # Keep only what precedes the methods heading; everything from the
    # heading onward is discarded.
    text = re.split("(methods:)|(methodology:)", text, flags=re.IGNORECASE)[0]
    # Tokens are compared as whitespace-separated strings, so a word with
    # attached punctuation (e.g. "reserved.") is not treated as a stop word.
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Hyphens are replaced after stop-word filtering, so the parts of a
    # hyphenated compound are kept even when they are stop words themselves.
    return text.replace('-', ' ')

# Replace the combined title/abstract column with its cleaned form.
title_abstract_str = df['title_abstract'].astype('str')
df['title_abstract'] = title_abstract_str.apply(initial_cleanText)
df.head()


Unnamed: 0,title,abstract,title_abstract,journal,year
0,Psychiatric-Related Revisits to the Emergency ...,OBJECTIVES: Repeat visits (revisits) to Emerge...,psychiatric related revisits emergency departm...,Academic Emergency Medicine,2019
1,"Synergistic Bradycardia from Beta Blockers, Hy...",BACKGROUND: Bradycardia is a common vital sign...,"synergistic bradycardia beta blockers, hyperka...",Journal of Emergency Medicine,2019
2,Lack of fetal effect from adenosine administra...,Supraventricular Tachycardias are the most com...,lack fetal effect adenosine administration pre...,American Journal of Emergency Medicine,2019
3,Comparing intranasal ketamine with intravenous...,BACKGROUND: Kidney stones are a fairly common ...,comparing intranasal ketamine intravenous fent...,American Journal of Emergency Medicine,2019
4,Involuntary patient length-of-stay at a suburb...,BACKGROUND: Patients who may be a danger to th...,involuntary patient length of stay suburban em...,American Journal of Emergency Medicine,2019


In [108]:
# Pull out the record labelled 0 for a quick visual check of the cleaning.
test1 = df.loc[0, 'title_abstract']
test1

'psychiatric related revisits emergency department following rapid expansion community mental health services. objectives: repeat visits (revisits) emergency departments (eds) psychiatric care reflect poor continuity care impose high financial cost. test whether rapid expansion community health centers (chcs) which provide regional, low cost primary care correspond fewer repeat psychiatric related visits (predvs).'

In [105]:
# NOTE(review): this import lives outside the notebook's import cell; consider
# moving it to the top with the other imports.
import scispacy
# Load the "en_core_sci_md" pipeline once; `nlp` is the object the cleaning
# functions call on each document.
nlp = spacy.load("en_core_sci_md") # presumably a scispaCy biomedical model -- TODO confirm

In [109]:
def clean_nlp(text):
    """Run `text` through the module-level `nlp` pipeline and return its lemmas.

    A token contributes its lemma only when it is not a stop word, is purely
    alphabetic, is longer than two characters, and its part-of-speech tag is
    not one of the excluded tags below.

    Returns a list of lemma strings in document order.
    """
    excluded_pos = ['PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']
    return [
        tok.lemma_
        for tok in nlp(text)
        if not tok.is_stop
        and tok.is_alpha
        and len(tok) > 2
        and tok.pos_ not in excluded_pos
    ]
# One list of lemmas per row of the cleaned title/abstract column.
documents = df['title_abstract'].apply(clean_nlp)


In [110]:
def clean_nlp2(text):
    """Return the lemmas of `text` as one space-separated string.

    Applies exactly the same token filtering as `clean_nlp` by delegating to
    it, so the two functions cannot drift apart; only the return shape differs
    (a single string here, a list of lemmas there).
    """
    return ' '.join(clean_nlp(text))
# `documents` already holds the lemma list for every row of
# df.title_abstract, so join those lists instead of running the spaCy
# pipeline over the whole corpus a second time. The result is what
# clean_nlp2 would return for each row.
df['bert'] = documents.map(' '.join)

In [111]:
# Preview the frame, which now carries the lemmatised `bert` column.
df.head()

Unnamed: 0,title,abstract,title_abstract,journal,year,bert
0,Psychiatric-Related Revisits to the Emergency ...,OBJECTIVES: Repeat visits (revisits) to Emerge...,psychiatric related revisits emergency departm...,Academic Emergency Medicine,2019,psychiatric related revisit emergency departme...
1,"Synergistic Bradycardia from Beta Blockers, Hy...",BACKGROUND: Bradycardia is a common vital sign...,"synergistic bradycardia beta blockers, hyperka...",Journal of Emergency Medicine,2019,synergistic bradycardia beta blocker hyperkale...
2,Lack of fetal effect from adenosine administra...,Supraventricular Tachycardias are the most com...,lack fetal effect adenosine administration pre...,American Journal of Emergency Medicine,2019,lack fetal effect adenosine administration pre...
3,Comparing intranasal ketamine with intravenous...,BACKGROUND: Kidney stones are a fairly common ...,comparing intranasal ketamine intravenous fent...,American Journal of Emergency Medicine,2019,compare intranasal ketamine intravenous fentan...
4,Involuntary patient length-of-stay at a suburb...,BACKGROUND: Patients who may be a danger to th...,involuntary patient length of stay suburban em...,American Journal of Emergency Medicine,2019,involuntary patient length stay suburban emerg...


In [113]:
# Show the cleaned text of the first record next to its lemma list.
first_row = 0
print(df.loc[first_row, 'title_abstract'])
print(documents.loc[first_row])

psychiatric related revisits emergency department following rapid expansion community mental health services. objectives: repeat visits (revisits) emergency departments (eds) psychiatric care reflect poor continuity care impose high financial cost. test whether rapid expansion community health centers (chcs) which provide regional, low cost primary care correspond fewer repeat psychiatric related visits (predvs).
['psychiatric', 'related', 'revisit', 'emergency', 'department', 'follow', 'rapid', 'expansion', 'community', 'mental', 'health', 'service', 'objective', 'repeat', 'visit', 'revisit', 'emergency', 'department', 'ed', 'psychiatric', 'care', 'reflect', 'poor', 'continuity', 'care', 'impose', 'high', 'financial', 'cost', 'test', 'rapid', 'expansion', 'community', 'health', 'center', 'chcs', 'provide', 'regional', 'low', 'cost', 'primary', 'care', 'correspond', 'few', 'repeat', 'psychiatric', 'related', 'visit', 'predvs']


In [126]:
# Remove rare and common tokens.

# Document-frequency thresholds used for the vocabulary filter.
NO_BELOW = 20     # keep tokens that appear in at least this many documents
NO_ABOVE = 0.05   # ...and in no more than this fraction of all documents

# Create a dictionary representation of the documents.
dictionary = Dictionary(documents)

print('Number of unique tokens prior to filter: %d' % len(dictionary))

# Filter out words that occur in fewer than 20 documents, or in more than 5% of the documents.
dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE)


print('Number of unique tokens after to filter: %d' % len(dictionary))


Number of unique tokens prior to filter: 23001
Number of unique tokens after to filter: 3845


In [127]:
def filter_words(text, dictionary):
    """Keep only the words of `text` that are present in `dictionary`.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.
    dictionary : mapping
        Either an object exposing a `token2id` mapping keyed by token (as a
        gensim Dictionary does) or any mapping whose values are the allowed
        tokens.

    Returns
    -------
    str
        The surviving tokens, in their original order, joined by single spaces.
    """
    # `word in dictionary.values()` scans the whole vocabulary for every word.
    # Resolve a hash-based lookup once per call instead.
    vocab = getattr(dictionary, 'token2id', None)
    if vocab is None:
        vocab = set(dictionary.values())
    return ' '.join(word for word in text.split() if word in vocab)

In [128]:
# Restrict each lemmatised document to the tokens kept in the filtered dictionary.
df['bert_filtered'] = df['bert'].apply(filter_words, args=(dictionary,))

In [129]:
# Print the first record at each processing stage: cleaned, lemmatised, filtered.
for column in ('title_abstract', 'bert', 'bert_filtered'):
    print(df[column][0])

psychiatric related revisits emergency department following rapid expansion community mental health services. objectives: repeat visits (revisits) emergency departments (eds) psychiatric care reflect poor continuity care impose high financial cost. test whether rapid expansion community health centers (chcs) which provide regional, low cost primary care correspond fewer repeat psychiatric related visits (predvs).
psychiatric related revisit emergency department follow rapid expansion community mental health service objective repeat visit revisit emergency department ed psychiatric care reflect poor continuity care impose high financial cost test rapid expansion community health center chcs provide regional low cost primary care correspond few repeat psychiatric related visit predvs
psychiatric revisit rapid expansion community mental repeat revisit ed psychiatric reflect poor continuity financial cost rapid expansion community regional cost primary correspond few repeat psychiatric


In [130]:
# Bag-of-words representation of every document under the filtered dictionary.
corpus = list(map(dictionary.doc2bow, documents))

In [131]:

# Bundle the frame, token lists, dictionary and corpus into one pickle.
# NOTE(review): the input CSV is read from "../Data/" while this writes to
# "../data/"; on a case-sensitive filesystem these are different directories
# -- confirm which one is intended.
output_path = '../data/data_list_bert_filt_20_5'
data_list = [df, documents, dictionary, corpus]

with open(output_path, 'wb') as fp:
    pickle.dump(data_list, fp)