#### Cleaning and Processing Abstract for Topic Modeling

In [1]:
import sys
import os
import pandas as pd
import pickle

import warnings
warnings.filterwarnings('ignore')

from gensim.corpora import Dictionary
from gensim.models import Phrases

import gensim
import nltk

from nltk import word_tokenize
from nltk.corpus import stopwords
import re

import spacy


In [2]:
df = pd.read_csv("../Data/data_cleaned.csv")
df.head()

Unnamed: 0,title,abstract,title_abstract,journal,year
0,Psychiatric-Related Revisits to the Emergency ...,OBJECTIVES: Repeat visits (revisits) to Emerge...,Psychiatric-Related Revisits to the Emergency ...,Academic Emergency Medicine,2019
1,"Synergistic Bradycardia from Beta Blockers, Hy...",BACKGROUND: Bradycardia is a common vital sign...,"Synergistic Bradycardia from Beta Blockers, Hy...",Journal of Emergency Medicine,2019
2,Lack of fetal effect from adenosine administra...,Supraventricular Tachycardias are the most com...,Lack of fetal effect from adenosine administra...,American Journal of Emergency Medicine,2019
3,Comparing intranasal ketamine with intravenous...,BACKGROUND: Kidney stones are a fairly common ...,Comparing intranasal ketamine with intravenous...,American Journal of Emergency Medicine,2019
4,Involuntary patient length-of-stay at a suburb...,BACKGROUND: Patients who may be a danger to th...,Involuntary patient length-of-stay at a suburb...,American Journal of Emergency Medicine,2019


In [3]:
# function to clean text

def cleanText(text):
    
    text = re.split("(methods:)|(methodology:)", text, flags=re.IGNORECASE)[0]
    text = text.lower()
    return text

df['title_abstract'] = df['title_abstract'].astype('str').apply(cleanText)
df.head()


Unnamed: 0,title,abstract,title_abstract,journal,year
0,Psychiatric-Related Revisits to the Emergency ...,OBJECTIVES: Repeat visits (revisits) to Emerge...,psychiatric-related revisits to the emergency ...,Academic Emergency Medicine,2019
1,"Synergistic Bradycardia from Beta Blockers, Hy...",BACKGROUND: Bradycardia is a common vital sign...,"synergistic bradycardia from beta blockers, hy...",Journal of Emergency Medicine,2019
2,Lack of fetal effect from adenosine administra...,Supraventricular Tachycardias are the most com...,lack of fetal effect from adenosine administra...,American Journal of Emergency Medicine,2019
3,Comparing intranasal ketamine with intravenous...,BACKGROUND: Kidney stones are a fairly common ...,comparing intranasal ketamine with intravenous...,American Journal of Emergency Medicine,2019
4,Involuntary patient length-of-stay at a suburb...,BACKGROUND: Patients who may be a danger to th...,involuntary patient length-of-stay at a suburb...,American Journal of Emergency Medicine,2019


In [4]:
import scispacy
nlp = spacy.load("en_core_sci_md") # loading the language model 

In [5]:
def clean_nlp(text):  # clean up your text and generate list of words for each document. 
    removal=['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']
    text_out = []
    doc= nlp(text)
    for token in doc:
        if token.is_stop == False and token.is_alpha and len(token)>2 and token.pos_ not in removal:
            lemma = token.lemma_
            text_out.append(lemma)
    return text_out
documents = df.title_abstract.apply(lambda x:clean_nlp(x))


In [6]:
df.head()

Unnamed: 0,title,abstract,title_abstract,journal,year
0,Psychiatric-Related Revisits to the Emergency ...,OBJECTIVES: Repeat visits (revisits) to Emerge...,psychiatric-related revisits to the emergency ...,Academic Emergency Medicine,2019
1,"Synergistic Bradycardia from Beta Blockers, Hy...",BACKGROUND: Bradycardia is a common vital sign...,"synergistic bradycardia from beta blockers, hy...",Journal of Emergency Medicine,2019
2,Lack of fetal effect from adenosine administra...,Supraventricular Tachycardias are the most com...,lack of fetal effect from adenosine administra...,American Journal of Emergency Medicine,2019
3,Comparing intranasal ketamine with intravenous...,BACKGROUND: Kidney stones are a fairly common ...,comparing intranasal ketamine with intravenous...,American Journal of Emergency Medicine,2019
4,Involuntary patient length-of-stay at a suburb...,BACKGROUND: Patients who may be a danger to th...,involuntary patient length-of-stay at a suburb...,American Journal of Emergency Medicine,2019


In [7]:
print(documents[57])

['clinical', 'management', 'decision', 'adult', 'prolonged', 'acute', 'cough', 'frequency', 'associate', 'uncomplicated', 'episode', 'prolonged', 'acute', 'cough', 'viral', 'evidence', 'recommendation', 'contrary', 'treat', 'antibiotic']


In [8]:
# Compute bigrams.


# Add bigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(documents, min_count=20)
for idx in range(len(documents)):
    for token in bigram[documents[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            documents[idx].append(token)
            
trigram = Phrases(documents, min_count=20)
for idx in range(len(documents)):
    for token in trigram[documents[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            documents[idx].append(token)

In [10]:
# Remove rare and common tokens.

# Create a dictionary representation of the documents.
dictionary = Dictionary(documents)

print('Number of unique tokens prior to filter: %d' % len(dictionary))

# Filter out words that occur less than 25 documents, or more than 10% of the documents.
dictionary.filter_extremes(no_below=25, no_above=0.10)

print('Number of unique tokens after to filter: %d' % len(dictionary))


Number of unique tokens prior to filter: 22791
Number of unique tokens after to filter: 4181


In [11]:
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [12]:

data_list = [df, documents, dictionary, corpus]

with open('../data/data_list_spacy', 'wb') as fp:
    pickle.dump(data_list, fp)