#### Cleaning and Processing Abstract for Topic Modeling

In [1]:
import sys
import os
import pandas as pd
import pickle

import warnings
warnings.filterwarnings('ignore')

from gensim.corpora import Dictionary
from gensim.models import Phrases

import gensim
import nltk

from nltk import word_tokenize
from nltk.corpus import stopwords
import re


In [2]:
df = pd.read_csv("../Data/data_cleaned.csv")
df.head()

Unnamed: 0,title,abstract,title_abstract,journal,year
0,Psychiatric-Related Revisits to the Emergency ...,OBJECTIVES: Repeat visits (revisits) to Emerge...,Psychiatric-Related Revisits to the Emergency ...,Academic Emergency Medicine,2019
1,"Synergistic Bradycardia from Beta Blockers, Hy...",BACKGROUND: Bradycardia is a common vital sign...,"Synergistic Bradycardia from Beta Blockers, Hy...",Journal of Emergency Medicine,2019
2,Lack of fetal effect from adenosine administra...,Supraventricular Tachycardias are the most com...,Lack of fetal effect from adenosine administra...,American Journal of Emergency Medicine,2019
3,Comparing intranasal ketamine with intravenous...,BACKGROUND: Kidney stones are a fairly common ...,Comparing intranasal ketamine with intravenous...,American Journal of Emergency Medicine,2019
4,Involuntary patient length-of-stay at a suburb...,BACKGROUND: Patients who may be a danger to th...,Involuntary patient length-of-stay at a suburb...,American Journal of Emergency Medicine,2019


In [3]:
# function to clean text

stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', 'copyright', 'inc', 'mosby', 'mm', 'hg', 'american', 'college'])

def cleanText(text):
    
    text = re.split("(methods:)|(methodology:)", text, flags=re.IGNORECASE)[0]
    
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = re.sub(r'[^a-zA-z\s]', ' ', text)
    text = text.lower()
    text = [i for i in text.split() if i not in stop_words]
    text = ' '.join(text)
    return text

In [4]:
df['title_abstract'] = df['title_abstract'].astype('str').apply(cleanText)
df['decade'] = df['year']//10 * 10

In [5]:
df.head()

Unnamed: 0,title,abstract,title_abstract,journal,year,decade
0,Psychiatric-Related Revisits to the Emergency ...,OBJECTIVES: Repeat visits (revisits) to Emerge...,psychiatric related revisits emergency departm...,Academic Emergency Medicine,2019,2010
1,"Synergistic Bradycardia from Beta Blockers, Hy...",BACKGROUND: Bradycardia is a common vital sign...,synergistic bradycardia beta blockers hyperkal...,Journal of Emergency Medicine,2019,2010
2,Lack of fetal effect from adenosine administra...,Supraventricular Tachycardias are the most com...,lack fetal effect adenosine administration pre...,American Journal of Emergency Medicine,2019,2010
3,Comparing intranasal ketamine with intravenous...,BACKGROUND: Kidney stones are a fairly common ...,comparing intranasal ketamine intravenous fent...,American Journal of Emergency Medicine,2019,2010
4,Involuntary patient length-of-stay at a suburb...,BACKGROUND: Patients who may be a danger to th...,involuntary patient length stay suburban emerg...,American Journal of Emergency Medicine,2019,2010


In [6]:
df['title_abstract'][57]

'clinical management decisions adults prolonged acute cough frequency associated factors background uncomplicated episodes prolonged acute cough usually viral self limited despite evidence recommendations contrary often treated antibiotics'

In [7]:
documents = [nltk.word_tokenize(cc) for cc in df['title_abstract']]

In [8]:
print(documents[57])

['clinical', 'management', 'decisions', 'adults', 'prolonged', 'acute', 'cough', 'frequency', 'associated', 'factors', 'background', 'uncomplicated', 'episodes', 'prolonged', 'acute', 'cough', 'usually', 'viral', 'self', 'limited', 'despite', 'evidence', 'recommendations', 'contrary', 'often', 'treated', 'antibiotics']


In [9]:
# Compute bigrams.


# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(documents, min_count=20)
for idx in range(len(documents)):
    for token in bigram[documents[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            documents[idx].append(token)

In [14]:
# Remove rare and common tokens.

# Create a dictionary representation of the documents.
dictionary = Dictionary(documents)

print('Number of unique tokens prior to filter: %d' % len(dictionary))

# Filter out words that occur less than 25 documents, or more than 10% of the documents.
dictionary.filter_extremes(no_below=25, no_above=0.10)

print('Number of unique tokens after to filter: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens prior to filter: 30388
Number of unique tokens after to filter: 5753
Number of documents: 20528


In [15]:
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [16]:

data_list = [df, documents, dictionary, corpus]

with open('../data/data_list', 'wb') as fp:
    pickle.dump(data_list, fp)