# Stop-word removal and lemmatization of abstracts, producing a gensim corpus and dictionary.

# In [13]:
def lemmatizer(doc, min_df, max_df, dtm_flag):
    """Lemmatize abstracts, remove stop words and build a gensim corpus.

    Pipeline: lower-case the column names, keep the ``pubmedid`` and
    ``abstract`` columns, drop rows with missing values, collapse runs of
    whitespace, lemmatize with the ``en_core_sci_lg`` model, tokenize with
    ``gensim.utils.simple_preprocess``, drop stop words and short tokens,
    then build a gensim dictionary and bag-of-words corpus.

    Args:
        doc: pandas DataFrame containing ``pubmedid`` and ``abstract``
            columns (matched case-insensitively). It is not modified.
        min_df: Passed to ``Dictionary.filter_extremes`` as ``no_below``.
        max_df: Passed to ``Dictionary.filter_extremes`` as ``no_above``.
        dtm_flag: ``1`` returns a TF-IDF weighted corpus; any other value
            (including ``0``) returns the plain bag-of-words corpus.

    Returns:
        A tuple ``(corpus, dictionary, data_words_clean, pubmed_ids)`` where
        ``data_words_clean`` is one list of tokens per retained row and
        ``pubmed_ids`` is the ``pubmedid`` column of those same rows.
    """
    import re

    import gensim
    # NOTE(review): kept from the original import list; presumably the
    # scispaCy model expects the package to be importable -- confirm before
    # removing.
    import scispacy  # noqa: F401
    import en_core_sci_lg
    from gensim import corpora, models
    from spacy.lang.en.stop_words import STOP_WORDS
    from tqdm import tqdm

    # Domain-specific words to drop in addition to the generic stop lists.
    custom_stop = frozenset("et al www com patient study mg".split())

    ### Load Scispacy model
    nlp = en_core_sci_lg.load()

    ### Lower case column names and drop NA
    # rename() returns a new frame, so the caller's DataFrame keeps its
    # original column names (the original code renamed them in place).
    docs = doc.rename(columns=str.lower)
    docs = docs[['pubmedid', 'abstract']].dropna()

    ### Remove new line characters and extra space
    whitespace = re.compile(r'\s+')
    abstracts = [
        whitespace.sub(' ', abstract)
        for abstract in docs['abstract'].astype('str')
    ]

    ### Lemmatization
    # Runs on the whitespace-normalised text; the original computed the
    # cleaned text but then lemmatized the raw abstracts.
    data_lemma = [
        ' '.join(token.lemma_ for token in nlp(text))
        for text in tqdm(abstracts)
    ]

    ### Tokenization and stop-word removal
    # One combined set replaces the two sequential filtering passes. The
    # custom words are matched against lemmas here; previously they were
    # only flagged on the model vocabulary, which nothing consulted.
    stop_words = (
        set(gensim.parsing.preprocessing.STOPWORDS)
        | set(STOP_WORDS)
        | custom_stop
    )
    # The strip_punctuation() call was dropped: its return value was
    # discarded, so it never affected the tokens.
    data_words_clean = [
        [
            token
            for token in gensim.utils.simple_preprocess(text)
            if len(token) > 2 and token not in stop_words
        ]
        for text in tqdm(data_lemma)
    ]

    ### Create dictionary and corpus required for Topic Modeling
    dictionary = corpora.Dictionary(data_words_clean)
    dictionary.filter_extremes(no_below=min_df, no_above=max_df)
    corpus = [dictionary.doc2bow(words) for words in data_words_clean]

    if dtm_flag == 1:
        tfidf = models.TfidfModel(corpus, normalize=True)
        corpus = tfidf[corpus]

    return corpus, dictionary, data_words_clean, docs['pubmedid']