### Function for text preprocessing 

In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re

In [2]:
# Module-level NLTK normalizers, created once and used by preprocess().
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [3]:
def preprocess(text, *, verbose=True):
    """Clean raw text and return it with short tokens and stopwords removed.

    The input is converted to ``str`` and lower-cased, then the literal
    ``{html}`` marker, anything matching ``<...>``, URLs starting with
    ``http`` and runs of digits are removed. The remainder is tokenized on
    word characters, and tokens of one or two characters or English
    stopwords are dropped.

    Stemmed and lemmatized variants of the surviving tokens are computed
    for inspection only: they are printed when ``verbose`` is true and are
    never part of the return value.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
        verbose: When true (the default), print the raw tokens, the stemmed
            tokens and the lemmatized tokens.

    Returns:
        The filtered tokens joined by single spaces.
    """
    # Coerce before lower-casing so non-string inputs do not raise.
    text = str(text).lower()

    # Drop the literal '{html}' marker.
    text = text.replace('{html}', "")

    # Strip HTML-style tags (non-greedy so each tag is matched separately).
    text = re.sub(r'<.*?>', '', text)

    # Remove URLs: 'http' followed by any run of non-whitespace characters.
    text = re.sub(r'http\S+', '', text)

    # Remove digits.
    text = re.sub(r'[0-9]+', '', text)

    # Tokenize on runs of word characters.
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    if verbose:
        print('Tokens in text:\n', tokens, '\n')

    # Load the stopword list once per call and use a set, instead of
    # re-reading the list and scanning it linearly for every token.
    stop_words = set(stopwords.words('english'))
    filtered_words = [
        w for w in tokens if len(w) > 2 and w not in stop_words
    ]

    if verbose:
        # Stemming and lemmatization are shown for comparison only.
        stem_words = [stemmer.stem(w) for w in filtered_words]
        print('Stemmed text:\n', stem_words, '\n')

        lemma_words = [lemmatizer.lemmatize(w) for w in filtered_words]
        print('Lemmatized text:\n', lemma_words, '\n')

    return " ".join(filtered_words)

In [4]:
# Sample sentence for the demo below. The spelling is kept exactly as-is
# because the recorded output depends on it.
text = (
    'We are building a startup team to solve hard challenges in healthcare '
    'like diagnosing disesases and discovering novel therapeutics.'
)

In [5]:
# Run the pipeline on the sample sentence, then show the returned string.
filtered_text = preprocess(text)
print('Filtered text:\n', filtered_text)

Tokens in text:
 ['we', 'are', 'building', 'a', 'startup', 'team', 'to', 'solve', 'hard', 'challenges', 'in', 'healthcare', 'like', 'diagnosing', 'disesases', 'and', 'discovering', 'novel', 'therapeutics'] 

Stemmed text:
 ['build', 'startup', 'team', 'solv', 'hard', 'challeng', 'healthcar', 'like', 'diagnos', 'disesas', 'discov', 'novel', 'therapeut'] 

Lemmatized text:
 ['building', 'startup', 'team', 'solve', 'hard', 'challenge', 'healthcare', 'like', 'diagnosing', 'disesases', 'discovering', 'novel', 'therapeutic'] 

Filtered text:
 building startup team solve hard challenges healthcare like diagnosing disesases discovering novel therapeutics
