In [1]:
import sys
import itertools
import re
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin

import gensim
from gensim.models.phrases import Phraser, Phrases

import spacy
import logging

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
health_grants_df = pd.read_csv('../data/health_research_grants_2018_04_23_1436.csv')

In [4]:
health_grants_df.head(2)

Unnamed: 0,agency_name,lad13nm,participant_name,project_start_date,project_title,public_description,dataset_id,project_id,paragraph_vectors,year,grants_funding,project_start_datetime
0,Health Resources and Services Administration,,,"Apr 03, 2013",Ryan White HIV/AIDS Program Part D Grants for ...,This announcement solicits applications for fi...,grants_gov,HRSA-13-264,[-0.35462067 -0.41949466 -0.27364257 -0.659733...,2013.0,,2013-04-03
1,Health Resources and Services Administration,,,"Jan 30, 2012",Ryan White HIV/AIDS Program Part D Grants for ...,The purpose of this funding opportunity announ...,grants_gov,HRSA-12-073,[-0.58284324 -0.42862821 -0.52920008 -0.820197...,2012.0,70000000.0,2012-01-30


## Preprocessing

In [5]:
def normalise_text(text):
    # remove newline characters and lower case
    text = text.replace('\n', '').replace('\r', '').strip().lower()
    # replace URLs with single identifier
    text = re.sub(r"[\(]?http\S+", 'URL', text)
    text = re.sub(r"[\(]?www\S+", 'URL', text)
    # replace digits with #
    #     text = re.sub(r"\d", '#', text)
    return text
    

In [6]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [6]:
descriptions = health_grants_df['public_description'].values[:10]

**Two corpora - one for exploration and one for topic modelling**
- Remove stop words
- Remove high and low frequency words
- Remove punctuation

**Exploration**
- Remove all digits

**Topic Modelling**
- Part of speech tagging
- Remove punctuation
- Transform into sub-corpora of - tokens, lemmas, part of speech

**For both after preprocessing**
- Generate ngram transformed corpus
- Generate phrase transformed corpus
- Generate combined corpus?

In [661]:
def save_parsed_corpus(docs, directory, prefix, length, suffix='', top_n=None):
    
    order = len(str(length)) + 1
    # create generic path
    if directory[-1] == os.sep:
        generic_path = directory + prefix + '_{}' + suffix
    else:
        generic_path = directory + os.sep + prefix + '_{}' + suffix
        
    for i, doc in enumerate(docs):
        n = str(i).rjust(order, '0')        
        doc.to_disk(generic_path.format(n))

In [497]:
def load_parsed_corpus(directory, prefix, length, model, suffix='', top_n=None):
    docs = []
    order = len(str(length)) + 1
    # create generic path
    if directory[-1] == os.sep:
        generic_path = directory + prefix + '_{}' + suffix
    else:
        generic_path = directory + os.sep + prefix + '_{}' + suffix
        
    for i in range(length):
        if i < top_n:
            n = str(i).rjust(order, '0')        
            doc = Doc(model.vocab).from_disk(generic_path.format(n))
            docs.append(doc)
    return docs

In [8]:
class SpacyParser(TransformerMixin):
    def __init__(self, spacy_model, **pipe_kwargs):
        self.spacy_model = spacy_model
        self.pipe_kwargs = pipe_kwargs
        
    def fit(self, texts, *args):
        return self
    
    def transform(self, texts, *args):
        pipe_kwargs = self.pipe_kwargs
        spacy_model = self.spacy_model
        docs = [spacy_model(text) for text in texts]
        return docs

In [9]:
parser = SpacyParser(nlp)

In [11]:
parsed = parser.transform(descriptions)

In [182]:
class Tokenizer(TransformerMixin):
    def __init__(self, **tokenize_kwargs):
        self.tokenize_kwargs = tokenize_kwargs
        
    def tokenize(self, texts):
        tokens = []
        tokenize_kwargs = self.tokenize_kwargs
        for text in texts:
            # tokenize each message; simply lowercase & match alphabetic chars, for now
            # yield gensim.utils.tokenize(text, **tokenize_kwargs)
            tokens.append(list(gensim.utils.tokenize(text, **tokenize_kwargs)))
        return tokens
                
    def fit(self, texts, *args):
        return self

    def transform(self, texts, *args):
        return self.tokenize(texts) 

In [183]:
class Lemmatizer(TransformerMixin):
    def __init__(self, **lemmatize_kwargs):
        self.lemmatize_kwargs = lemmatize_kwargs
        
    def lemmatize(self, texts):
        lemmatize_kwargs = self.lemmatize_kwargs
        lemmas = []
        for text in texts:
            # tokenize each message; simply lowercase & match alphabetic chars, for now
            # yield gensim.utils.lemmatize(text, **lemmatize_kwargs)
            text = gensim.utils.lemmatize(text, **lemmatize_kwargs)
            lemmas.append(list(text))
                
    def fit(self, texts, *args):
        return self

    def transform(self, texts, *args):
        return self.lemmatize(texts)

In [176]:
texts = [['the', 'mayor', 'of', 'new', 'york'],
         ['going', 'to', 'new', 'york', 'tomorrow'],
         ['i', 'love', 'new', 'york'],
         ['new', 'york', 'is', 'the', 'best'],
         ['the', 'pizza', 'in', 'new', 'york', 'is', 'ok'],
         ['the', 'mayor', 'of', 'new', 'york', 'is', 'rich']]

In [None]:
clean > tokenize > ngrammer > stopword removal > (lemmatize) >

In [67]:
class Tokenizer(TransformerMixin):
    
    def __init__(self, lower=True, remove=['SYM', 'PUNCT']):
        self.lower = lower
        self.remove = remove
        
    def fit(self, docs, *args):
        return self
    
    def transform(self, docs, *args):
        lower = self.lower
        remove = self.remove
        tokenized = []
        for d in docs:
            tokens = []
            for token in d:
                if token.pos_ in remove:
                    continue
                else:
                    if lower:
                        tokens.append(token.lower_)
                    else:
                        tokens.append(token.text)
            tokenized.append(tokens)
        return tokenized

In [49]:
tk = Tokenizer(remove=['SYM', 'PUNCT'])

In [50]:
tokenized = tk.transform(parsed)

In [52]:
d = nlp('I will be Playing (iN) the Park with a firing range')

In [68]:
class Lemmatizer(TransformerMixin):
    
    def __init__(self, remove=['SYM', 'PUNCT']):
        self.remove = remove
        
    def fit(self, docs, *args):
        return self
    
    def transform(self, docs, *args):
        remove = self.remove
        lemmatized = []
        for d in docs:
            lemmas = []
            for token in d:
                if token.pos_ in remove:
                    continue
                else:
                    lemmas.append(token.lemma_)
            lemmatized.append(lemmas)
        return lemmatized

In [69]:
lm = Lemmatizer()

In [70]:
lemmatized = lm.transform(parsed)

In [90]:
class NGrammer(TransformerMixin):
    def __init__(self, n=3, **phrase_kwargs):
        self.n = n
        self.phrase_kwargs = phrase_kwargs
        
    def fit(self, texts, *args):
        n = self.n
        if n > 1:
            for _ in range(n - 1):
                ngrams = Phrases(texts)
                ngrammer = Phraser(ngrams)
                texts = ngrammer[texts]
            self.ngrams = ngrams
            self.ngrammer = ngrammer
        return self
        
    def ngram(self, texts):
        phrase_kwargs = self.phrase_kwargs
        n = self.n
        if n > 1:
            for _ in range(n - 1):
                ngrams = Phrases(texts, **phrase_kwargs)
                ngrammer = Phraser(ngrams)
                texts = ngrammer[texts]
            self.ngrams = ngrams
            self.ngrammer = ngrammer
        return list(texts)
        
    def transform(self, texts, *args):
        return self.ngram(texts)

In [None]:
class StopWordRemover(TransformerMixin):
    

In [91]:
ng = NGrammer(**{'threshold': 5, 'min_count': 10})

In [92]:
ngrammed = ng.transform(tokenized)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO : collected 5604 word types from a corpus of 7774 words (unigram + bigrams) and 10 sentences
INFO : using 5604 counts as vocab in Phrases<0 vocab, min_count=10, threshold=5, max_vocab_size=40000000>
INFO : source_vocab length 5604
INFO : Phraser built with 17 17 phrasegrams
INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO : collected 5664 word types from a corpus of 7447 words (unigram + bigrams) and 10 sentences
INFO : using 5664 counts as vocab in Phrases<0 vocab, min_count=10, threshold=5, max_vocab_size=40000000>
INFO : source_vocab length 5664
INFO : Phraser built with 18 18 phrasegrams


In [85]:
ct = CleanText(log_every=5000)
processed = ct.transform(descriptions)
tk = Tokenizer(**{'lower': True})
tokenized = tk.transform(processed)
ng = NGrammer(n=2)
ngrams = ng.transform(tokenized)
# lm = Lemmatizer()
# lemmas = lm.transform(tokenized)

NameError: name 'CleanText' is not defined

In [115]:
print(list(itertools.islice(ng.ngrammer[texts], 3)))

[['the', 'mayor', 'of', 'new_york'], ['going', 'to', 'new_york', 'tomorrow'], ['i', 'love', 'new_york']]


In [93]:
phrases = Phrases(tokenized)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO : Cleaned 10000 docs
INFO : Cleaned 15000 docs
INFO : PROGRESS: at sentence #10000, processed 15787871 words and 1027 word types
INFO : Cleaned 20000 docs
INFO : Cleaned 25000 docs
INFO : PROGRESS: at sentence #20000, processed 39018014 words and 1167 word types
INFO : Cleaned 30000 docs
INFO : Cleaned 35000 docs
INFO : PROGRESS: at sentence #30000, processed 69997670 words and 1236 word types
INFO : collected 1236 word types from a corpus of 74443352 words (unigram + bigrams) and 31411 sentences
INFO : using 1236 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


In [189]:
class StopWordRemover(TransformerMixin):
    def __init__(self, docs, stop_words):
        self.stop_words = stop_words
        
    def fit(self, texts, *args):
        return self
    
    def transform(self, texts, *args):
        

SyntaxError: unexpected EOF while parsing (<ipython-input-189-8eaf941fe9b3>, line 9)

In [None]:
class NGramTransformer(TransformerMixin):
    def __init__(n=3, **phrase_kws):
        self.n = n
        self.phrase_kws = phrase_kws
    
    def fit(self, texts, *args):
        return self
    
    def transform(self, texts, *args):
        return texts

In [None]:
class POSProcessor(BaseEstimator, TransformerMixin):
    """Transformer that labels tokens in a document with their
    part of speech tags.
    Parameters
    ----------
    pos_tags : bool, required
        Whether to tag words with their part of speech labels.
    lemmatize : bool, required
        Whether to lemmatize tokens.
    stop_words : book, required
        Whether to remove stop words.
    """

    def __init__(self, stop_words, pos_tags=True,
                rejoin=True):
        self.stop_words = stop_words
        self.pos_tags = pos_tags
        self.rejoin = rejoin

    def tag_pos(self, text):
        return [(t, t.pos_) for t in text]

    def get_lemmas(self, text):
        return [t[0].lemma_ for t in text]

    def remove_noise(self, text):
        class POSProcessor(BaseEstimator, TransformerMixin):
    """Transformer that labels tokens in a document with their
    part of speech tags.
    Parameters
    ----------
    pos_tags : bool, required
        Whether to tag words with their part of speech labels.
    lemmatize : bool, required
        Whether to lemmatize tokens.
    stop_words : book, required
        Whether to remove stop words.
    """

    def __init__(self, stop_words, pos_tags=True,
                rejoin=True):
        self.stop_words = stop_words
        self.pos_tags = pos_tags
        self.rejoin = rejoin

    def tag_pos(self, text):
        return [(t, t.pos_) for t in text]

    def get_lemmas(self, text):
        return [t[0].lemma_ for t in text]

    def remove_noise(self, text):
        noise_tags = ['DET', 'NUM', 'SYM']
        text = [t for t in text if t[0].text not in self.stop_words]
        text = [t for t in text if len(t[0]) > 2]
        text = [t for t in text if t[1] not in noise_tags]
        text = [t for t in text if ~t[0].like_num]
        return text

    def join_pos_lemmas(self, pos, lemmas):
        return ['{}_{}'.format(l, p[1]).lower() for p, l
                in zip(pos, lemmas)]

    def fit(self, texts, *args):
        return self

    def single_string(self, texts):
        strings = [' '.join(t) for t in texts]
        return strings

    def transform(self, texts, *args):
        docs = [nlp(sent) for sent in texts]
        docs = [self.tag_pos(d) for d in docs]
        docs = [self.remove_noise(d) for d in docs]
        lemmas = [self.get_lemmas(d) for d in docs]
        if self.pos_tags:
            docs = [self.join_pos_lemmas(d, l) for d, l
                    in zip(docs, lemmas)]
        if self.rejoin:
            docs = self.single_string(docs)
        return docs

        text = [t for t in text if t[0].text not in self.stop_words]
        text = [t for t in text if len(t[0]) > 2]
        text = [t for t in text if t[1] not in noise_tags]
        text = [t for t in text if ~t[0].like_num]
        return text

    def join_pos_lemmas(self, pos, lemmas):
        return ['{}_{}'.format(l, p[1]).lower() for p, l
                in zip(pos, lemmas)]

    def fit(self, texts, *args):
        return self

    def single_string(self, texts):
        strings = [' '.join(t) for t in texts]
        return strings

    def transform(self, texts, *args):
        docs = [nlp(sent) for sent in texts]
        docs = [self.tag_pos(d) for d in docs]
        docs = [self.remove_noise(d) for d in docs]
        lemmas = [self.get_lemmas(d) for d in docs]
        if self.pos_tags:
            docs = [self.join_pos_lemmas(d, l) for d, l
                    in zip(docs, lemmas)]
        if self.rejoin:
            docs = self.single_string(docs)
        return docs


In [None]:
def parse_text(corpus)

    parsed_descriptions = []
    
    for doc in nlp.pipe(corpus, batch_size=50, n_threads=-1):
        if doc.is_parsed:
            doc
            tokens.append([n.text for n in doc])
            lemma.append([n.lemma_ for n in doc])
            pos.append([n.pos_ for n in doc])
        else:
            # We want to make sure that the lists of parsed results have the
            # same number of entries of the original Dataframe, so add some blanks in case the parse fails
            tokens.append(None)
            lemma.append(None)
            pos.append(None)
