# Building an NLP Pipeline Class

For the pair problem today, you're going to work on building a class that manages your NLP needs. The goal is to build something that takes in a bunch of classes, and can spit out the cleaned text as a vector. I'll get you started with a template.

In [1]:
class nlp_pipe:
    
    def __init__(self, vectorizer, cleaning_function, tokenizer, stemmer):
        self.vectorizer = vectorizer
        self.cleaning_function = cleaning_function
        self.tokenizer = tokenizer
        self.stemmer = stemmer
    
    def fit(self, text_to_fit_on):
        pass
    
    def transfrom(self, text_to_clean):
        pass

As a quick note, if you want to pass a function into a class you can do so like this:

In [2]:
def print_the_word_bob_three_times():
    for i in range(3):
        print('bob')
        
        
class this_is_an_example:
    
    def __init__(self, function_input):
        self.function_to_run = function_input
        
    def do_the_thing(self):
        self.function_to_run()

In [3]:
example = this_is_an_example(print_the_word_bob_three_times)
# Note that when I put the function in, I didn't invoke it with the parens!

In [4]:
example.do_the_thing()

bob
bob
bob


So what I want is the ability to do something like:

```python
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

nlp = nlp_pipe(CountVectorizer(), simple_cleaning_function_i_made, TreebankWordTokenizer(), PorterStemmer())
nlp.fit(train_corpus)
nlp.transform(test_corpus)
```
Which should return the test corpus in its vectorizer format.

# Solution!

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle


class nlp_preprocessor:
   
    def __init__(self, vectorizer=CountVectorizer(), tokenizer=None, 
                 cleaning_function=None, stemmer=None):
        """
        A class for pipelining our data in NLP problems. The user provides a series of 
        tools, and this class manages all of the training, transforming, and modification
        of the text data.
        ---
        Inputs:
        vectorizer: the model to use for vectorization of text data
        tokenizer: The tokenizer to use, if none defaults to split on spaces
        cleaning_function: how to clean the data, if None, defaults to the in built class
        """
        if not tokenizer:
            tokenizer = self.splitter
        if not cleaning_function:
            cleaning_function = self.clean_text
        self.stemmer = stemmer
        self.tokenizer = tokenizer
        #self.model = model
        self.cleaning_function = cleaning_function
        self.vectorizer = vectorizer
        self._is_fit = False
        
    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')
        
    def clean_text(self, text, tokenizer, stemmer):
        """
        A naive function to lowercase all works can clean them quickly.
        This is the default behavior if no other cleaning function is specified
        """
        cleaned_text = []
        for post in text:
            cleaned_words = []
            for word in tokenizer(post):
                low_word = word.lower()
                if stemmer:
                    low_word = stemmer.stem(low_word)
                cleaned_words.append(low_word)
            cleaned_text.append(' '.join(cleaned_words))
        return cleaned_text
    
    def fit(self, text):
        """
        Cleans the data and then fits the vectorizer with
        the user provided text
        """
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        self.vectorizer.fit(clean_text)
        self._is_fit = True
        
    def transform(self, text):
        """
        Cleans any provided data and then transforms the data into
        a vectorized format based on the fit function. Returns the
        vectorized form of the data.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer) # ????????? not defined here
        return self.vectorizer.transform(clean_text)
    
    def save_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        pickle.dump(self.__dict__, open(filename+".mdl",'wb'))
        
    def load_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        if filename[-4:] != '.mdl':
            filename += '.mdl'
        self.__dict__ = pickle.load(open(filename,'rb'))

In [11]:
corpus = ['BOB the builder', 'is a strange', 'caRtoon type thing']

In [12]:
def simple_cleaning_function_i_made(text, tokenizer, stemmer):
    cleaned_text = []
    for post in text:
        cleaned_words = []
        for word in tokenizer(post):
            low_word = word.lower()
            if stemmer:
                low_word = stemmer.stem(low_word)
            cleaned_words.append(low_word)
        cleaned_text.append(' '.join(cleaned_words))
    return cleaned_text

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

nlp = nlp_preprocessor(vectorizer=TfidfVectorizer(min_df=0.3, max_df=0.8), 
                       cleaning_function=simple_cleaning_function_i_made, 
                       tokenizer=TreebankWordTokenizer().tokenize, stemmer=PorterStemmer())
nlp.fit(corpus)
vectorized_docs = nlp.transform(corpus).toarray()

In [14]:
corpus = """Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers)
from the store. Should I pick up some black-eyed peas as well?
            """

In [15]:
nlp.fit(corpus)
vectorized_docs = nlp.transform(corpus).toarray()

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [156]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer
class nlp_preprocesseur:

    def __init__(self, tokenizer=None):
        """
        A class to prep data. 
        Tokenize
        Clean
            #    mispelling, remove numbers, remove special characters, go lowercase, remove stop words, chunking
            #        name entity recognition,coumpound term extraction
            #    stemming
            #    lemmatization
            #    POS Tagging
        Vectorize
        """
        if not tokenizer:
            tokenizer = self.tokenizeSentence
        elif tokenizer == "word":
            tokenizer = self.tokenizeWord
        elif tokenizer == "space":
            tokenizer = self.tokenizeSpace
        self.tokenizer = tokenizer    

    
    

    def tokenizeSentence(self,my_text):
        return sent_tokenize(my_text)

    def tokenizeWord(self,my_text):        # each ) . ? is a word
        return word_tokenize(my_text)

    def tokenizeSpace(self,my_text):
        return my_text.split(' ')  # keeps the blanks at the end of sentence
            # regex, N-gram

    def tokenizeBigrams(self,my_text):
        my_words = self.tokenizeWord(my_text) 
        return list(ngrams(my_words,2)) 

        
    def tokenizeChar(self,my_text):  # stops at the last non blank character
        caracteres = "\s+"
        return RegexpTokenizer(caracteres, gaps=True).tokenize(my_text)



In [158]:
nlp2 = nlp_preprocesseur()
nlp3 = nlp_preprocesseur(tokenizer = "word")


In [149]:
dir(nlp2)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'tokenizeBigrams',
 'tokenizeChar',
 'tokenizeSentence',
 'tokenizeSpace',
 'tokenizeWord',
 'tokenizer']

In [141]:
nlp2.tokenizer(corpus)

['Hi Mr. Smith!',
 'I’m going to buy some vegetables (tomatoes and cucumbers)\nfrom the store.',
 'Should I pick up some black-eyed peas as well?']

In [150]:
nlp3.tokenizer(corpus)

['Hi',
 'Mr.',
 'Smith',
 '!',
 'I',
 '’',
 'm',
 'going',
 'to',
 'buy',
 'some',
 'vegetables',
 '(',
 'tomatoes',
 'and',
 'cucumbers',
 ')',
 'from',
 'the',
 'store',
 '.',
 'Should',
 'I',
 'pick',
 'up',
 'some',
 'black-eyed',
 'peas',
 'as',
 'well',
 '?']

In [151]:
nlp2.tokenizeSpace(corpus)

['Hi',
 'Mr.',
 'Smith!',
 'I’m',
 'going',
 'to',
 'buy',
 'some',
 'vegetables',
 '(tomatoes',
 'and',
 'cucumbers)\nfrom',
 'the',
 'store.',
 'Should',
 'I',
 'pick',
 'up',
 'some',
 'black-eyed',
 'peas',
 'as',
 'well?\n',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [152]:
nlp2.tokenizeChar(corpus)

['Hi',
 'Mr.',
 'Smith!',
 'I’m',
 'going',
 'to',
 'buy',
 'some',
 'vegetables',
 '(tomatoes',
 'and',
 'cucumbers)',
 'from',
 'the',
 'store.',
 'Should',
 'I',
 'pick',
 'up',
 'some',
 'black-eyed',
 'peas',
 'as',
 'well?']

In [153]:
nlp2.tokenizeWord(corpus)

['Hi',
 'Mr.',
 'Smith',
 '!',
 'I',
 '’',
 'm',
 'going',
 'to',
 'buy',
 'some',
 'vegetables',
 '(',
 'tomatoes',
 'and',
 'cucumbers',
 ')',
 'from',
 'the',
 'store',
 '.',
 'Should',
 'I',
 'pick',
 'up',
 'some',
 'black-eyed',
 'peas',
 'as',
 'well',
 '?']

In [159]:
nlp2.tokenizeBigrams(corpus)

[('Hi', 'Mr.'),
 ('Mr.', 'Smith'),
 ('Smith', '!'),
 ('!', 'I'),
 ('I', '’'),
 ('’', 'm'),
 ('m', 'going'),
 ('going', 'to'),
 ('to', 'buy'),
 ('buy', 'some'),
 ('some', 'vegetables'),
 ('vegetables', '('),
 ('(', 'tomatoes'),
 ('tomatoes', 'and'),
 ('and', 'cucumbers'),
 ('cucumbers', ')'),
 (')', 'from'),
 ('from', 'the'),
 ('the', 'store'),
 ('store', '.'),
 ('.', 'Should'),
 ('Should', 'I'),
 ('I', 'pick'),
 ('pick', 'up'),
 ('up', 'some'),
 ('some', 'black-eyed'),
 ('black-eyed', 'peas'),
 ('peas', 'as'),
 ('as', 'well'),
 ('well', '?')]