# Train Text Classification Model and Save to Mlflow

## I) Train Model

In [1]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC
from bs4 import BeautifulSoup
import re
import numpy as np

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/christian/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/christian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw to /Users/christian/nltk_data...
[nltk_data]   Package omw is already up-to-date!


True

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [27]:

# sklearn transformer for pipeline
class peprocessor(BaseEstimator, TransformerMixin):
    
    """text preprocessor"""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.nltk_stopwords = stopwords.words('english')

    def nltk_tag_to_wordnet_tag(self, nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:          
            return wordnet.NOUN

    def remove_html_tags(self, text):
        soup = BeautifulSoup(text, "html.parser")
        removed_html_text = soup.get_text()
        return removed_html_text

    def remove_special_chars(self, text):

        pattern = r'[^a-zA-z0-9\s]'
        text = re.sub(pattern, '', text)
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', text)
        return text

    def preprocess_doc(self, text):
        
        print('preprocess doc ', text)

        assert type(text) == str

        text = text.replace(r'\n',' ')

        text = self.remove_html_tags(text)

        text = self.remove_special_chars(text)
        print("after prepro ", text)

        if len(text.split()) < 5:
            return np.nan

        # tokenize the sentence and find the POS tag for each token
        nltk_tagged = nltk.pos_tag(nltk.word_tokenize(text))
        
        # tuple of (token, wordnet_tag)
        wordnet_tagged = map(lambda x: (x[0], self.nltk_tag_to_wordnet_tag(
                    x[1])), nltk_tagged)    
        lemmas = [self.lemmatizer.lemmatize(word, tag) for word, tag in \
                wordnet_tagged if not word in self.nltk_stopwords]
        
        return lemmas

    def transform(self, X, y=None):
        return [self.preprocess_doc(text) for text in X]
    def fit(self, X, y=None):
        return self    


In [28]:
preproc = peprocessor()

### Data Prep

In [6]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'sci.space'],
                                     random_state=42)

In [7]:
X = newsgroups_train.data
y = newsgroups_train.target

In [8]:
X[0]

'From: bil@okcforum.osrhe.edu (Bill Conner)\nSubject: Re: Not the Omni!\nNntp-Posting-Host: okcforum.osrhe.edu\nOrganization: Okcforum Unix Users Group\nX-Newsreader: TIN [version 1.1 PL6]\nLines: 18\n\nCharley Wingate (mangoe@cs.umd.edu) wrote:\n: \n: >> Please enlighten me.  How is omnipotence contradictory?\n: \n: >By definition, all that can occur in the universe is governed by the rules\n: >of nature. Thus god cannot break them. Anything that god does must be allowed\n: >in the rules somewhere. Therefore, omnipotence CANNOT exist! It contradicts\n: >the rules of nature.\n: \n: Obviously, an omnipotent god can change the rules.\n\nWhen you say, "By definition", what exactly is being defined;\ncertainly not omnipotence. You seem to be saying that the "rules of\nnature" are pre-existant somehow, that they not only define nature but\nactually cause it. If that\'s what you mean I\'d like to hear your\nfurther thoughts on the question.\n\nBill\n'

In [9]:
y[0]

0

In [10]:
len(X), len(y)

(1073, 1073)

### Feature Engineering

Create tokens in prepro step, hence pass tokens to tfidf vect.

In [11]:
X_prepro = [preproc.preprocess_doc(text) for text in X]

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
def identity_tokenizer(text):
    return text
tfidf_vec = TfidfVectorizer(lowercase=False, tokenizer=identity_tokenizer, stop_words=None)

In [14]:
from sklearn.pipeline import Pipeline

In [15]:
X_feat = tfidf_vec.fit_transform(X_prepro)

In [16]:
X_feat

<1073x25528 sparse matrix of type '<class 'numpy.float64'>'
	with 147153 stored elements in Compressed Sparse Row format>

### Training

In [29]:
pipe = Pipeline([
    ('prepro', peprocessor()),
    ('tfidf', tfidf_vec),
    ('clf', LinearSVC(class_weight = 'balanced', random_state=42))
])

In [32]:
pipe.fit(X[:5], y[:5])

preprocess doc  From: bil@okcforum.osrhe.edu (Bill Conner)
Subject: Re: Not the Omni!
Nntp-Posting-Host: okcforum.osrhe.edu
Organization: Okcforum Unix Users Group
X-Newsreader: TIN [version 1.1 PL6]
Lines: 18

Charley Wingate (mangoe@cs.umd.edu) wrote:
: 
: >> Please enlighten me.  How is omnipotence contradictory?
: 
: >By definition, all that can occur in the universe is governed by the rules
: >of nature. Thus god cannot break them. Anything that god does must be allowed
: >in the rules somewhere. Therefore, omnipotence CANNOT exist! It contradicts
: >the rules of nature.
: 
: Obviously, an omnipotent god can change the rules.

When you say, "By definition", what exactly is being defined;
certainly not omnipotence. You seem to be saying that the "rules of
nature" are pre-existant somehow, that they not only define nature but
actually cause it. If that's what you mean I'd like to hear your
further thoughts on the question.

Bill

after prepro  From bilokcforumosrheedu Bill Conner
Su

Pipeline(steps=[('prepro', peprocessor()),
                ('tfidf',
                 TfidfVectorizer(lowercase=False,
                                 tokenizer=<function identity_tokenizer at 0x7fcc2c756b90>)),
                ('clf', LinearSVC(class_weight='balanced', random_state=42))])

### Evalution
Skip validation set performance evaluation. Short check on training data only for functionality test.

In [33]:
pipe.predict(X[:5])

preprocess doc  From: bil@okcforum.osrhe.edu (Bill Conner)
Subject: Re: Not the Omni!
Nntp-Posting-Host: okcforum.osrhe.edu
Organization: Okcforum Unix Users Group
X-Newsreader: TIN [version 1.1 PL6]
Lines: 18

Charley Wingate (mangoe@cs.umd.edu) wrote:
: 
: >> Please enlighten me.  How is omnipotence contradictory?
: 
: >By definition, all that can occur in the universe is governed by the rules
: >of nature. Thus god cannot break them. Anything that god does must be allowed
: >in the rules somewhere. Therefore, omnipotence CANNOT exist! It contradicts
: >the rules of nature.
: 
: Obviously, an omnipotent god can change the rules.

When you say, "By definition", what exactly is being defined;
certainly not omnipotence. You seem to be saying that the "rules of
nature" are pre-existant somehow, that they not only define nature but
actually cause it. If that's what you mean I'd like to hear your
further thoughts on the question.

Bill

after prepro  From bilokcforumosrheedu Bill Conner
Su

array([0, 1, 1, 1, 0])

## II) Save to Mlflow

Use custom pyfunc implementation due to the nltk dependency

In [34]:
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec
import mlflow

In [35]:
input_example = {
  "text": "Please enlighten me.  How is omnipotence contradictory",
}
input_schema = Schema([
  ColSpec("string", "text")
])
output_schema = Schema([ColSpec("string")])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)
signature

inputs: 
  ['text': string]
outputs: 
  [string]

In [36]:
test_text = ["Please enlighten me.  How is omnipotence contradictory"]

### Save Local

In [37]:
! rm -rf tests/classifier
mlflow.sklearn.save_model(
    pipe, 
    path = "tests/classifier", # save local
    conda_env="environment.yml", 
    #mlflow_model=None, 
    #serialization_format='cloudpickle', 
    signature = signature,
    input_example = input_example, 
    pip_requirements=None, extra_pip_requirements=None)

In [101]:
local_model = mlflow.sklearn.load_model("classifier")

In [104]:
local_model.predict(test_text)

array([1])

### Save to MLflow

In [38]:
mlflow.set_tracking_uri("http://localhost:8088")

In [39]:
experiment_name = "document_classifier"

In [40]:
#mlflow.create_experiment(experiment_name)

In [42]:
mlflow.set_experiment(experiment_name)
experiment = mlflow.get_experiment_by_name(experiment_name)

In [43]:
mlflow.start_run(experiment_id=experiment.experiment_id, run_name="baseline2")

<ActiveRun: >

In [44]:
mlflow.sklearn.log_model(
    pipe, 
    artifact_path="model", 
    conda_env="environment.yml", 
    #serialization_format='cloudpickle', 
    registered_model_name=None, 
    signature= signature,
    input_example= input_example)
    #await_registration_for=300, 
    #pip_requirements=None, 
    #extra_pip_requirements=None)

In [45]:
!jupyter nbconvert --to html NLP_Classifier.ipynb
mlflow.log_artifact("NLP_Classifier.ipynb")
mlflow.log_artifact("NLP_Classifier.html")

[NbConvertApp] Converting notebook NLP_Classifier.ipynb to html
[NbConvertApp] Writing 647136 bytes to NLP_Classifier.html


In [46]:
mlflow.end_run()