# Text Classification Project Lifecycle
---------------------------




<img src="files/overview.jpg">

In [1]:
# imports
import bz2
import pandas as pd
import pickle
from os import path
import os

# Move the working directory up one level so that the relative 'data/...' paths used
# later in the notebook resolve (presumably to the project root — confirm).
# The `ace` imports below likely depend on this as well — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell moves up one more level.
os.chdir('../')
os.getcwd()

import ace.pipelines.pipeline_text as pt
import ace.pipelines.pipeline_feature as pf
import ace.pipelines.pipeline_ml as pm
import ace.pipelines.pipeline_deploy as dl
import ace.pipelines.pipeline_compare as pc

from ace.factories.ml_factory import MLFactory

In [2]:
# --- paths and experiment settings shared by the cells below ---

# raw input CSV
data_path = path.join('data', 'notti_nhs.csv')

# processed-data directory and the compressed pickle stored inside it
exp_data_path = path.join('data', 'processed')
pickle_data_file_name = 'nhs.pkl.bz2'
pickle_file = path.join(exp_data_path, pickle_data_file_name)

# experiment identifier handed to every pipeline, and the name given to MLFactory
experiment_path = 'exp_1'
lr_classifier = 'LogisticRegression'

In [3]:
# Data Engineering
# Read the raw CSV, drop incomplete rows and store (data, targets) as a bz2-compressed pickle.

nhsn_data = pd.read_csv(data_path)
# drop every row that has a missing value in any column
nhsn_data = nhsn_data.dropna()

# The file handle gets its own name: binding it to `pickle_file` would replace the path
# string with a file object that is closed on exit, so re-running this cell would fail.
with bz2.BZ2File(pickle_file, 'wb') as pickle_out:
    # we pickle the data object as data, targets
    pkl_obj = nhsn_data[['improve', 'super']], list(nhsn_data['super'])
    pickle.dump(pkl_obj, pickle_out, protocol=4, fix_imports=False)
    

# Text Pre-processing

- Lower-case
- Spell check
- Split joined-up words
- Stopwords
- Lemmatize
- Stem

<img src="files/text_pipe.png">

In [4]:
# Text pre-processing options for the 'improve' column: spell correction, splitting of
# joined-up words, stop-word removal and stemming are on; lemmatisation is off.
pt.configure_pipeline(
    experiment_path,
    exp_data_path,
    spell=True,
    split_words=True,
    text_headers=['improve'],
    stop_words=True,
    lemmatize=False,
    stemm=True,
)

# Fit the text pipeline on the pickled data set and transform it;
# the returned value is shown as the cell output.
pipe_text = pt.PipelineText(experiment_path, pickle_data_file_name)
pipe_text.fit_transform()

Fitting pipeline for improve
Lower casing
correcting spelling
Finding joined up words
removing stopwords
Saving text pipeline
Loading text pipeline for improve
Transforming pipeline for improve
Lower casing
correcting spelling
Finding joined up words
removing stopwords
Stemming using porter stemmer


[                                                 improve
 0                                                   noth
 1                                  DO not need TO improv
 2                                                   noth
 3                                                   noth
 4      IT can BE difficult TO know the date the self ...
 ...                                                  ...
 51838                             AN excel servic receiv
 51839                                               noth
 51840  I had TO wait approxim 3 month TO see A psychi...
 51841                                   noth good servic
 51842  I can not suggest ani improv all the care supp...
 
 [51843 rows x 1 columns]]

# Feature Engineering

- Count Matrix / TFIDF
- Embeddings
    - BERT
    - sBERT
    - w2v
- Word/Doc Features
    - word_counts
    - NMF
    - POS (Part of Speech)

<img src="files/feature_pipe.png">

In [None]:
# Feature options: a frequency matrix with IDF weighting over 1- to 3-grams,
# min_df of 3 and 'Logistic' feature selection.
pf.configure_pipeline(
    experiment_path,
    feature_set=['frequency_matrix'],
    num_features=0,
    idf=True,
    feature_selection_type='Logistic',
    min_df=3,
    min_ngram=1,
    max_ngram=3,
)

# Fit the feature pipeline for this experiment and transform the same pickled data set.
pipe_features = pf.PipelineFeatures(experiment_path, pickle_data_file_name)
pipe_features.fit_transform()

<img src="files/tfidf.png">

# Model Training

- Logistic Regression
- Random Forests
- Convolutional Neural Networks

In [None]:

# Configure the ML pipeline for this experiment.
pm.configure_pipeline(experiment_path)

# ---- MODEL 1 --- #
# Build the classifier named by `lr_classifier` through the factory, then train and test it.
cls = MLFactory.factory(lr_classifier)
pipe_ml = pm.MLTrainTest(
    experiment_path,
    pickle_data_file_name,
    classifier=cls,
)
pipe_ml.fit_transform()

In [None]:
# ---- MODEL 2 --- #
# Same train/test run as model 1, with a random-forest classifier from the factory.
rf_classifier = "RandomForestClassifier"
cls2 = MLFactory.factory(rf_classifier)
pipe_ml = pm.MLTrainTest(
    experiment_path,
    pickle_data_file_name,
    classifier=cls2,
)
pipe_ml.fit_transform()

# Deploy
- Records for all pipeline steps are kept in the __exp_1__ directory, across several config files
- The model can easily be deployed on new unknown data!

In [None]:
# Deploy: run the fitted text and feature pipelines and the trained model on new responses.
responces_path = path.join('data',  'responces.csv')
pickle_responces_file_name='responces.pkl.bz2'
pickle_responces_file = path.join('data', 'processed', pickle_responces_file_name)

responce_data = pd.read_csv(responces_path)

# Same (data, targets) layout as the training pickle; the targets slot is None here.
# The handle has its own name so it does not overwrite the `pickle_file` path variable
# that the data-engineering cell relies on.
with bz2.BZ2File(pickle_responces_file, 'wb') as responces_out:
    pkl_obj = responce_data[['improve']], None
    pickle.dump(pkl_obj, responces_out, protocol=4, fix_imports=False)

pipe_text = pt.PipelineText(experiment_path, pickle_responces_file_name)
# notice we do not call fit here!
pipe_text.transform()

pipe_features = pf.PipelineFeatures(experiment_path, pickle_responces_file_name)
# notice we do not call fit here either!
pipe_features.transform()

# `lr_classifier` is the classifier name defined in the path-vars cell; the name used here
# before (`classifier_name`) is not defined anywhere in the notebook and raised NameError.
# NOTE(review): pass `rf_classifier` instead to deploy the second model — confirm intent.
dl.configure_pipeline(experiment_path, lr_classifier, validation_path=experiment_path + "/features")
ml_dep = dl.MLDeploy(experiment_path, pickle_responces_file_name)

y_pred, y_prob = ml_dep.transform()
print(y_pred)

# Change Features to sBERT!

- Need to redo the text processing, because stopword removal, stemming, etc. are not applicable with sBERT
- Need to redo the features pipeline asking for sBERT to be the main feature
- Then run the rest of the pipeline steps

In [None]:
# Re-configure text pre-processing: spell correction and word splitting stay on,
# while stop-word removal, lemmatisation and stemming are all off.
pt.configure_pipeline(
    experiment_path,
    exp_data_path,
    spell=True,
    split_words=True,
    text_headers=['improve'],
    stop_words=False,
    lemmatize=False,
    stemm=False,
)

# Re-fit the text pipeline with the new options.
pipe_text = pt.PipelineText(experiment_path, pickle_data_file_name)
pipe_text.fit_transform()

# Switch the feature set to 'sbert' and re-fit the feature pipeline.
pf.configure_pipeline(
    experiment_path,
    feature_set=['sbert'],
)

pipe_features = pf.PipelineFeatures(experiment_path, pickle_data_file_name)
pipe_features.fit_transform()

In [None]:
# Re-train both classifiers on the re-fitted features.

# ---- MODEL 1 --- #
cls = MLFactory.factory(lr_classifier)
pipe_ml = pm.MLTrainTest(experiment_path, pickle_data_file_name, classifier=cls)
pipe_ml.fit_transform()

# ---- MODEL 2 --- #
# Define the name that is actually passed to the factory, so this cell does not depend
# on `rf_classifier` having been set by an earlier cell.
rf_classifier = "RandomForestClassifier"
cls2 = MLFactory.factory(rf_classifier)
pipe_ml = pm.MLTrainTest(experiment_path, pickle_data_file_name, classifier=cls2)
pipe_ml.fit_transform()