# ML Pipeline Condensing

In [1]:
#import libraries
#measuring time and making basic math
from time import time
import math
import numpy as np
import udacourse2 #my library for this project!
#import statistics

#my own ETL pipeline
#import process_data as pr

#dealing with datasets and showing content
import pandas as pd
#import pprint as pp

#SQLAlchemy toolkit
from sqlalchemy import create_engine
from sqlalchemy import pool
from sqlalchemy import inspect

#natural language toolkit
#from nltk.tokenize import word_tokenize 
#from nltk.corpus import stopwords 
#from nltk.stem import WordNetLemmatizer

#REGEX toolkit
#import re

#Machine Learning preparing/preprocessing toolkits
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import GridSearchCV

#Machine Learning Feature Extraction tools
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

#Machine Learning Classifiers
#from sklearn.naive_bayes import MultinomialNB
#from sklearn.ensemble import RandomForestClassifier #need MOClassifier!
#from sklearn.ensemble import AdaBoostClassifier
#from sklearn.linear_model import SGDClassifier
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

#Machine Learning Classifiers extra tools
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

#Machine Learning Metrics
#from sklearn.metrics import f1_score
#from sklearn.metrics import classification_report

#pickling tool
import pickle

verbose=True

In [2]:
#importing MySQL to Pandas - load data from database
engine = create_engine('sqlite:///Messages.db', poolclass=pool.NullPool) #, echo=True)

#retrieving tables names from my DB
#https://stackoverflow.com/questions/6473925/sqlalchemy-getting-a-list-of-tables
inspector = inspect(engine)
if verbose:
    print('existing tables in my SQLite database:', inspector.get_table_names())

connection = engine.connect()
df = pd.read_sql('SELECT * FROM Messages', con=connection)
connection.close()
df.name = 'df'

existing tables in my SQLite database: ['Messages']


In [3]:
#I.Prepare the data
if verbose:
    print('all labels are blank in {} rows'.format(df[df['if_blank'] == 1].shape[0]))

all labels are blank in 6317 rows


In [4]:
df = df[df['if_blank'] == 0]
if verbose:
    print('remaining rows:', df.shape[0])

remaining rows: 19928


In [5]:
#Verifying if removal was complete
if df[df['if_blank'] == 1].shape[0] == 0:
    if verbose:
        print('removal complete!')
else:
    raise Exception('something went wrong with rows removal before training')

removal complete!


In [6]:
#Premature Tokenization Strategy (pre-tokenizer)
#Pre-Tokenizer + not removing provisory tokenized column
start = time()

#inserting a tokenized column
try:
    df = df.drop('tokenized', axis=1)
except KeyError:
    print('OK')
df.insert(1, 'tokenized', np.nan)

#tokenizing over the provisory
df['tokenized'] = df.apply(lambda x: udacourse2.fn_tokenize_fast(x['message']), axis=1)

#removing NaN over provisory (if istill exist)
df = df[df['tokenized'].notnull()]

empty_tokens = df[df['tokenized'].apply(lambda x: len(x)) == 0].shape[0]
if verbose:
    print('found {} rows with no tokens'.format(empty_tokens))

df = df[df['tokenized'].apply(lambda x: len(x)) > 0]
empty_tokens = df[df['tokenized'].apply(lambda x: len(x)) == 0].shape[0]
if verbose:
    print('*after removal, found {} rows with no tokens'.format(empty_tokens))

#I will drop 'message' column
try:
    df = df.drop('message', axis=1)
except KeyError:
    if verbose:
        print('OK')

if verbose:
    print('now I have {} rows to train'.format(df.shape[0]))

spent = time() - start
if verbose:
    print('process time:{:.0f} seconds'.format(spent))

OK
found 6 rows with no tokens
*after removal, found 0 rows with no tokens
now I have 19922 rows to train
process time:22 seconds


In [7]:
#Database Data Consistency Fix
start = time()

#correction for aid_related
df = udacourse2.fn_group_check(dataset=df,
                               subset='aid',
                               correct=True, 
                               shrink=False, 
                               shorten=False, 
                               verbose=True)
#correction for weather_related
df = udacourse2.fn_group_check(dataset=df,
                               subset='wtr',
                               correct=True, 
                               shrink=False, 
                               shorten=False, 
                               verbose=True)
#correction for infrastrucutre_related
df = udacourse2.fn_group_check(dataset=df,
                               subset='ifr',
                               correct=True, 
                               shrink=False, 
                               shorten=False, 
                               verbose=True)
#correction for related(considering that the earlier were already corrected)
df = udacourse2.fn_group_check(dataset=df,
                               subset='main',
                               correct=True, 
                               shrink=False, 
                               shorten=False, 
                               verbose=True)
spent = time() - start
if verbose:
    print('Data Consistency performed')
    print('process time:{:.0f} seconds'.format(spent))

###function group_check started
  - count for main class:aid_related, 10877 entries
  - for main, without any sub-categories,  3515 entries
  - for subcategories,  7388 entries
  - for lost parent sub-categories,  26 entries
    *correcting, new count: 0 entries
elapsed time: 0.1218s
###function group_check started
  - count for main class:weather_related, 7304 entries
  - for main, without any sub-categories,  1359 entries
  - for subcategories,  5945 entries
  - for lost parent sub-categories,  0 entries
    *correcting, new count: 0 entries
elapsed time: 0.0604s
###function group_check started
  - count for main class:infrastructure_related, 1705 entries
  - for main, without any sub-categories,  679 entries
  - for subcategories,  2926 entries
  - for lost parent sub-categories,  1900 entries
    *correcting, new count: 0 entries
elapsed time: 0.0519s
###function group_check started
  - count for main class:related, 19922 entries
  - for main, without any sub-categories,  9436 entr

In [8]:
#II.Break the data
#X is the Training Text Column
X = df['tokenized']

In [9]:
#y is the Classification labels
#I REMOVED "related" column from my labels, as it is impossible to train it!
y = df[df.columns[4:]]
#y = df[df.columns[5:]]

#remove_lst = []

#for column in y.columns:
#    col = y[column]
#    if (col == 0).all():
#        if verbose:
#            print('*{} -> only zeroes training column!'.format(column))
#        remove_lst.append(column)
#    else:
        #print('*{} -> column OK'.format(column))
#        pass

#if verbose:
#    print(remove_lst)

#y = y.drop(remove_lst, axis=1)

if y.shape[1] == 36:
    if verbose:
        print('y dataset has 36 labels')
else:
    raise Exception('something went wrong, dataset has {} labels instead of 36'.format(y.shape[1]))

if verbose:
    print('Dataset breaked into X-Training Text Column and Y-Multilabels for Classification')

*child_alone -> only zeroes training column!
['child_alone']
Dataset breaked into X-Training Text Column and Y-Multilabels for Classification


In [10]:
#III.Slit the data
#Split makes randomization, so random_state parameter was set
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.25, 
                                                    random_state=42)

And it looks OK:

In [11]:
if (X_train.shape[0] + X_test.shape[0]) == X.shape[0]:
    if verbose:
        print('data split into train and text seems OK')
else:
    raise Exception('something went wrong when splitting the data')

data split into train and text seems OK


In [12]:
#IV. Train your Classifier
#Classifier is Support Vector Machine
if verbose:
    print('Classifier training started')

start = time()
    
def dummy(doc):
    return doc

feats = TfidfVectorizer(analyzer='word', 
                        tokenizer=dummy, 
                        preprocessor=dummy,
                        token_pattern=None,
                        ngram_range=(1, 3))

classif = OneVsRestClassifier(LinearSVC(C=2., 
                                        random_state=42))

pipeline_lnsv = Pipeline([('vect', feats),
                          ('clf', classif)])

pipeline_lnsv.fit(X_train, y_train)

spent = time() - start
if verbose:
    print('LINEAR SUPPORT VECTOR MACHINE - process time:{:.2f} seconds'.format(spent))

Classifier training started
LINEAR SUPPORT VECTOR MACHINE - process time:16.34 seconds


In [13]:
y_pred = pipeline_lnsv.predict(X_test)
if verbose:
    metrics = udacourse2.fn_scores_report2(y_test, 
                                 y_pred,
                                 best_10=True,
                                 verbose=True)
else:
    metrics = udacourse2.fn_scores_report2(y_test, 
                                           y_pred,
                                           best_10=True,
                                           verbose=False)

for metric in metrics:
    if metric < 0.6:
        raise Exception('something is wrong, model is predicting poorly')

###function scores_report started
using top 10 labels
######################################################
*aid_related -> label iloc[2]
              precision    recall  f1-score   support

           0       0.75      0.56      0.64      2313
           1       0.69      0.84      0.76      2668

    accuracy                           0.71      4981
   macro avg       0.72      0.70      0.70      4981
weighted avg       0.72      0.71      0.70      4981

######################################################
*weather_related -> label iloc[26]
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      3109
           1       0.80      0.79      0.80      1872

    accuracy                           0.85      4981
   macro avg       0.84      0.84      0.84      4981
weighted avg       0.85      0.85      0.85      4981

######################################################
*direct_report -> label iloc[33]
              precision    re

In [14]:
#pipeline_adab.fit(X_train, y_train)

file_name = 'classifier.pkl'

#writing the file
with open (file_name, 'wb') as pk_writer: 
    pickle.dump(pipeline_lnsv, pk_writer)

#reading the file
with open('classifier.pkl', 'rb') as pk_reader:
    pipeline_lnsv = pickle.load(pk_reader)
    
#pipeline_lnsv.predict(X_test)

## Use the notebook to complete `train.py`

Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [15]:
#import packages
import sys
import math
import numpy as np
import udacourse2 #my library for this project!
import pandas as pd
from time import time

#SQLAlchemy toolkit
from sqlalchemy import create_engine
from sqlalchemy import pool
from sqlalchemy import inspect

#Machine Learning preparing/preprocessing toolkits
from sklearn.model_selection import train_test_split

#Machine Learning Feature Extraction tools
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

#Machine Learning Classifiers
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

#Machine Learning Classifiers extra tools
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

#pickling tool
import pickle

#only a dummy function, as I pre-tokenize my data
def dummy(doc):
    return doc

#########1#########2#########3#########4#########5#########6#########7#########8
def load_data(data_file, 
              verbose=False):
    '''This function takes a path for a SQLite table and returns processed data
    for training a Machine Learning Classifier
    Inputs:
      - data_file (mandatory) - full path for SQLite table - text string
      - verbose (optional) - if you want some verbosity during the running 
        (default=False)
    Oputputs:
      - X - tokenized text X-training - Pandas Series
      - y - y-multilabels 0|1 - Pandas Dataframe'''
    if verbose:
        print('###load_data function started')
    start = time()

    #1.read in file
    #importing MySQL to Pandas - load data from database
    engine = create_engine(data_file, poolclass=pool.NullPool) #, echo=True)
    #retrieving tables names from my DB
    inspector = inspect(engine)
    if verbose:
        print('existing tables in my SQLite database:', inspector.get_table_names())
    connection = engine.connect()
    df = pd.read_sql('SELECT * FROM Messages', con=connection)
    connection.close()
    df.name = 'df'
    
    #2.clean data
    #2.1.Elliminate rows with all-blank labels
    if verbose:
        print('all labels are blank in {} rows'.format(df[df['if_blank'] == 1].shape[0]))
    df = df[df['if_blank'] == 0]
    if verbose:
        print('remaining rows:', df.shape[0])
    #Verifying if removal was complete
    if df[df['if_blank'] == 1].shape[0] == 0:
        if verbose:
            print('removal complete!')
        else:
            raise Exception('something went wrong with rows removal before training')
            
    #2.2.Premature Tokenization Strategy (pre-tokenizer)
    #Pre-Tokenizer + not removing provisory tokenized column
    #inserting a tokenized column
    try:
        df = df.drop('tokenized', axis=1)
    except KeyError:
        print('OK')
    df.insert(1, 'tokenized', np.nan)
    #tokenizing over the provisory
    df['tokenized'] = df.apply(lambda x: udacourse2.fn_tokenize_fast(x['message']), axis=1)
    #removing NaN over provisory (if istill exist)
    df = df[df['tokenized'].notnull()]
    empty_tokens = df[df['tokenized'].apply(lambda x: len(x)) == 0].shape[0]
    if verbose:
        print('found {} rows with no tokens'.format(empty_tokens))
    df = df[df['tokenized'].apply(lambda x: len(x)) > 0]
    empty_tokens = df[df['tokenized'].apply(lambda x: len(x)) == 0].shape[0]
    if verbose:
        print('*after removal, found {} rows with no tokens'.format(empty_tokens))
    #I will drop the original 'message' column
    try:
        df = df.drop('message', axis=1)
    except KeyError:
        if verbose:
            print('OK')
    if verbose:
        print('now I have {} rows to train'.format(df.shape[0]))

    #2.3.Database Data Consistency Check/Fix
    #correction for aid_related
    df = udacourse2.fn_group_check(dataset=df,
                                   subset='aid',
                                   correct=True, 
                                   shrink=False, 
                                   shorten=False, 
                                   verbose=True)
    #correction for weather_related
    df = udacourse2.fn_group_check(dataset=df,
                                   subset='wtr',
                                   correct=True, 
                                   shrink=False, 
                                   shorten=False, 
                                   verbose=True)
    #correction for infrastrucutre_related
    df = udacourse2.fn_group_check(dataset=df,
                                   subset='ifr',
                                   correct=True, 
                                   shrink=False, 
                                   shorten=False, 
                                   verbose=True)
    #correction for related(considering that the earlier were already corrected)
    df = udacourse2.fn_group_check(dataset=df,
                                   subset='main',
                                   correct=True, 
                                   shrink=False, 
                                   shorten=False, 
                                   verbose=True)
    
    #load to database <-I don't know for what it is
    
    #3.Define features and label arrays (break the data)
    #3.1.X is the Training Text Column
    X = df['tokenized']
    #3.2.y is the Classification labels
    #I REMOVED "related" column from my labels, as it is impossible to train it!
    y = df[df.columns[5:]]
    remove_lst = []

    for column in y.columns:
        col = y[column]
        if (col == 0).all():
            if verbose:
                print('*{} -> only zeroes training column!'.format(column))
            remove_lst.append(column)
        else:
            #print('*{} -> column OK'.format(column))
            pass
        
    if verbose:
        print(remove_lst)
    y = y.drop(remove_lst, axis=1)
    
    spent = time() - start
    if verbose:
        print('*dataset breaked into X-Training Text Column and Y-Multilabels')    
        print('process time:{:.0f} seconds'.format(spent))
    return X, y

In [16]:
#########1#########2#########3#########4#########5#########6#########7#########8
def build_model(verbose=False):
    '''This function builds the Classifier Pipeline, for future fitting
    Inputs:
      - verbose (optional) - if you want some verbosity during the running 
        (default=False)
    Output:
      - model_pipeline for your Classifiear (untrained)
    '''
    if verbose:
        print('###build_model function started')
    start = time()
    
    #1.text processing and model pipeline
    #(text processing was made at a earlier step, at Load Data function)
    feats = TfidfVectorizer(analyzer='word', 
                            tokenizer=dummy, 
                            preprocessor=dummy,
                            token_pattern=None,
                            ngram_range=(1, 3))
    
    classif = OneVsRestClassifier(LinearSVC(C=2., 
                                            random_state=42))
    
    model_pipeline = Pipeline([('vect', feats),
                               ('clf', classif)])
    
    #define parameters for GridSearchCV (parameters already defined)
    #create gridsearch object and return as final model pipeline (made at pipeline preparation)
    #obs: for better performance, I pre-tokenized my data. And GridSearch was runned on Jupyter,
    #     and the best parameters where adjusted, just to save processing time during code execution.
    spent = time() - start
    if verbose:
        print('*Linear Support Vector Machine pipeline was created')
        print('process time:{:.0f} seconds'.format(spent))
    return model_pipeline

In [17]:
#########1#########2#########3#########4#########5#########6#########7#########8
def train(X, 
          y, 
          model, 
          verbose=False):
    '''This function trains your already created Classifier Pipeline
    Inputs:
      - X (mandatory) - tokenized data for training - Pandas Series
      - y (mandatory) - Multilabels 0|1 - Pandas Dataset
      - verbose (optional) - if you want some verbosity during the running 
        (default=False)
    Output:
      - trained model'''
    if verbose:
        print('###train function started')
    start = time()

    #1.Train test split
    #Split makes randomization, so random_state parameter was set
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size=0.25, 
                                                        random_state=42)
    if (X_train.shape[0] + X_test.shape[0]) == X.shape[0]:
        if verbose:
            print('data split into train and text seems OK')
    else:
        raise Exception('something went wrong when splitting the data')
        
    #2.fit the model
    model.fit(X_train, y_train)
    
    # output model test results
    y_pred = pipeline_lnsv.predict(X_test)
    if verbose:
        metrics = udacourse2.fn_scores_report2(y_test, 
                                               y_pred,
                                               best_10=True,
                                               verbose=True)
    else:
        metrics = udacourse2.fn_scores_report2(y_test, 
                                               y_pred,
                                               best_10=True,
                                               verbose=False)

    for metric in metrics:
        if metric < 0.6:
            raise Exception('something is wrong, model is predicting poorly')

    spent = time() - start
    if verbose:
        print('*classifier was trained!')
        print('process time:{:.0f} seconds'.format(spent))
    return model

In [18]:
#########1#########2#########3#########4#########5#########6#########7#########8
def export_model(model,
                 file_name='classifier.pkl'
                 verbose=False):
    '''This function writes your already trained Classifiear as a Picke Binary
    file.
    Inputs:
      - model (mandatory) - your already trained Classifiear - Python Object
      - file_name (optional) - the name of the file to be created (default:
         'classifier.pkl')
      - verbose (optional) - if you want some verbosity during the running 
        (default=False)
       Output: return True if everything runs OK
      ''' 
    if verbose:
        print('###export_model function started')
    start = time()

    #1.Export model as a pickle file
    file_name = file_name

    #writing the file
    with open (file_name, 'wb') as pk_writer: 
        pickle.dump(model, pk_writer)

    #reading the file
    #with open('classifier.pkl', 'rb') as pk_reader:
    #    model = pickle.load(pk_reader)
    
    if verbose:
        print('*trained Classifier was exported')
        print('process time:{:.0f} seconds'.format(spent))
        
    return True

In [21]:
#########1#########2#########3#########4#########5#########6#########7#########8
def run_pipeline(data_file='sqlite:///Messages.db', 
                 verbose=False):
    '''This function is a caller: it calls load, build, train and save modules
    Inputs:
      - data_file (optional) - complete path to the SQLite datafile to be 
        processed - (default='sqlite:///Messages.db')
      - verbose (optional) - if you want some verbosity during the running 
        (default=False)
    Output: return True if everything runs OK
    '''
    if verbose:
        print('###run_pipeline function started')
    start = time()

    #1.Run ETL pipeline
    X, y = load_data(data_file, 
                     verbose=verbose)
    #2.Build model pipeline
    model = build_model(verbose=verbose)
    #3.Train model pipeline
    model = train(X, 
                  y, 
                  model, 
                  verbose=verbose)
    # save model
    export_model(model,
                 verbose=verbose)

    if verbose:
        print('process time:{:.0f} seconds'.format(spent))
    return True

In [22]:
run_pipeline(data_file='sqlite:///Messages.db',
             verbose=True)

###run_pipeline function started
###load_data function started
existing tables in my SQLite database: ['Messages']
all labels are blank in 6317 rows
remaining rows: 19928
removal complete!
OK
found 6 rows with no tokens
*after removal, found 0 rows with no tokens
now I have 19922 rows to train
###function group_check started
  - count for main class:aid_related, 10877 entries
  - for main, without any sub-categories,  3515 entries
  - for subcategories,  7388 entries
  - for lost parent sub-categories,  26 entries
    *correcting, new count: 0 entries
elapsed time: 0.0843s
###function group_check started
  - count for main class:weather_related, 7304 entries
  - for main, without any sub-categories,  1359 entries
  - for subcategories,  5945 entries
  - for lost parent sub-categories,  0 entries
    *correcting, new count: 0 entries
elapsed time: 0.0388s
###function group_check started
  - count for main class:infrastructure_related, 1705 entries
  - for main, without any sub-categorie

True

In [20]:
if __name__ == '__main__':
    data_file = sys.argv[1]  # get filename of dataset
    run_pipeline(data_file)  # run data pipeline

ArgumentError: Could not parse rfc1738 URL from string '-f'