# ML Pipeline Preparation

# 1. Import libraries and load data from database.

In [1]:
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\crdea\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\crdea\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\crdea\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import re
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import sys
import os
import re
from sqlalchemy import create_engine
import pickle

from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [3]:
# load data from database
engine = create_engine('sqlite:///all_messages.db')
df = pd.read_sql('SELECT * FROM all_messages', engine)
df.head(2)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0


In [4]:
# split data in X and Y
X = df['message']
y = df.iloc[:, 4:]

In [5]:
y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Transforming this process in a function

def load_data(database_filepath):
    """Load cleaned data from database into dataframe.
    Args:
        database_filepath: String. It contains cleaned data table.
        table_name: String. It contains cleaned data.
    Returns:
       X: numpy.ndarray. Disaster messages.
       Y: numpy.ndarray. Disaster categories for each messages.
       category_name: list. Disaster category names.
    """
    # load data from database
    engine = create_engine('sqlite:///' + database_filepath)
    df = pd.read_sql('SELECT * FROM all_messages', con=engine)

    category_names = df.columns[4:]

    X = df[['message']].values[:, 0]
    y = df[category_names].values

    return X, y

# 2. Write a tokenization function to process your text data

In [7]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [8]:
def tokenize(text):
    
    """Function that tokenizes a text.
    Args:
        text: String. A disaster message.
        lemmatizer: nltk.stem.Lemmatizer.
    Returns:
        list containing tokens.
    """
    
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [9]:
# test out function
X, y = load_data('all_messages.db')
for message in X[:5]:
    tokens = tokenize(message)
    print(message)
    print(tokens, '\n')

Weather update - a cold front from Cuba that could pass over Haiti
['weather', 'update', '-', 'a', 'cold', 'front', 'from', 'cuba', 'that', 'could', 'pas', 'over', 'haiti'] 

Is the Hurricane over or is it not over
['is', 'the', 'hurricane', 'over', 'or', 'is', 'it', 'not', 'over'] 

Looking for someone but no name
['looking', 'for', 'someone', 'but', 'no', 'name'] 

UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.
['un', 'report', 'leogane', '80-90', 'destroyed', '.', 'only', 'hospital', 'st.', 'croix', 'functioning', '.', 'needs', 'supply', 'desperately', '.'] 

says: west side of Haiti, rest of the country today and tonight
['say', ':', 'west', 'side', 'of', 'haiti', ',', 'rest', 'of', 'the', 'country', 'today', 'and', 'tonight'] 



# 3. Build a machine learning pipeline

In [13]:
def build_model():
    
    """Pipeline with feature extraction for the classification task.
    Returns:
        pipeline: MultiOutputClassifier(RandomForestClassifier()). 
    """
    # Set pipeline
    
    classifier = RandomForestClassifier(n_estimators=200)

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(classifier, n_jobs=1
         
        ))
                    ])
    
    return pipeline

# 4. Train pipeline

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

model = build_model()
model.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x0000024C5762A820>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(n_estimators=200),
                                       n_jobs=1))])

In [15]:
y_pred = model.predict(X_test)

# 5. Test your model

In [16]:
def evaluate_model(y_pred, Y_test, category_names):
    
    """Evaluate model
    Args:
        y_pred: numpy.ndarray. Predicted disaster message for each category.
        Y_test: numpy.ndarray. Disaster categories for each messages
        category_names: Disaster category names.
    """
    
    # Predict categories of messages.
#     category_names = y.columns
    Y_pred = y_pred

    # Print accuracy, precision, recall and f1_score for each categories
    for i in range(0, len(category_names)):
        print(category_names[i])
        print("\tAccuracy: {:.4f}\t\t% Precision: {:.4f}\t\t% Recall: {:.4f}\t\t% F1_score: {:.4f}".format(
            accuracy_score(Y_test[:, i], Y_pred[:, i]),
            precision_score(Y_test[:, i], Y_pred[:, i], average='weighted'),
            recall_score(Y_test[:, i], Y_pred[:, i], average='weighted'),
            f1_score(Y_test[:, i], Y_pred[:, i], average='weighted')
        ))

In [17]:
# Test out the function 
category_names = df.iloc[:, 4:].columns
evaluate_model(y_pred, y_test, category_names)

related
	Accuracy: 0.7963		% Precision: 0.7900		% Recall: 0.7963		% F1_score: 0.7557
request
	Accuracy: 0.8926		% Precision: 0.8900		% Recall: 0.8926		% F1_score: 0.8773
offer
	Accuracy: 0.9953		% Precision: 0.9906		% Recall: 0.9953		% F1_score: 0.9929
aid_related
	Accuracy: 0.7725		% Precision: 0.7743		% Recall: 0.7725		% F1_score: 0.7666
medical_help
	Accuracy: 0.9211		% Precision: 0.9060		% Recall: 0.9211		% F1_score: 0.8873
medical_products
	Accuracy: 0.9516		% Precision: 0.9415		% Recall: 0.9516		% F1_score: 0.9307
search_and_rescue
	Accuracy: 0.9731		% Precision: 0.9673		% Recall: 0.9731		% F1_score: 0.9616
security
	Accuracy: 0.9831		% Precision: 0.9834		% Recall: 0.9831		% F1_score: 0.9748
military
	Accuracy: 0.9684		% Precision: 0.9564		% Recall: 0.9684		% F1_score: 0.9551
child_alone
	Accuracy: 1.0000		% Precision: 1.0000		% Recall: 1.0000		% F1_score: 1.0000
water
	Accuracy: 0.9475		% Precision: 0.9462		% Recall: 0.9475		% F1_score: 0.9321
food


  _warn_prf(average, modifier, msg_start, len(result))


	Accuracy: 0.9329		% Precision: 0.9303		% Recall: 0.9329		% F1_score: 0.9227
shelter
	Accuracy: 0.9365		% Precision: 0.9343		% Recall: 0.9365		% F1_score: 0.9201
clothing
	Accuracy: 0.9867		% Precision: 0.9869		% Recall: 0.9867		% F1_score: 0.9806
money
	Accuracy: 0.9771		% Precision: 0.9689		% Recall: 0.9771		% F1_score: 0.9672
missing_people
	Accuracy: 0.9895		% Precision: 0.9896		% Recall: 0.9895		% F1_score: 0.9844
refugees
	Accuracy: 0.9672		% Precision: 0.9491		% Recall: 0.9672		% F1_score: 0.9518
death
	Accuracy: 0.9571		% Precision: 0.9520		% Recall: 0.9571		% F1_score: 0.9400
other_aid
	Accuracy: 0.8727		% Precision: 0.8485		% Recall: 0.8727		% F1_score: 0.8175
infrastructure_related
	Accuracy: 0.9355		% Precision: 0.8759		% Recall: 0.9355		% F1_score: 0.9047
transport
	Accuracy: 0.9558		% Precision: 0.9483		% Recall: 0.9558		% F1_score: 0.9372
buildings
	Accuracy: 0.9510		% Precision: 0.9420		% Recall: 0.9510		% F1_score: 0.9305
electricity
	Accuracy: 0.9792		% Precision: 0.9

# 6. Improve your model

In [None]:
# I am having issues to improve my model
# I can´t figure it out how to use GridSearch with MultiOutputClassifier

In [19]:
classifier = RandomForestClassifier()

pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(classifier, n_jobs=-1
         
        ))
                    ])
    
parameters = {

        'clf__n_estimators': [50, 100, 200],
        'clf__min_samples_split': [2, 3, 4]

    }

# cv = GridSearchCV(pipeline, param_grid=parameters)
cv = GridSearchCV(pipeline, param_grid = parameters, cv = 10,
                  refit = True, verbose = 1, return_train_score = True, n_jobs = 1)

In [20]:
# This is the error I am not able to correct
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter min_samples_split for estimator MultiOutputClassifier(estimator=RandomForestClassifier(), n_jobs=-1). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
def improve_model():

    """
    
    Improve model: uses Gridsearch in order to find the optimal model hyperparams.


    """
        
    classifier = RandomForestClassifier(n_estimators=200)

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(classifier, n_jobs=-1
         
        ))
                    ])
    
    parameters = {

        'clf__n_estimators': [50, 100, 200],
        'clf__min_samples_split': [2, 3, 4]

    }

    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv

# 7. Test your model

# 8. Try improving your model further. Here are a few ideas

In [None]:
# I am still not able to make the GridSearch method work for multiple output classes

In [None]:
def new_build_model():
    """Build model.
    Returns:
        pipeline: sklearn.model_selection.GridSearchCV. 
    """
    # Set pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            AdaBoostClassifier(
                base_estimator=DecisionTreeClassifier(max_depth=1, class_weight='balanced'),
                learning_rate=0.3,
                n_estimators=200
            )
        ))
    ])

    # Set parameters for gird search
    parameters = {
        'clf__estimator__learning_rate': [0.1, 0.3],
        'clf__estimator__n_estimators': [100, 200]
    }

    # Set grid search
    cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=3, scoring='f1_weighted', verbose=3)

    return cv

# 9. Export your model as a pickle file

In [23]:
def save_model(model, model_filepath):
    """Save model
    Args:
        model: sklearn.model_selection.GridSearchCV. It contains a sklearn estimator.
        model_filepath: String. Trained model is saved as pickel into this file.
    """
    with open(model_filepath, 'wb') as file:
        pickle.dump(model, file)

In [24]:
# test out the function

save_model(model, 'pickle_1')

In [21]:
with open('pickle_1', 'wb') as file:
        pickle.dump(model, file)

# 10. Use this notebook to complete train.py

In [None]:
import pickle
import re
import sys
import warnings
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords'])
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy import create_engine


def load_data(database_filepath):
    """Load cleaned data from database into dataframe.
    Args:
        database_filepath: String. It contains cleaned data table.
        table_name: String. It contains cleaned data.
    Returns:
       X: numpy.ndarray. Disaster messages.
       Y: numpy.ndarray. Disaster categories for each messages.
       category_name: list. Disaster category names.
    """
    # load data from database
    engine = create_engine('sqlite:///' + database_filepath)
    df = pd.read_sql('SELECT * FROM all_messages', con=engine)

    category_names = df.columns[4:]

    X = df[['message']].values[:, 0]
    y = df[category_names].values

    return X, y

url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def tokenize(text):
    
    """Function that tokenizes a text.
    Args:
        text: String. A disaster message.
        lemmatizer: nltk.stem.Lemmatizer.
    Returns:
        list containing tokens.
    """
    
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

def new_build_model():
    """Build model.
    Returns:
        pipeline: sklearn.model_selection.GridSearchCV. 
    """
    # Set pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(
            AdaBoostClassifier(
                base_estimator=DecisionTreeClassifier(max_depth=1, class_weight='balanced'),
                learning_rate=0.3,
                n_estimators=200
            )
        ))
    ])

    # Set parameters for gird search
    parameters = {
        'clf__estimator__learning_rate': [0.1, 0.3],
        'clf__estimator__n_estimators': [100, 200]
    }

    # Set grid search
    cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=3, scoring='f1_weighted', verbose=3)

    return cv


def evaluate_model(y_pred, Y_test, category_names):
    
    """Evaluate model
    Args:
        y_pred: numpy.ndarray. Predicted disaster message for each category.
        Y_test: numpy.ndarray. Disaster categories for each messages
        category_names: Disaster category names.
    """
    
    # Predict categories of messages.
#     category_names = y.columns
    Y_pred = y_pred

    # Print accuracy, precision, recall and f1_score for each categories
    for i in range(0, len(category_names)):
        print(category_names[i])
        print("\tAccuracy: {:.4f}\t\t% Precision: {:.4f}\t\t% Recall: {:.4f}\t\t% F1_score: {:.4f}".format(
            accuracy_score(Y_test[:, i], Y_pred[:, i]),
            precision_score(Y_test[:, i], Y_pred[:, i], average='weighted'),
            recall_score(Y_test[:, i], Y_pred[:, i], average='weighted'),
            f1_score(Y_test[:, i], Y_pred[:, i], average='weighted')
        ))


def save_model(model, model_filepath):
    """Save model
    Args:
        model: sklearn.model_selection.GridSearchCV. It contains a sklearn estimator.
        model_filepath: String. Trained model is saved as pickel into this file.
    """
    with open(model_filepath, 'wb') as file:
        pickle.dump(model, file)

def main():
    X, y = load_data(('all_messages.db'))
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = build_model()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    category_names = df.iloc[:, 4:].columns
    evaluate_model(y_pred, y_test, category_names)
    
    save_model(model, 'pickle')

if __name__ == '__main__':
    main()