# ML Pipeline Preparation
### 1. Import libraries and load data from database

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

import re
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pickle

[nltk_data] Downloading package punkt to /Users/chang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/chang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/chang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load the cleaned messages table from the SQLite database
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('cleaned_data', engine)

# X: raw message text, Y: every column from position 4 onward (the multi-label targets)
# NOTE(review): assumes the first four columns are non-target metadata — confirm against the table schema
X = df['message'].values
Y = df.iloc[:, 4:]

# split to training and testing sets; the seed is fixed so the split (and every
# report computed from Y_test) is identical on each Restart & Run All
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

### 2. Write tokenization function to process text data

In [3]:
def tokenize(text):

    '''
    The function normalizes a raw message and splits it into cleaned word tokens.

    Input:
    text: message string to tokenize

    Output:
    tokens: list of lower-cased, lemmatized tokens with English stopwords removed;
            each detected URL is replaced by the text "urlplaceholder" before tokenizing
    '''

    # replace urls with urlplaceholder; one regex substitution swaps each match
    # in place, so a URL that is a prefix of another cannot leave a partially
    # replaced fragment behind (as repeated str.replace calls could).
    # The raw string keeps the pattern identical while avoiding the invalid
    # "\(" escape of a plain string literal.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_regex, "urlplaceholder", text)

    # normalize case and turn every non-alphanumeric character into a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    tokens = word_tokenize(text)

    # build the stopword set and the lemmatizer once per call, not once per token
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # stopwords are filtered on the raw token; the survivors are then lemmatized
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

### 3. Build a machine learning pipeline

In [4]:
# baseline pipeline: token counts -> tf-idf weighting -> a random forest
# wrapped in MultiOutputClassifier to predict all category columns
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier())),
])

### 4. Train pipeline

In [5]:
# fit the baseline pipeline, predict on the held-out messages, and print the
# wall-clock seconds spent on fit + predict together
t0 = time.time()
Y_pred = pipeline.fit(X_train, Y_train).predict(X_test)
print(time.time() - t0)

430.1894052028656


### 5. Test model

In [6]:
# per-category precision, recall and f1-score of the baseline predictions
print(classification_report(y_true=Y_test, y_pred=Y_pred, target_names=Y.columns))

                        precision    recall  f1-score   support

               related       0.84      0.95      0.89      5021
               request       0.84      0.50      0.63      1104
                 offer       0.00      0.00      0.00        29
           aid_related       0.75      0.70      0.72      2666
          medical_help       0.62      0.06      0.11       491
      medical_products       0.86      0.09      0.16       329
     search_and_rescue       0.47      0.04      0.08       190
              security       0.00      0.00      0.00       116
              military       0.70      0.08      0.14       183
           child_alone       0.00      0.00      0.00         0
                 water       0.88      0.32      0.47       417
                  food       0.83      0.65      0.73       714
               shelter       0.82      0.37      0.51       602
              clothing       1.00      0.08      0.15       110
                 money       0.80      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. Improve the model using grid search

In [8]:
# display every tunable parameter of the pipeline; the double-underscore keys
# (e.g. 'vect__ngram_range') are the names a grid-search parameter grid refers to
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x1279c7160>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x1279c7160>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_i

In [9]:
# specify the parameter grid to search and define the cv pipeline
# (trailing comments record the value the search selected, see cv.best_params_)
parameters = {
    'clf__estimator__n_estimators': [25, 50],      # best: 50
    'clf__estimator__max_depth': [300, 400],       # best: 400
    'clf__estimator__min_samples_split': [4, 6],   # best: 6
}

cv = GridSearchCV(estimator=pipeline, param_grid=parameters)

In [10]:
# run the grid search on the training set, predict the held-out messages with
# the fitted search object, and print the elapsed wall-clock seconds
t0 = time.time()
Y_pred_cv = cv.fit(X_train, Y_train).predict(X_test)
print(time.time() - t0)

4859.257224082947


In [11]:
# display the parameter combination the grid search selected for the baseline pipeline
cv.best_params_

{'clf__estimator__max_depth': 400,
 'clf__estimator__min_samples_split': 6,
 'clf__estimator__n_estimators': 50}

### 7. Test tuned model

In [16]:
# per-category precision, recall and f1-score of the grid-searched baseline model
print(classification_report(y_true=Y_test, y_pred=Y_pred_cv, target_names=Y.columns))

                        precision    recall  f1-score   support

               related       0.83      0.95      0.89      5075
               request       0.86      0.52      0.65      1148
                 offer       0.00      0.00      0.00        28
           aid_related       0.75      0.72      0.73      2752
          medical_help       0.65      0.12      0.20       489
      medical_products       0.73      0.06      0.11       324
     search_and_rescue       0.56      0.05      0.09       189
              security       0.00      0.00      0.00       106
              military       0.60      0.07      0.12       219
           child_alone       0.00      0.00      0.00         0
                 water       0.89      0.33      0.48       427
                  food       0.86      0.53      0.65       766
               shelter       0.83      0.42      0.56       583
              clothing       0.78      0.16      0.26        88
                 money       0.50      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 8. Improve and test the model by adding a text-length feature

In [13]:
# transformer that turns each text into a single numeric feature: its length
class TextLengthExtractor(BaseEstimator, TransformerMixin):
    '''Feature extractor returning len() of every element of the input.'''

    def fit(self, X, y=None):
        '''Nothing is learned from the data; return the transformer itself.'''
        return self

    def transform(self, X):
        '''Return a one-column DataFrame holding len() of each element of X.'''
        texts = pd.Series(X)
        lengths = texts.apply(len)
        return pd.DataFrame(lengths)

In [14]:
# extended pipeline: the tf-idf text features and the text-length feature are
# computed side by side by a FeatureUnion and concatenated before the classifier
pipeline2 = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('text_pipeline', Pipeline(steps=[
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
        ])),
        ('text_length', TextLengthExtractor()),
    ])),
    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier())),
])

In [17]:
# fit the extended pipeline, predict on the held-out messages, and print the
# wall-clock seconds spent on fit + predict together
t0 = time.time()
Y_pred2 = pipeline2.fit(X_train, Y_train).predict(X_test)
print(time.time() - t0)

380.18575501441956


In [18]:
# per-category precision, recall and f1-score of the extended pipeline
print(classification_report(y_true=Y_test, y_pred=Y_pred2, target_names=Y.columns))

                        precision    recall  f1-score   support

               related       0.84      0.96      0.89      5075
               request       0.86      0.51      0.64      1148
                 offer       0.00      0.00      0.00        28
           aid_related       0.75      0.72      0.73      2752
          medical_help       0.61      0.09      0.15       489
      medical_products       0.72      0.08      0.14       324
     search_and_rescue       0.44      0.04      0.07       189
              security       0.00      0.00      0.00       106
              military       0.64      0.06      0.12       219
           child_alone       0.00      0.00      0.00         0
                 water       0.91      0.39      0.55       427
                  food       0.85      0.59      0.70       766
               shelter       0.80      0.40      0.54       583
              clothing       0.71      0.11      0.20        88
                 money       0.50      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# grid-search the extended pipeline over the same random-forest settings
# (trailing comments record the value the search selected, see cv2.best_params_)
parameters = {
    'clf__estimator__n_estimators': [25, 50],      # best: 50
    'clf__estimator__max_depth': [300, 400],       # best: 400
    'clf__estimator__min_samples_split': [4, 6],   # best: 4
}

cv2 = GridSearchCV(estimator=pipeline2, param_grid=parameters)

In [20]:
# run the grid search for the extended pipeline, predict the held-out messages,
# and print the elapsed wall-clock seconds
t0 = time.time()
Y_pred_cv2 = cv2.fit(X_train, Y_train).predict(X_test)
print(time.time() - t0)

4818.731374025345


In [21]:
# display the parameter combination the grid search selected for the extended pipeline
cv2.best_params_

{'clf__estimator__max_depth': 400,
 'clf__estimator__min_samples_split': 4,
 'clf__estimator__n_estimators': 50}

In [22]:
# per-category precision, recall and f1-score of the grid-searched extended model
print(classification_report(y_true=Y_test, y_pred=Y_pred_cv2, target_names=Y.columns))

                        precision    recall  f1-score   support

               related       0.83      0.96      0.89      5075
               request       0.84      0.51      0.63      1148
                 offer       0.00      0.00      0.00        28
           aid_related       0.75      0.72      0.73      2752
          medical_help       0.59      0.11      0.19       489
      medical_products       0.75      0.09      0.16       324
     search_and_rescue       0.52      0.07      0.13       189
              security       0.00      0.00      0.00       106
              military       0.65      0.07      0.12       219
           child_alone       0.00      0.00      0.00         0
                 water       0.91      0.38      0.54       427
                  food       0.83      0.64      0.72       766
               shelter       0.82      0.39      0.53       583
              clothing       0.76      0.15      0.25        88
                 money       0.50      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 9. Export model as a pickle file

In [24]:
# serialize the fitted extended pipeline; the context manager guarantees the
# file handle is flushed and closed even if pickling raises
# NOTE(review): this exports pipeline2 (default hyper-parameters), not the tuned
# cv2.best_estimator_ — confirm which model is meant to be shipped
with open('classifier.pkl', 'wb') as model_file:
    pickle.dump(pipeline2, model_file)