# ML Pipeline Preparation

### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with read_sql_table
- Define feature and target variables X and Y

In [2]:
# import libraries
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine
import warnings
import string
import unittest
warnings.filterwarnings("ignore")

# import NLP libraries
import re
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') # download for lemmatization

# import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score, recall_score, f1_score,classification_report, make_scorer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin

[nltk_data] Downloading package punkt to /Users/kechen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kechen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kechen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# load data from database
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('DisasterResponse', engine)
X = df['message']
Y = df.drop(['id', 'message', 'original', 'genre'], axis=1)

### 2. Write a tokenization function to process your text data

In [4]:

def tokenize(text):
    table = text.maketrans(dict.fromkeys(string.punctuation))
    words = word_tokenize(text.lower().strip().translate(table))
    words = [word for word in words if word not in stopwords.words('english')]
    lemmed = [WordNetLemmatizer().lemmatize(word) for word in words]
    lemmed = [WordNetLemmatizer().lemmatize(word, pos='v') for word in lemmed]
    stemmed = [PorterStemmer().stem(word) for word in lemmed]
    return stemmed

### 3. Build a machine learning pipeline



In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
def build_pipeline():
    
    # build NLP pipeline - count words, tf-idf, multiple output classifier
    pipeline = Pipeline([
        ('vec', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    
    return pipeline

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)
pipeline = build_pipeline()
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_..._score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=None))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [7]:
y_pred = pipeline.predict(X_test)

results_dict = {}

for pred, label, col in zip(y_pred.transpose(), y_test.values.transpose(), y_test.columns):
    print(col)
    print(classification_report(label, pred))
    results_dict[col] = classification_report(label, pred, output_dict=True)
   

related
              precision    recall  f1-score   support

         0.0       0.33      0.14      0.19      1675
         1.0       0.75      0.90      0.82      4911

   micro avg       0.71      0.71      0.71      6586
   macro avg       0.54      0.52      0.51      6586
weighted avg       0.65      0.71      0.66      6586

request
              precision    recall  f1-score   support

         0.0       0.84      0.97      0.90      5451
         1.0       0.38      0.09      0.14      1135

   micro avg       0.82      0.82      0.82      6586
   macro avg       0.61      0.53      0.52      6586
weighted avg       0.76      0.82      0.77      6586

offer
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      6552
         1.0       0.00      0.00      0.00        34

   micro avg       0.99      0.99      0.99      6586
   macro avg       0.50      0.50      0.50      6586
weighted avg       0.99      0.99      0.99      658

shops
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      6554
         1.0       0.00      0.00      0.00        32

   micro avg       1.00      1.00      1.00      6586
   macro avg       0.50      0.50      0.50      6586
weighted avg       0.99      1.00      0.99      6586

aid_centers
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      6512
         1.0       0.00      0.00      0.00        74

   micro avg       0.99      0.99      0.99      6586
   macro avg       0.49      0.50      0.50      6586
weighted avg       0.98      0.99      0.98      6586

other_infrastructure
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      6299
         1.0       0.06      0.00      0.01       287

   micro avg       0.95      0.95      0.95      6586
   macro avg       0.51      0.50      0.49      6586
weighted avg       0.92      0.95  

There are a number of categories with small numbers of messages where none were picked up:

- offer
- missing_people
- tools
- hospitals
- shops
- add_centers
- fire


Let's take a look at the overall performance (weighted average) of the model.

In [8]:
weighted_avg = {}
for key in results_dict.keys():
    weighted_avg[key] = results_dict[key]['weighted avg']

df_wavg = pd.DataFrame(weighted_avg).transpose()
df_wavg

Unnamed: 0,f1-score,precision,recall,support
related,0.66288,0.646478,0.709535,6586.0
request,0.7682,0.757984,0.818251,6586.0
offer,0.992263,0.989702,0.994838,6586.0
aid_related,0.532054,0.539464,0.569997,6586.0
medical_help,0.883753,0.856734,0.91649,6586.0
medical_products,0.924406,0.907895,0.945035,6586.0
search_and_rescue,0.953918,0.943823,0.968418,6586.0
security,0.975998,0.968364,0.983753,6586.0
military,0.956468,0.946171,0.969936,6586.0
child_alone,1.0,1.0,1.0,6586.0


Because of the imbalance in the classes, it is possible to get high accuracy scores even with all positive messages in a class falsely labeled as negative. 

As a result, it might be better to evaluate models with f1 score.

In [9]:
df_wavg['f1-score'].describe()

count    36.000000
mean      0.898782
std       0.108072
min       0.532054
25%       0.870835
50%       0.929116
75%       0.976807
max       1.000000
Name: f1-score, dtype: float64

Let's take a look at the lower 25% of the f1 scores

In [10]:
df_wavg[df_wavg['f1-score'] <= df_wavg['f1-score'].quantile(0.25)]

Unnamed: 0,f1-score,precision,recall,support
related,0.66288,0.646478,0.709535,6586.0
request,0.7682,0.757984,0.818251,6586.0
aid_related,0.532054,0.539464,0.569997,6586.0
food,0.83734,0.808747,0.882022,6586.0
other_aid,0.809408,0.772755,0.857425,6586.0
weather_related,0.663748,0.675231,0.719405,6586.0
floods,0.868038,0.837423,0.905102,6586.0
storm,0.85659,0.847775,0.897206,6586.0
direct_report,0.733285,0.713111,0.793046,6586.0


In [11]:
key_categories = (df_wavg[df_wavg['f1-score'] <= df_wavg['f1-score'].quantile(0.25)].index).tolist()
key_categories

['related',
 'request',
 'aid_related',
 'food',
 'other_aid',
 'weather_related',
 'floods',
 'storm',
 'direct_report']

Also, let's take a look at the higher 25% f1 scores

In [12]:
df_wavg[df_wavg['f1-score'] >= df_wavg['f1-score'].quantile(0.75)]

Unnamed: 0,f1-score,precision,recall,support
offer,0.992263,0.989702,0.994838,6586.0
child_alone,1.0,1.0,1.0,6586.0
clothing,0.979234,0.975891,0.985727,6586.0
missing_people,0.980758,0.97465,0.986942,6586.0
tools,0.989763,0.986381,0.993167,6586.0
hospitals,0.984917,0.980056,0.989827,6586.0
shops,0.992718,0.990306,0.995141,6586.0
aid_centers,0.983026,0.977651,0.98846,6586.0
fire,0.980229,0.973755,0.98679,6586.0



Some of the highest performing categories appear to be doing so because of imbalance - there are very few 1s in these categories and so their metrics are still high even if they are all classified as 0s.

As a result, it seems reasonable to seek to improve the performance of the original "lowest" (25-percentile) performing categories.

In [13]:
df_wavg.loc[key_categories]['f1-score'].describe()

count    9.000000
mean     0.747949
std      0.111554
min      0.532054
25%      0.663748
50%      0.768200
75%      0.837340
max      0.868038
Name: f1-score, dtype: float64

The average f1-score for the key categories with the base model is 0.75 with a std of 0.11.

Let's create a function to easily assess a model's performance for those key categories.

In [14]:
def model_performance(labels, preds):
    # Print/collect results
    results_dict = {}
    for pred, label, col in zip(preds.transpose(), labels.values.transpose(), labels.columns):
        print(col)
        print(classification_report(label, pred))
        results_dict[col] = classification_report(label, pred, output_dict=True)
    
    # Convert to df
    weighted_avg = {}
    for key in results_dict.keys():
        weighted_avg[key] = results_dict[key]['weighted avg']

    df_wavg = pd.DataFrame(weighted_avg).transpose()
    
    # Show metrics for "key categories"
    key_categories = ['related', 'request', 'aid_related', 'food',
                      'shelter', 'other_aid', 'storm',
                      'weather_related', 'direct_report']
    
    print(df_wavg.loc[key_categories]['f1-score'].describe())
    
    return df_wavg

### 6. Improve your model
Use grid search to find better parameters. 

In [15]:
pipeline.get_params()

{'memory': None,
 'steps': [('vec',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=<function tokenize at 0x1a16ba01e0>, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
               oob_score=False, random_state=N

In [27]:
make_scorer(f1_score, average='micro')

parameters = {
    'vec__max_df': [0.8],
    'clf__estimator__max_depth': (25, 50, None),
    'clf__estimator__min_samples_split': (2, 10, 25, 50, 100), 
    'clf__estimator__n_estimators': [200]
}

cv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1 ,verbose=10)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  


In [28]:
cv.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 55.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 81.8min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 142.7min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 157.2min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 206.9min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 245.3min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 344.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 494.6min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed: 604.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_..._score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=None))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vec__max_df': [0.8], 'clf__estimator__max_depth': (25, 50, None), 'clf__estimator__min_samples_split': (2, 10, 25, 50, 100), 'clf__estimator__n_estimators': [200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [30]:
y_pred_cv = cv.predict(X_test)
cv_results = model_performance(y_test, y_pred_cv)

related
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1675
         1.0       0.75      1.00      0.85      4911

   micro avg       0.75      0.75      0.75      6586
   macro avg       0.37      0.50      0.43      6586
weighted avg       0.56      0.75      0.64      6586

request
              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91      5451
         1.0       0.00      0.00      0.00      1135

   micro avg       0.83      0.83      0.83      6586
   macro avg       0.41      0.50      0.45      6586
weighted avg       0.69      0.83      0.75      6586

offer
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      6552
         1.0       0.00      0.00      0.00        34

   micro avg       0.99      0.99      0.99      6586
   macro avg       0.50      0.50      0.50      6586
weighted avg       0.99      0.99      0.99      658

              precision    recall  f1-score   support

         0.0       0.73      1.00      0.84      4726
         1.0       0.79      0.04      0.08      1860

   micro avg       0.73      0.73      0.73      6586
   macro avg       0.76      0.52      0.46      6586
weighted avg       0.74      0.73      0.62      6586

floods
              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95      6011
         1.0       0.00      0.00      0.00       575

   micro avg       0.91      0.91      0.91      6586
   macro avg       0.46      0.50      0.48      6586
weighted avg       0.83      0.91      0.87      6586

storm
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95      5925
         1.0       0.00      0.00      0.00       661

   micro avg       0.90      0.90      0.90      6586
   macro avg       0.45      0.50      0.47      6586
weighted avg       0.81      0.90      0.85      6586

fire
 

In [31]:
cv.best_score_

0.19719593055625853

In [32]:
cv.best_params_

{'clf__estimator__max_depth': 50,
 'clf__estimator__min_samples_split': 100,
 'clf__estimator__n_estimators': 200,
 'vec__max_df': 0.8}

### 8. Try improving the model further

As we have already tried the parallel ensemble method(base learners are generated in parallel) - Random Forest, now let's try a sequential ensemble method(base learners are generated sequentially) - AdaBoost.
Here's an article that I found is a good resource to understand AdaBoost: https://www.datacamp.com/community/tutorials/adaboost-classifier-python 

In [33]:
pipeline_ada = Pipeline([
                                ('vect', CountVectorizer(tokenizer=tokenize)),
                                ('tfidf', TfidfTransformer()),
                                ('clf', MultiOutputClassifier(AdaBoostClassifier(n_estimators = 100)))
                            ])
pipeline_ada.fit(X_train, y_train)
y_pred_ada = pipeline_ada.predict(X_test)
ada_results = model_performance(y_test, y_pred_ada)


related
              precision    recall  f1-score   support

         0.0       0.39      0.06      0.11      1675
         1.0       0.75      0.97      0.85      4911

   micro avg       0.74      0.74      0.74      6586
   macro avg       0.57      0.51      0.48      6586
weighted avg       0.66      0.74      0.66      6586

request
              precision    recall  f1-score   support

         0.0       0.84      0.96      0.90      5451
         1.0       0.44      0.14      0.21      1135

   micro avg       0.82      0.82      0.82      6586
   macro avg       0.64      0.55      0.55      6586
weighted avg       0.77      0.82      0.78      6586

offer
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      6552
         1.0       0.00      0.00      0.00        34

   micro avg       0.99      0.99      0.99      6586
   macro avg       0.50      0.50      0.50      6586
weighted avg       0.99      0.99      0.99      658

weather_related
              precision    recall  f1-score   support

         0.0       0.75      0.94      0.83      4726
         1.0       0.55      0.19      0.28      1860

   micro avg       0.73      0.73      0.73      6586
   macro avg       0.65      0.56      0.56      6586
weighted avg       0.69      0.73      0.68      6586

floods
              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95      6011
         1.0       0.23      0.01      0.02       575

   micro avg       0.91      0.91      0.91      6586
   macro avg       0.57      0.50      0.49      6586
weighted avg       0.85      0.91      0.87      6586

storm
              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95      5925
         1.0       0.42      0.08      0.13       661

   micro avg       0.90      0.90      0.90      6586
   macro avg       0.66      0.53      0.54      6586
weighted avg       0.86      0.90      0.86  

### 9. Export your model as a pickle file

In [34]:
pickle.dump(pipeline, open('rf_model.pkl', 'wb'))


In [35]:
pickle.dump(pipeline_ada, open('classifier.pkl', 'wb'))