# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
from sqlalchemy import create_engine
import sqlite3
import re
import pandas as pd
import numpy as np
import pickle


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.metrics import classification_report,f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV


# Fetch the NLTK resources this notebook relies on: the 'punkt' models for
# word_tokenize, the WordNet data for WordNetLemmatizer, and the stop-word lists.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fengfangzhong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fengfangzhong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fengfangzhong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# load data from database
# pd.read_sql accepts a plain sqlite3 connection; close it as soon as the
# table is loaded so the database handle is not left open for the session.
engine = sqlite3.connect('./data/DisasterResponse.db')
try:
    df = pd.read_sql('select * from DisasterResponse', engine)
finally:
    engine.close()

# NOTE(review): assumes the last 36 columns of the table are the category
# labels -- confirm against the table schema.
cat = df.columns[-36:]

X = df['message']  # message text used as the model input
Y = df[cat]        # label columns used as the model targets

### 2. Write a tokenization function to process your text data

In [4]:
def tokenize(text):
    """Normalize, tokenize and lemmatize a message.

    The text is lower-cased, every character other than an ASCII letter or
    digit is replaced by a space, the result is split with NLTK's
    ``word_tokenize``, English stop words are dropped and the remaining
    tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        text (str): raw message text.

    Returns:
        list of str: lemmatized, stop-word-free tokens in their original order.
    """
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would otherwise be scanned once for every token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # normalizing text
    normalized = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())

    # splitting text into tokens
    tokens = word_tokenize(normalized)

    # lemmatize and remove stop words
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

### 3. Build a machine learning pipeline
- You'll find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [5]:
# Token counts -> TF-IDF weighting -> a random forest wrapped for multi-label output.
rf_multi_output = MultiOutputClassifier(RandomForestClassifier(random_state=42))
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', rf_multi_output),
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [6]:
# Hold out 20% of the messages for evaluation. A fixed random_state makes the
# split, and therefore every score reported afterwards, reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
           n_jobs=1))])

### 5. Test your model
Report the f1 score, precision and recall on both the training set and the test set. You can use sklearn's `classification_report` function here. 

In [7]:
# Predict the category labels for the held-out test messages.
y_pred = pipeline.predict(X_test)

In [9]:
# Per-category precision, recall and F1 on the test set, labelled with the column names in `cat`.
print(classification_report(y_test, y_pred, target_names=cat, digits=2))

                        precision    recall  f1-score   support

               related       0.85      0.92      0.88      4006
               request       0.79      0.40      0.53       873
                 offer       0.00      0.00      0.00        22
           aid_related       0.74      0.59      0.66      2156
          medical_help       0.70      0.10      0.18       426
      medical_products       0.75      0.15      0.25       261
     search_and_rescue       0.46      0.04      0.07       150
              security       0.20      0.01      0.02        90
              military       0.79      0.07      0.12       168
           child_alone       0.00      0.00      0.00         0
                 water       0.81      0.37      0.50       344
                  food       0.83      0.43      0.57       561
               shelter       0.80      0.33      0.46       489
              clothing       0.75      0.14      0.24        86
                 money       1.00      

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### 6. Improve your model
Use grid search to find better parameters. 

In [10]:
# Inspect the pipeline's parameters; nested ones use the
# '<step name>__<parameter>' keys that a GridSearchCV param_grid expects.
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=<function tokenize at 0x1a189c3840>, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=42, ver

In [11]:
# Candidate settings to search; keys follow the '<step name>__<parameter>' convention.
# NOTE(review): tokenize() lower-cases its input itself, so toggling
# 'vect__lowercase' probably does not change the extracted features -- confirm.
parameters = dict(
    vect__min_df=[1, 10, 50],
    vect__lowercase=[True, False],
    tfidf__smooth_idf=[True, False],
)

# Grid search over the pipeline with 2-fold cross-validation.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=2)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.

In [12]:
# Run the grid search on the training split only; the test split stays unseen.
cv.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
           n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__min_df': [1, 10, 50], 'vect__lowercase': [True, False], 'tfidf__smooth_idf': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
# Best mean cross-validated score found by the grid search.
cv.best_score_

0.2524318138470341

In [14]:
# Parameter combination that achieved the best cross-validated score.
cv.best_params_

{'tfidf__smooth_idf': True, 'vect__lowercase': True, 'vect__min_df': 50}

In [15]:
# Predict the test-set labels through the fitted grid search object.
y_pred = cv.predict(X_test)

In [16]:
# Per-category precision, recall and F1 of the tuned model on the test set.
print(classification_report(y_test, y_pred, target_names=cat, digits=2))

                        precision    recall  f1-score   support

               related       0.86      0.90      0.88      4006
               request       0.76      0.44      0.56       873
                 offer       0.00      0.00      0.00        22
           aid_related       0.73      0.64      0.68      2156
          medical_help       0.56      0.12      0.20       426
      medical_products       0.67      0.17      0.27       261
     search_and_rescue       0.58      0.05      0.09       150
              security       0.33      0.01      0.02        90
              military       0.43      0.05      0.10       168
           child_alone       0.00      0.00      0.00         0
                 water       0.78      0.43      0.56       344
                  food       0.78      0.64      0.71       561
               shelter       0.76      0.44      0.56       489
              clothing       0.75      0.14      0.24        86
                 money       0.92      

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [17]:
# Same text features as before, with AdaBoost as the wrapped base classifier.
# This rebinds `pipeline`, replacing the earlier random-forest pipeline.
ada_multi_output = MultiOutputClassifier(AdaBoostClassifier(random_state=42))
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', ada_multi_output),
])

In [18]:
# Fit the pipeline on the training split.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...timator=None,
          learning_rate=1.0, n_estimators=50, random_state=42),
           n_jobs=1))])

In [19]:
# Predict on the held-out test set and print per-category precision, recall and F1.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, target_names=cat, digits=2))

                        precision    recall  f1-score   support

               related       0.80      0.97      0.87      4006
               request       0.75      0.45      0.56       873
                 offer       0.00      0.00      0.00        22
           aid_related       0.75      0.63      0.68      2156
          medical_help       0.62      0.28      0.39       426
      medical_products       0.65      0.32      0.43       261
     search_and_rescue       0.60      0.19      0.28       150
              security       0.00      0.00      0.00        90
              military       0.63      0.30      0.41       168
           child_alone       0.00      0.00      0.00         0
                 water       0.72      0.63      0.67       344
                  food       0.79      0.62      0.69       561
               shelter       0.78      0.51      0.61       489
              clothing       0.76      0.43      0.55        86
                 money       0.58      

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [20]:
# Inspect the current pipeline's parameters to choose which ones to tune.
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=<function tokenize at 0x1a189c3840>, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
             learning_rate=1.0, n_estimators=50, random_state=42),
              n_jobs=1))],
 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1

In [21]:
# Candidate settings to search; keys follow the '<step name>__<parameter>' convention.
# NOTE(review): tokenize() lower-cases its input itself, so toggling
# 'vect__lowercase' probably does not change the extracted features -- confirm.
parameters = dict(
    vect__min_df=[10, 50, 100],
    vect__lowercase=[True, False],
    tfidf__smooth_idf=[True, False],
)

# Grid search over the pipeline with 2-fold cross-validation.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=2)

In [22]:
# Run the grid search on the training split only; the test split stays unseen.
cv.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...timator=None,
          learning_rate=1.0, n_estimators=50, random_state=42),
           n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__min_df': [10, 50, 100], 'vect__lowercase': [True, False], 'tfidf__smooth_idf': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
# Report the best mean cross-validated score and the parameters that produced it.
print("best_score:", cv.best_score_)
print("best_parameters:", cv.best_params_)

best_score: 0.23788861338928094
best_parameters: {'tfidf__smooth_idf': True, 'vect__lowercase': True, 'vect__min_df': 10}


In [24]:
# Predict the test-set labels through the fitted grid search object.
y_pred = cv.predict(X_test)

In [25]:
# Per-category precision, recall and F1 of the tuned model on the test set.
print(classification_report(y_test, y_pred, target_names=cat, digits=2))

                        precision    recall  f1-score   support

               related       0.80      0.97      0.87      4006
               request       0.75      0.48      0.59       873
                 offer       0.00      0.00      0.00        22
           aid_related       0.76      0.62      0.68      2156
          medical_help       0.65      0.27      0.38       426
      medical_products       0.64      0.34      0.45       261
     search_and_rescue       0.54      0.17      0.26       150
              security       0.08      0.01      0.02        90
              military       0.61      0.30      0.40       168
           child_alone       0.00      0.00      0.00         0
                 water       0.72      0.62      0.67       344
                  food       0.78      0.60      0.68       561
               shelter       0.77      0.51      0.61       489
              clothing       0.75      0.38      0.51        86
                 money       0.70      

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### 9. Export your model as a pickle file

In [292]:
# Persist the fitted grid search object. The `with` block closes the file on
# exit, so no explicit close() call is needed afterwards.
# NOTE(review): the pipeline references the `tokenize` function, which pickle
# stores by name -- whatever loads model.pkl presumably needs `tokenize`
# importable under the same module path; confirm in the loading script.
with open('model.pkl', 'wb') as pkl_file:
    pickle.dump(cv, pkl_file)

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.