# ML Pipeline Preparation
This is Machine Learning pipeline preparation, which will be restructed into train_classifier.py

In [69]:
# import libraries
import pandas as pd
from sqlalchemy import create_engine
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\j8654\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\j8654\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\j8654\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql("SELECT * FROM Response",engine)
X = df["message"]
y = df.iloc[:,4:]

### Tokenization function to process text data

In [71]:
# normalize text by using lower case and remove puntuation
# tokenize words and lemmatize each word
def tokenize(text):
    text = re.sub("[^a-zA-Z0-9]"," ",text)
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    stopwords_list = stopwords.words("english")
    for token in tokens:
        clean_token = lemmatizer.lemmatize(token).lower().strip()
        if (clean_token not in stopwords_list): clean_tokens.append(clean_token)
    return clean_tokens

### Build a machine learning pipeline
Use CountVectorizer to put word count in vectors and Tfidf to statistically measure word frequency and how relevant the word is to the document.

In [72]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(LGBMClassifier()))
])

### Train pipeline


In [73]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

### Evaluate model

In [74]:
# y_pred = pd.DataFrame(y_pred,index=X_test.index, columns=y_test.columns)
# Though not shown here, LGBM gives better result than simply plugging RandomForest or KNeighbor in pipeline
print(classification_report(y_test,y_pred,target_names=y_test.columns))

                        precision    recall  f1-score   support

               related       0.85      0.93      0.89      5001
               request       0.79      0.57      0.66      1093
                 offer       0.00      0.00      0.00        32
           aid_related       0.77      0.68      0.72      2700
          medical_help       0.60      0.31      0.41       532
      medical_products       0.67      0.29      0.41       345
     search_and_rescue       0.64      0.16      0.26       165
              security       0.29      0.02      0.03       127
              military       0.53      0.31      0.39       197
           child_alone       0.00      0.00      0.00         0
                 water       0.76      0.71      0.73       408
                  food       0.82      0.81      0.81       723
               shelter       0.74      0.61      0.67       590
              clothing       0.67      0.43      0.53        95
                 money       0.61      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Improve model
Use grid search to find better parameters. 

In [75]:
# first look at parameters in pipeline for fine-tuning
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x000001C74C6B35E0>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=LGBMClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x000001C74C6B35E0>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=LGBMClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf

In [77]:
parameters = {
    'clf__estimator__colsample_bytree': [0.3,0.7,1.0],
    'clf__estimator__min_child_samples': [20,100,250,500],
    'vect__max_features': [None,3000,6000],
}

grid_search = GridSearchCV(pipeline, parameters,scoring='f1_micro')
grid_search.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x000001C74C6B35E0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=LGBMClassifier()))]),
             param_grid={'clf__estimator__colsample_bytree': [0.3, 0.7, 1.0],
                         'clf__estimator__min_child_samples': [20, 100, 250,
                                                               500],
                         'vect__max_features': [None, 3000, 6000]},
             scoring='f1_micro')

### Evaluate grid search model

In [78]:
grid_search.best_params_

{'clf__estimator__colsample_bytree': 0.7,
 'clf__estimator__min_child_samples': 20,
 'vect__max_features': None}

Simple grid search gives the same parameters used above, so the evaluation result is also the same.

In [79]:
final_model = grid_search.best_estimator_
final_pred = final_model.predict(X_test)
print(classification_report(y_test,final_pred,target_names=y_test.columns))

                        precision    recall  f1-score   support

               related       0.85      0.93      0.89      5001
               request       0.79      0.58      0.67      1093
                 offer       0.00      0.00      0.00        32
           aid_related       0.77      0.68      0.72      2700
          medical_help       0.63      0.28      0.39       532
      medical_products       0.69      0.30      0.42       345
     search_and_rescue       0.62      0.18      0.27       165
              security       0.33      0.02      0.04       127
              military       0.58      0.32      0.41       197
           child_alone       0.00      0.00      0.00         0
                 water       0.75      0.68      0.71       408
                  food       0.83      0.80      0.81       723
               shelter       0.76      0.59      0.67       590
              clothing       0.67      0.45      0.54        95
                 money       0.58      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Export model
Evaluation of the model from grid search doesn't show significant better performance, so we save original pipeline.

In [80]:
filename = "final_model.sav"
joblib.dump(pipeline, filename)

['final_model.sav']