# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [289]:
# import libraries
from sqlalchemy import create_engine
import sqlite3
import re
import pandas as pd
import numpy as np
import pickle


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.metrics import classification_report,f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fengfangzhong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fengfangzhong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fengfangzhong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [105]:
# Open the SQLite database and read the whole table into a DataFrame.
engine = create_engine('sqlite:///InsertDatabaseName.db')
df = pd.read_sql('select * from InsertTableName', engine)

# The last 36 columns are taken as the targets; 'message' is the only feature.
cat = df.columns[-36:]
X, Y = df['message'], df[cat]

### 2. Write a tokenization function to process your text data

In [155]:
def tokenize(text):
    """Normalize, tokenize and lemmatize a message.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, the result is split with NLTK's
    ``word_tokenize``, English stop words are dropped and the remaining
    tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The raw message string.

    Returns:
        A list of lemmatized tokens with stop words removed.
    """
    # The stop-word corpus comes back as a list; a set makes the per-token
    # membership test below constant-time instead of a linear scan.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # normalizing text
    normalized = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())

    # splitting text into tokens
    tokens = word_tokenize(normalized)

    # Stop words are filtered on the raw token, before it is lemmatized.
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

### 3. Build a machine learning pipeline
- You'll find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [201]:
# Token counts -> TF-IDF weighting -> random forest wrapped in
# MultiOutputClassifier so that multiple target columns can be predicted.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier(random_state=42))),
])

In [None]:
# DecisionTreeClassifier is not among the notebook's imports, so import it
# here; without this the cell fails with a NameError when run.
from sklearn.tree import DecisionTreeClassifier

DecisionTreeClassifier(random_state = 42)

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [202]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
# Fit every pipeline step on the training portion only.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
           n_jobs=1))])

### 5. Test your model
Report the f1 score, precision and recall on both the training set and the test set. You can use sklearn's `classification_report` function here. 

In [203]:
# Predict the target columns for the held-out test messages.
y_pred = pipeline.predict(X_test)

In [270]:
# Precision / recall / F1 for the first target column only (column index 0);
# the remaining target columns are not reported here.
print(classification_report(y_test.values[:,0], y_pred[:,0], digits=2))

             precision    recall  f1-score   support

          0       0.60      0.45      0.52      1873
          1       0.84      0.90      0.87      5934
          2       0.43      0.31      0.36        58

avg / total       0.78      0.79      0.78      7865



In [260]:
# Accuracy of each target column on the test split, then the mean over all
# columns. The column count is taken from y_test rather than hard-coded.
score = [
    accuracy_score(y_test.values[:, i], y_pred[:, i])
    for i in range(y_test.shape[1])
]
print("accuracy score: {:.2f}".format(np.mean(score)))

accuracy score: 0.94


### 6. Improve your model
Use grid search to find better parameters. 

In [249]:
# Show the pipeline's parameters; their names are what a grid search can tune.
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=<function tokenize at 0x1a267c37b8>, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=42, ver

In [271]:
# Search space: pipeline parameters addressed as '<step>__<parameter>'.
# NOTE(review): tokenize() lower-cases the text itself, so toggling
# vect__lowercase probably does not change the features - confirm.
parameters = dict(
    vect__min_df=[1],
    vect__lowercase=[True, False],
    tfidf__smooth_idf=[True, False],
)

# 2-fold cross-validated search over every combination above.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=2)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.

In [272]:
# Run the grid search on the training split.
cv.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
           n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__min_df': [1], 'vect__lowercase': [True, False], 'tfidf__smooth_idf': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [273]:
# Best score found by the grid search.
cv.best_score_

0.23704430276279223

In [274]:
# Parameter combination selected by the grid search.
cv.best_params_

{'tfidf__smooth_idf': False, 'vect__lowercase': True, 'vect__min_df': 1}

In [None]:
# Predict on the held-out test split with the grid-searched model.
y_pred = cv.predict(X_test)

In [None]:
# Classification report for the first target column only (column index 0).
print(classification_report(y_test.values[:,0], y_pred[:,0], digits=2))

In [None]:
# Mean accuracy across all target columns on the test split. The column
# count is taken from y_test rather than hard-coded.
score = [
    accuracy_score(y_test.values[:, i], y_pred[:, i])
    for i in range(y_test.shape[1])
]
print("accuracy score: {:.2f}".format(np.mean(score)))

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [276]:
# Same text features as before, with AdaBoost as the wrapped classifier.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier(random_state=42))),
])

In [277]:
# Fit the AdaBoost-based pipeline on the training split.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...timator=None,
          learning_rate=1.0, n_estimators=50, random_state=42),
           n_jobs=1))])

In [278]:
# Predict on the held-out test split with the AdaBoost-based pipeline.
y_pred = pipeline.predict(X_test)

In [279]:
# Accuracy of each target column on the test split, then the mean over all
# columns. The column count is taken from y_test rather than hard-coded.
score = [
    accuracy_score(y_test.values[:, i], y_pred[:, i])
    for i in range(y_test.shape[1])
]
print("accuracy score: {:.2f}".format(np.mean(score)))

accuracy score: 0.95


In [281]:
# Show the AdaBoost pipeline's parameters; their names are what a grid search can tune.
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=<function tokenize at 0x1a267c37b8>, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
             learning_rate=1.0, n_estimators=50, random_state=42),
              n_jobs=1))],
 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1

In [282]:
# Search space: pipeline parameters addressed as '<step>__<parameter>'.
# NOTE(review): tokenize() lower-cases the text itself, so toggling
# vect__lowercase probably does not change the features - confirm.
parameters = dict(
    vect__min_df=[1, 5, 10],
    vect__lowercase=[True, False],
    tfidf__smooth_idf=[True, False],
)

# 2-fold cross-validated search over every combination above.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=2)

In [283]:
# Run the grid search on the training split.
cv.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...timator=None,
          learning_rate=1.0, n_estimators=50, random_state=42),
           n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__min_df': [1, 5, 10], 'vect__lowercase': [True, False], 'tfidf__smooth_idf': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [285]:
# Predict on the held-out test split with the grid-searched model.
y_pred = cv.predict(X_test)

In [286]:
# Classification report for the first target column only (column index 0).
print(classification_report(y_test.values[:,0], y_pred[:,0], digits=2))

             precision    recall  f1-score   support

          0       0.81      0.05      0.09      1873
          1       0.76      0.99      0.86      5934
          2       0.13      0.03      0.05        58

avg / total       0.77      0.76      0.67      7865



In [287]:
# Mean accuracy across all target columns on the test split. The column
# count is taken from y_test rather than hard-coded.
score = [
    accuracy_score(y_test.values[:, i], y_pred[:, i])
    for i in range(y_test.shape[1])
]
print("accuracy score: {:.2f}".format(np.mean(score)))

accuracy score: 0.95


### 9. Export your model as a pickle file

In [292]:
# Serialize the grid-search object to model.pkl.
# The with-statement closes the file on exit, so no explicit close() is needed.
with open('model.pkl', 'wb') as pkl_file:
    pickle.dump(cv, pkl_file)

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.