# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [None]:
# import libraries
import sys
import re
import time
import pickle

import pandas as pd
from sqlalchemy import create_engine

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = stopwords.words("english")

In [None]:
# load data from database
engine = create_engine("sqlite:///../data/DisasterResponse.db")

connection = engine.connect()

query = "SELECT * FROM DisasterResponse"  ## limit 1000 WHERE related <> 2
df = pd.read_sql(query, connection)

In [None]:
# Let's check the size of the data
df.shape

In [None]:
# Let's get Summary stats on the data
df.describe(include='all')

In [None]:
# Find if there are response variables that have less than 2 values (0 or 1), that can be dropped from the

# Build SQL that will give count of distinct values in the response column
q2 = "SELECT 'related                 '  AS COL , COUNT(distinct related) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'request' AS COL , COUNT(distinct request) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'offer' AS COL , COUNT(distinct offer) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'aid_related' AS COL , COUNT(distinct aid_related) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'medical_help' AS COL , COUNT(distinct medical_help) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'medical_products' AS COL , COUNT(distinct medical_products) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'search_and_rescue' AS COL , COUNT(distinct search_and_rescue) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'security' AS COL , COUNT(distinct security) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'military' AS COL , COUNT(distinct military) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'child_alone' AS COL , COUNT(distinct child_alone) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'water' AS COL , COUNT(distinct water) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'food' AS COL , COUNT(distinct food) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'shelter' AS COL , COUNT(distinct shelter) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'clothing' AS COL , COUNT(distinct clothing) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'money' AS COL , COUNT(distinct money) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'missing_people' AS COL , COUNT(distinct missing_people) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'refugees' AS COL , COUNT(distinct refugees) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'death' AS COL , COUNT(distinct death) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'other_aid' AS COL , COUNT(distinct other_aid) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'infrastructure_related' AS COL , COUNT(distinct infrastructure_related) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'transport' AS COL , COUNT(distinct transport) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'buildings' AS COL , COUNT(distinct buildings) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'electricity' AS COL , COUNT(distinct electricity) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'tools' AS COL , COUNT(distinct tools) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'hospitals' AS COL , COUNT(distinct hospitals) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'shops' AS COL , COUNT(distinct shops) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'aid_centers' AS COL , COUNT(distinct aid_centers) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'other_infrastructure' AS COL , COUNT(distinct other_infrastructure) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'weather_related' AS COL , COUNT(distinct weather_related) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'floods' AS COL , COUNT(distinct floods) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'storm' AS COL , COUNT(distinct storm) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'fire' AS COL , COUNT(distinct fire) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'earthquake' AS COL , COUNT(distinct earthquake) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'cold' AS COL , COUNT(distinct cold) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'other_weather' AS COL , COUNT(distinct other_weather) AS KNT FROM DisasterResponse GROUP BY 1 UNION SELECT 'direct_report' AS COL , COUNT(distinct direct_report) AS KNT FROM DisasterResponse GROUP BY 1 "

# Execute the SQL
dfq2 = pd.read_sql(q2, connection)
connection.close()

# Show response variables that have less than 2 values (0 or 1)
dfq2[dfq2["KNT"] < 2]

In [None]:
## see the distribution of
# df1 = df.loc[:, ~df.columns.isin(['id','message','original', 'genre','child_alone'])]
# df1col = df1.columns
# for col in df1col:
#     print(col , "==\n", df1[col].value_counts())

In [None]:
# Keep only the predictors in the X
predictors = ["message"]

X = df[predictors].message.values
print("Dimensions of X are:", X.ndim)
print("Shape of X is", X.shape)
print("Size of X is", X.size)

In [None]:
# keep ony the 35 response variables in y; dropped child_alone since all values are 0
y = df.loc[:, ~df.columns.
           isin(['id', 'message', 'original', 'genre', 'child_alone'])]
y.head()
print("Dimensions of y are:", y.ndim)
print("Shape of y is", y.shape)
print("Size of y is", y.size)

### 2. Write a tokenization function to process your text data

In [None]:
def tokenize(text):
    """
    Tokenize text by removing punctuation, converting to lowercase, removing stop words, and lemmatizing.

    Args:
        text (str): The text to be tokenized.

    Returns:
        list: A list of clean tokens.
    """
    # Normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    tokens = word_tokenize(text)
    # Remove stop words and lemmatize
    tokens = [w for w in tokens if w not in stopwords.words("english")]

    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [None]:
# test the tokenize function
for i in range(26, 29):  # (X.shape[0]):
    LL = str(X[i])
    print(tokenize(LL))

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [None]:
# Create pipline
logreg = LogisticRegression()  # multi_class='ovr')

pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultiOutputClassifier(logreg))])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [None]:
# let's split the data as train test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.75,
                                                    random_state=42)

In [None]:
# let's check the train test split results
print("X_train Dim are:", X_train.ndim, "Shape=", X_train.shape, "Size =",
      X_train.size)
print("y_train Dim are:", y_train.ndim, "Shape=", y_train.shape, "Size =",
      y_train.size)
print("X_test  Dim are:", X_test.ndim, "Shape=", X_test.shape, "Size =",
      X_test.size)
print("y_test  Dim are:", y_test.ndim, "Shape=", y_test.shape, "Size =",
      y_test.size)

In [None]:
# test the tokenize function on X_test
for i in range(96, 100):  # (X.shape[0]):
    LL = str(X_test[i])
    print(tokenize(LL))

In [None]:
# Fit the pipeline to your training data
pipeline.fit(X_train, y_train)

In [None]:
pipeline

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [None]:
# Use the pipeline to make predictions on your test data
y_pred = pipeline.predict(X_test)

In [None]:
# Classification Report for reponse variable
i = 0
for col in y_test.columns:
    print(
        "\n--------------Classification Report for reponse variable#{}. {} ----------------"
        .format(i, col))
    print(classification_report(y_test[col], y_pred[:, i], zero_division=0))
    i += 1

In [None]:
# confusion_matrix for reponse variables
i = 0
for col in y_test.columns:
    cm = confusion_matrix(y_test[col],
                          y_pred[:, i],
                          labels=pipeline.classes_[i])
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay(cm, display_labels=pipeline.classes_[i]).plot(ax=ax)
    ax.set_title(f"Confusion Matrix for {col}")
    i += 1
    
plt.close()

In [None]:
# visual representation of the pipeline in a html file
from sklearn.utils import estimator_html_repr
with open('my_estimator.html', 'w', encoding='utf-8') as f:
    f.write(estimator_html_repr(pipeline))

### 6. Improve your model
Use grid search to find better parameters. 

In [None]:
# let's see what parms can be tuned
pipeline.get_params().keys()

In [None]:
parameters = {
    'clf__estimator__multi_class': ['ovr'],
    'clf__estimator__solver':
    ['saga'],  # Algorithm to use in the optimization problem
    'clf__estimator__n_jobs':
    [2],  # Number of CPU cores used when parallelizing over classes
    'clf__estimator__warm_start': [
        True
    ],  # When set to True, reuse the solution of the previous call to fit as initialization'
    'clf__estimator__penalty': ['l2', 'elasticnet'],
    'clf__estimator__verbose': [1]
}

cv = GridSearchCV(pipeline, param_grid=parameters)
#    'multi_class' : ['ovr']   # not a choice
#    'clf__random_state' : [0],# Used when solver == ‘sag’, ‘saga’ or ‘liblinear’ to shuffle the data.
#    'clf__solver' : ['lbfgs','saga'],
#    'clf__n_jobs' : [2,4],        # Number of CPU cores used when parallelizing over classes if multi_class=’ovr’”
#    'clf__max_iter' : [500, 1000], # Maximum number of iterations taken for the solvers to converge
#    'clf__penalty' :  ['l1', 'l2','elasticnet'],
#    'clf__warm_start' : [True, False]           # When set to True, reuse the solution of the previous call to fit as initialization
# , random_state=0,n_jobs = 2, max_iter=1000, C=1, penalty='l2')
#     'vect__ngram_range' : [(1, 1), (1,2)],      # The lower and upper boundary of the range of n-values for different word n-grams
#   'tfidf__smooth_idf' : [True, False], #Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions.

In [32]:
# Let's run the GridSearch and compute elapsed time for the fit
starttm = time.time()

cv.fit(X_train, y_train)

endtm = time.time()

execTmsec = (endtm - starttm) * 10**3

print("execution time for the fit=", execTmsec, "milliseconds")

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multioutput.py", line 434, in fit
    super().fit(X, Y, sample_weight, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\multioutput.py", line 202, in fit
    self.estimators_ = Paralle

execution time for the fit= 2998089.579820633 milliseconds


[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.2s finished


In [41]:
# what are the best parms
cv.best_params_

{'clf__estimator__multi_class': 'ovr',
 'clf__estimator__n_jobs': 2,
 'clf__estimator__penalty': 'l2',
 'clf__estimator__solver': 'saga',
 'clf__estimator__verbose': 1,
 'clf__estimator__warm_start': True}

In [43]:
cv.cv_results_

{'mean_fit_time': array([247.44829483, 231.23097219]),
 'std_fit_time': array([4.44388587, 3.2594798 ]),
 'mean_score_time': array([57.86113458,  0.        ]),
 'std_score_time': array([0.70206072, 0.        ]),
 'param_clf__estimator__multi_class': masked_array(data=['ovr', 'ovr'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_clf__estimator__n_jobs': masked_array(data=[2, 2],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_clf__estimator__penalty': masked_array(data=['l2', 'elasticnet'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_clf__estimator__solver': masked_array(data=['saga', 'saga'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_clf__estimator__verbose': masked_array(data=[1, 1],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_clf__estima

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [36]:
# Use the cv pipeline with GridSearch to make predictions on test data
y_pred = cv.predict(X_test)

In [37]:
# score on training dataset
print("\n score on training dataset:", cv.score(X_train, y_train))

# score on test dataset
print("\n score on test dataset:", cv.score(X_test, y_test))


 score on training dataset: 0.3621706845692198

 score on test dataset: 0.28181263350625574


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [None]:
# Create pipline with different model, support vector machine (SVM)

# load data from database
engine = create_engine("sqlite:///../data/DisasterResponse.db")
connection = engine.connect()
query = "SELECT * FROM DisasterResponse"  ## limit 1000 WHERE related <> 2
df = pd.read_sql(query, connection)
connection.close()

# Keep only the predictors in the X
predictors = ["message"]
X = df[predictors].message.values
print("Dimensions of X are:", X.ndim)
print("Shape of X is", X.shape)
print("Size of X is", X.size)

# keep ony the 35 response variables in y; dropped child_alone since all values are 0
y = df.loc[:, ~df.columns.
           isin(['id', 'message', 'original', 'genre', 'child_alone'])]
y.head()
print("Dimensions of y are:", y.ndim)
print("Shape of y is", y.shape)
print("Size of y is", y.size)


def tokenize(text):
    # Normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    tokens = word_tokenize(text)
    # Remove stop words and lemmatize
    tokens = [w for w in tokens if w not in stopwords.words("english")]

    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


# let's split the data as train test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.75,
                                                    random_state=42)

# let's check the train test split results
print("X_train Dim are:", X_train.ndim, "Shape=", X_train.shape, "Size =",
      X_train.size)
print("y_train Dim are:", y_train.ndim, "Shape=", y_train.shape, "Size =",
      y_train.size)
print("X_test  Dim are:", X_test.ndim, "Shape=", X_test.shape, "Size =",
      X_test.size)
print("y_test  Dim are:", y_test.ndim, "Shape=", y_test.shape, "Size =",
      y_test.size)

# Define pipeline for SVC
pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('scaler', StandardScaler()),
                     ('svc', MultiOutputClassifier(SVC(gamma='auto')))])

# from sklearn.model_selection import GridSearchCV
print(pipeline.get_params().keys())

# parms for pipline with different model
parameters = {
    'scaler__with_mean': [False],
    'svc__estimator__C': [1.0],  # Regularization parameter
    'svc__n_jobs':
    [2],  # Number of CPU cores used when parallelizing over classes
    'svc__estimator__verbose': [1]
}
# define GridSearchCV
cv = GridSearchCV(pipeline, param_grid=parameters)

# Fit the pipline with different model
import time

starttm = time.time()
cv.fit(X_train, y_train)
endtm = time.time()
execTmsec = (endtm - starttm) * 10**6
print("execution time for the fit=", execTmsec, "seconds")

# what are the best parms
cv.best_params_

# what are the overall results from the new model
cv.cv_results_

# Use the pipeline with GridSearch to make predictions on test data
y_pred = cv.predict(X_test)

# score on training dataset
print("\n score on training dataset:", cv.score(X_train, y_train))

# score on test dataset
print("\n score on test dataset:", cv.score(X_test, y_test))

### 9. Export your model as a pickle file

In [54]:
import pickle

model_filepath = "svc_model.pkl"
with open(model_filepath, 'wb') as pklfile:
    pickle.dump(cv, pklfile)

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.