# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [1]:
# import libraries

import re
import numpy as np
import pandas as pd
import pickle
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import sqlite3
from sqlalchemy import create_engine

#from sklearn.svm import LinearSVC
#from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
#from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import  f1_score,precision_score,recall_score,accuracy_score,make_scorer
import matplotlib.pyplot as plt
%matplotlib inline

# suppress warnings from final output
import warnings
warnings.simplefilter("ignore")

[nltk_data] Downloading package punkt to /Users/johnma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/johnma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load data from database
#engine = create_engine('../ETL Pipeline/DisasterResponse.db')
conn = sqlite3.connect('../ETL Pipeline/DisasterResponse.db')
df = pd.read_sql('SELECT * FROM table1', con=conn)

X = df['message']
y = df.iloc[:, 4:]
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26028 entries, 0 to 26027
Data columns (total 35 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   related                 26028 non-null  int64
 1   request                 26028 non-null  int64
 2   offer                   26028 non-null  int64
 3   aid_related             26028 non-null  int64
 4   medical_help            26028 non-null  int64
 5   medical_products        26028 non-null  int64
 6   search_and_rescue       26028 non-null  int64
 7   security                26028 non-null  int64
 8   military                26028 non-null  int64
 9   water                   26028 non-null  int64
 10  food                    26028 non-null  int64
 11  shelter                 26028 non-null  int64
 12  clothing                26028 non-null  int64
 13  money                   26028 non-null  int64
 14  missing_people          26028 non-null  int64
 15  refugees           

### 2. Write a tokenization function to process your text data

In [3]:
# Function to tokenize the text

def tokenize(text):
    """Tokenization function: 
    Input: raw text 
    
    Process: 
    url replacement
    normalized
    stop words removed
    stemmd
    lemmatized
    
    Output: tokenized text"""
    
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
    # replace url with "urlplaceholder"
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    
    # Normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    stop_words = stopwords.words("english")
    
    
    #tokenize
    words = word_tokenize (text)
    
    #stemming
    stemmed = [PorterStemmer().stem(w) for w in words]
    
    #lemmatizing
    clean_tokens = [WordNetLemmatizer().lemmatize(w) for w in stemmed if w not in stop_words]
   
    return clean_tokens

In [4]:
# Check the function is working
tokenize(X[0])

['weather', 'updat', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti']

### 3. Build a machine learning pipeline

In [5]:
# pipeline_rfc : Random Foreset Classifier is chosen as the default prediction model
pipeline_rfc = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42))),
    ])


# Pipeline_log uses Logistic Classifier
pipeline_log = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf_log',  MultiOutputClassifier(LogisticRegression())) 
    ])

# Pipeline_svm uses Support Vector Machines Classifier
pipeline_svm = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf_svm',  MultiOutputClassifier(SVC(max_iter=100000, random_state=42))) 
    ])

### 4. Train pipeline

In [6]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
# Train classifier -- Random Forest Classifier
pipeline_rfc.fit(X_train, y_train)

### 5. Test your model

In [8]:
y_pred_rfc = pipeline_rfc.predict(X_test)

# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(y_train, pipeline_rfc.predict(X_train))*100)
print(" ")

# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(y_test, y_pred_rfc)*100)
print(" ")

# Classification report
print("Classification report")
print(classification_report(y_test, y_pred_rfc, target_names=y.columns.values))

The training accuracy is: 
99.49797653808719
 
The test accuracy is: 
26.433072076225606
 
Classification report
                        precision    recall  f1-score   support

               related       0.83      0.95      0.89      4962
               request       0.82      0.48      0.61      1141
                 offer       0.00      0.00      0.00        30
           aid_related       0.75      0.70      0.72      2678
          medical_help       0.62      0.08      0.14       520
      medical_products       0.89      0.08      0.14       311
     search_and_rescue       0.62      0.05      0.09       202
              security       0.00      0.00      0.00       120
              military       0.39      0.03      0.06       210
                 water       0.85      0.38      0.52       413
                  food       0.82      0.63      0.71       716
               shelter       0.83      0.36      0.51       569
              clothing       0.78      0.07      0.13 

In [9]:
# Train classifier -- Logistic Regression Classifier
pipeline_log.fit(X_train, y_train)

y_pred_log = pipeline_log.predict(X_test)

# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(y_train, pipeline_log.predict(X_train))*100)
print(" ")

# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(y_test, y_pred_log)*100)
print(" ")

# Classification report
print("Classification report")
print(classification_report(y_test, y_pred_log, target_names=y.columns.values))

The training accuracy is: 
36.70406229189078
 
The test accuracy is: 
28.200399569694174
 
Classification report
                        precision    recall  f1-score   support

               related       0.84      0.95      0.89      4962
               request       0.81      0.54      0.64      1141
                 offer       0.00      0.00      0.00        30
           aid_related       0.77      0.67      0.72      2678
          medical_help       0.66      0.19      0.30       520
      medical_products       0.78      0.20      0.32       311
     search_and_rescue       0.81      0.06      0.12       202
              security       0.00      0.00      0.00       120
              military       0.65      0.12      0.21       210
                 water       0.75      0.54      0.63       413
                  food       0.82      0.58      0.68       716
               shelter       0.82      0.45      0.59       569
              clothing       0.77      0.23      0.36 

> The prediction result of Random Forest Classifier seems promising at the first glace; however, the training accuracy is extremely high, reaches at more than 99%, compared to merely 27.67% of test accuracy. It's a sign of overfitting, and thus I don't think it a proper model for further prediction.

> By using Logistic Regression Classifier, the speed of training is relatively quicker, and the prediction of accuracy is slightly higher than that of random forest model. I will proceed to next step on logistic regression model.

### 6. Improve your model
Use grid search to find better parameters. 

In [10]:
# Check parameters of Logistic Regression Classifier
#pipeline_log.get_params()

In [11]:
# Define parameters
parameters_log = {
              'clf_log__estimator__solver':['newton-cg', 'lbfgs', 'liblinear'],
              'clf_log__estimator__C': [100, 10, 1.0]
             }


%timeit
cv = GridSearchCV(pipeline_log, param_grid=parameters_log, cv=3, verbose=1)
cv.fit(X_train, y_train) # Fit GridSearchCV on training data

print("The best hyperparameters from Grid Search are:")
print(cv.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(cv.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
The best hyperparameters from Grid Search are:
{'clf_log__estimator__C': 10, 'clf_log__estimator__solver': 'newton-cg'}

The mean accuracy of a model with these hyperparameters is:
0.2774960299165002


In [12]:
# Fit the tuned model to the training data
tuned_log = cv.best_estimator_
tuned_log.fit(X_train, y_train)

### 7. Test your model

In [13]:
y_pred_tuned = tuned_log.predict(X_test)

# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(y_train, tuned_log.predict(X_train))*100)
print(" ")

# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(y_test, y_pred_tuned)*100)
print(" ")

# Classification report
print("Classification report")
print(classification_report(y_test, y_pred_tuned, target_names=y.columns.values))

The training accuracy is: 
62.31238153783105
 
The test accuracy is: 
28.53849700322729
 
Classification report
                        precision    recall  f1-score   support

               related       0.86      0.92      0.89      4962
               request       0.77      0.59      0.66      1141
                 offer       0.00      0.00      0.00        30
           aid_related       0.72      0.69      0.71      2678
          medical_help       0.59      0.32      0.41       520
      medical_products       0.65      0.32      0.43       311
     search_and_rescue       0.60      0.17      0.27       202
              security       0.33      0.03      0.05       120
              military       0.63      0.33      0.44       210
                 water       0.75      0.62      0.68       413
                  food       0.79      0.64      0.71       716
               shelter       0.76      0.53      0.63       569
              clothing       0.79      0.34      0.47  

> By adjusting two parameters, the accucary of training set improves drastically while the accuracy of test data slightly lower than that before parameter tuning. Overall, I would regard it as an effective parameter adjustment, but the performance of the tuned model is not as prospective as expected.

### 8. Try improving your model further. 

In [14]:
# Fit SVM into pipeline
pipeline_svm.fit(X_train, y_train)

# Predict on test data
y_pred_svm = pipeline_svm.predict(X_test)

# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(y_train, pipeline_svm.predict(X_train))*100)
print(" ")

# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(y_test, y_pred_svm)*100)
print(" ")

# Classification report
print("Classification report")
print(classification_report(y_test, y_pred_svm, target_names=y.columns.values))

The training accuracy is: 
64.14630398032888
 
The test accuracy is: 
29.260796065775317
 
Classification report
                        precision    recall  f1-score   support

               related       0.84      0.95      0.89      4962
               request       0.83      0.55      0.66      1141
                 offer       0.00      0.00      0.00        30
           aid_related       0.78      0.68      0.72      2678
          medical_help       0.63      0.17      0.27       520
      medical_products       0.79      0.20      0.31       311
     search_and_rescue       0.65      0.08      0.15       202
              security       0.00      0.00      0.00       120
              military       0.64      0.13      0.21       210
                 water       0.76      0.64      0.70       413
                  food       0.79      0.70      0.74       716
               shelter       0.83      0.50      0.62       569
              clothing       0.83      0.40      0.54 

In [15]:
parameters_svm= {'clf_svm__estimator__C': [0.1, 1,],
                 'clf_svm__estimator__gamma': [1, 0.1]
}
    
cv = GridSearchCV(pipeline_svm, param_grid=parameters_svm, cv=2, verbose=10)
cv.fit(X_train, y_train)

print("The best hyperparameters from Grid Search are:")
print(cv.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(cv.best_score_)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2; 1/4] START clf_svm__estimator__C=0.1, clf_svm__estimator__gamma=1......
[CV 1/2; 1/4] END clf_svm__estimator__C=0.1, clf_svm__estimator__gamma=1;, score=0.209 total time= 6.3min
[CV 2/2; 1/4] START clf_svm__estimator__C=0.1, clf_svm__estimator__gamma=1......
[CV 2/2; 1/4] END clf_svm__estimator__C=0.1, clf_svm__estimator__gamma=1;, score=0.200 total time= 6.3min
[CV 1/2; 2/4] START clf_svm__estimator__C=0.1, clf_svm__estimator__gamma=0.1....
[CV 1/2; 2/4] END clf_svm__estimator__C=0.1, clf_svm__estimator__gamma=0.1;, score=0.201 total time= 3.9min
[CV 2/2; 2/4] START clf_svm__estimator__C=0.1, clf_svm__estimator__gamma=0.1....
[CV 2/2; 2/4] END clf_svm__estimator__C=0.1, clf_svm__estimator__gamma=0.1;, score=0.193 total time= 3.8min
[CV 1/2; 3/4] START clf_svm__estimator__C=1, clf_svm__estimator__gamma=1........
[CV 1/2; 3/4] END clf_svm__estimator__C=1, clf_svm__estimator__gamma=1;, score=0.277 total time= 6.8min
[CV 

In [16]:
# Fit the tuned model to the training data
tuned_svm = cv.best_estimator_
tuned_svm.fit(X_train, y_train)

In [17]:
y_pred_tuned = tuned_svm.predict(X_test)

# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(y_train, tuned_svm.predict(X_train))*100)
print(" ")

# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(y_test, y_pred_tuned)*100)
print(" ")

# Classification report
print("Classification report")
print(classification_report(y_test, y_pred_tuned, target_names=y.columns.values))

The training accuracy is: 
64.12581322678142
 
The test accuracy is: 
29.24542800061472
 
Classification report
                        precision    recall  f1-score   support

               related       0.84      0.95      0.89      4962
               request       0.83      0.55      0.66      1141
                 offer       0.00      0.00      0.00        30
           aid_related       0.78      0.68      0.72      2678
          medical_help       0.63      0.17      0.27       520
      medical_products       0.79      0.20      0.31       311
     search_and_rescue       0.65      0.08      0.15       202
              security       0.00      0.00      0.00       120
              military       0.64      0.13      0.21       210
                 water       0.76      0.64      0.70       413
                  food       0.79      0.70      0.74       716
               shelter       0.83      0.50      0.62       569
              clothing       0.83      0.40      0.54  

> In order to further improve the performance, classifier of Support Vector Machine is adopt. Compared to Logistic Regression and Random Forest models, the result of SVM reveals a better accuracy on test set and thus it is used as  the prediction model.

> The tuning of hyperparameter of SVM only includes 2 parameters because of the limitation of hardware. It takes too much time on tuning and eventually I have to choose few parameters to proceed the project. The result of the tuning does not reveal improvement compared to that of the simple SVM model. One the of main reasons would be too few parameters are involved in the GridSearch and lead to the minor positive impact on tuning.

### 9. Export your model as a pickle file

In [18]:
# Create a pickle file for the model
file_name = 'classifier.pkl'
with open (file_name, 'wb') as f:
    pickle.dump(pipeline_svm, f)

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.