# ML Pipeline Preparation

### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [None]:
# import libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

import re
import nltk
# Download the NLTK resources in a single call: tokenizer model, WordNet,
# stop-word lists and the POS tagger.
nltk.download(['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'])

from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, hamming_loss

import warnings
# Silence every warning category for the rest of the session (presumably to
# keep the cell output readable). NOTE(review): this also hides deprecation
# warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the messages table from the SQLite database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table("DisasterResponseDatabase", engine)

# Features: the raw message text. Targets: every column from position 4 onward
# (assumed to be the category labels -- TODO confirm against the table schema).
X = df["message"]
Y = df.iloc[:, 4:]

# A notebook cell only renders its last bare expression, so the earlier
# previews are printed explicitly instead of being silently discarded.
print(df.head())
print(X.head())
Y.head()

### 2. The tokenization function to process your text data

In [None]:
# Compiled once; a raw string avoids the invalid "\(" escape in a normal literal.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Filled on first use so the stop-word corpus is read once, not once per token.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a cached frozenset."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def tokenize(text):
    """
    Clean a raw message and split it into normalized word tokens.

    Processing steps:
        1. Replace every URL with the literal token "urlplaceholder".
        2. Split the text into tokens with ``word_tokenize``.
        3. Lower-case each token and drop English stop words.
        4. Stem (Porter) and then lemmatize (WordNet) each remaining token.
        5. Keep only purely alphabetic tokens, which removes punctuation
           and numbers.

    Args:
        text: raw message string.

    Returns:
        List of cleaned tokens, in their original order.
    """
    # A single regex substitution replaces each match exactly once, so a URL
    # that is a prefix of another URL cannot corrupt the longer one.
    text = _URL_PATTERN.sub("urlplaceholder", text)

    stop_words = _english_stop_words()
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for raw_tok in word_tokenize(text):
        # Normalize case first: the stop-word list is lower-case, so
        # capitalized stop words ("The", "We") would otherwise slip through.
        tok = raw_tok.lower().strip()
        if tok in stop_words:
            continue

        # Reduce the word to its stem, then to its dictionary root form
        tok = lemmatizer.lemmatize(stemmer.stem(tok))

        if tok.isalpha():
            clean_tokens.append(tok)

    return clean_tokens

In [None]:
# Raw message on the first line, its token list on the second
print(X[4], tokenize(X[4]), sep="\n")

In [None]:
# Raw message on the first line, its token list on the second
print(X[10], tokenize(X[10]), sep="\n")

### 3. Building the machine learning pipeline
This machine learning pipeline takes the `message` column as input and outputs classification results for the other 36 categories in the dataset.

In [None]:
# Text-classification pipeline: token counts -> TF-IDF weights -> a random
# forest wrapped so that it predicts every target column.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier())),
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [None]:
# Hold out a test set using scikit-learn's default test size. Fixing the seed
# makes the split -- and therefore the metrics reported below -- reproducible
# from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
pipeline.fit(X_train, Y_train)

### 5. Testing the model
Report the f1 score, precision and recall for each output category of the dataset.

In [None]:
# Predict every category for the held-out messages; the result is read
# column by column (one column per target) when the reports are printed.
Y_pred = pipeline.predict(X_test)

In [None]:
# Per-category precision / recall / f1. Transposing the prediction array lets
# each target column be walked in step with its column name.
for col, col_pred in zip(Y.columns, Y_pred.T):
    print(col)
    print(classification_report(Y_test[col], col_pred))

# Average of the per-category accuracies (fraction of matching labels)
avg = (Y_test == Y_pred).mean().mean()
print("Accuracy Overall:\n", avg)

### 6. Improving the model
Use grid search to find better parameters. 

In [None]:
# List the parameters of the pipeline and its nested steps; the keys use the
# same `step__param` naming as the parameter grid defined for the grid search.
pipeline.get_params()

In [None]:
# Search space: document-frequency cap for the vectorizer and the minimum
# number of samples needed to split a forest node.
# (n_estimators candidates [50, 100, 200] are currently left out of the search.)
parameters = {
    'vect__max_df': [0.5, 0.75, 1.0],
    'clf__estimator__min_samples_split': [2, 3, 4],
}

cv = GridSearchCV(estimator=pipeline, param_grid=parameters, verbose=2)

In [None]:
# Train model: run the grid search over every parameter combination on the
# training split (verbose=2 prints progress for each fit)
cv.fit(X_train, Y_train)

### 7. Testing the model
Show the accuracy, precision, and recall of the tuned model.  

In [None]:
# Predict the held-out messages with the fitted grid-search object; the result
# is read column by column (one column per target) in the report below.
Y_pred_new = cv.predict(X_test)

In [None]:
# Per-category precision / recall / f1 for the tuned model. Transposing the
# prediction array lets each target column be walked with its column name.
for col, col_pred in zip(Y.columns, Y_pred_new.T):
    print(col)
    print(classification_report(Y_test[col], col_pred))

# Average of the per-category accuracies (fraction of matching labels)
avg = (Y_test == Y_pred_new).mean().mean()
print("Accuracy Overall:\n", avg)

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

### 9. Export your model as a pickle file

In [None]:
# Serialize the fitted grid-search object. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
with open("models/classifier.pkl", 'wb') as model_file:
    pickle.dump(cv, model_file)