# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
from pickle import dump

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from lightgbm import LGBMClassifier

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# load data from database
engine = create_engine('sqlite:///../data/DisasterResponse.db') # access data from database
df = pd.read_sql_table('disaster_response', engine) # get data from database

category_columns = df.drop(['id', 'message', 'original', 'genre'], axis=1).columns

X = df['message'].values
Y = df.drop(['id', 'message', 'original', 'genre'], axis=1).values

### 2. Write a tokenization function to process your text data

In [3]:
# regex obtained from Udacity course classes
url_regex = "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

def tokenize(text: str) -> list:
    '''
    function that transforms raw text into clean tokens
    
    Args:
        - text: str -> text to be tokenized
    Returns:
        - token list
    '''
    
    clean_text = re.sub(r'[^a-z-A-Z]|\W', ' ', text.lower()) # remove everything but characters
    clean_text = re.sub(url_regex, 'urlplaceholder', clean_text) # replace url's by `urlplaceholder`
    
    tokens = word_tokenize(clean_text) # tokenize text
    lemmatizer = WordNetLemmatizer() # instatiate lemmatizer
    
    clean_tokens = []
    for token in tokens:
        clean_token = lemmatizer.lemmatize(token).strip() # lemmatize word
        clean_token = lemmatizer.lemmatize(clean_token, pos='v') # lemmatize again operating by verbs
        
        if clean_token not in stopwords.words("english"): # remove stopwords
            clean_tokens.append(clean_token)
    
    return clean_tokens

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [4]:
# instantiate the main Pipeline with MultiOutputClassifier based on RandomForestClassifer

pipeline = Pipeline([
    ('count', CountVectorizer(tokenizer=tokenize)), # bag of words based on `tokenize` function
    ('tfidf', TfidfTransformer()), # TF-IDF transformer
    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1)), 
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y) # split data into train & test sets
pipeline.fit(X_train, y_train) # fit the data on the classifier

Pipeline(steps=[('count',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000253809AA0D0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=-1),
                                       n_jobs=-1))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [6]:
def get_classification(model, X_test: np.array, y_test: pd.DataFrame) -> None:
    '''
    shows the overall ranking of each target column
    
    Args:
        - model: classification model
        - X_test: x values for test
        - y_test: y values for test
    Returns: all column classification report
    '''
    
    y_pred = model.predict(X_test) # predict data
    
    for i, column in enumerate(category_columns):
        print(f"({column}):")
        print(classification_report(y_test[:,i], y_pred[:,i], zero_division=1))
    
    print("TOTAL ACC:", (y_pred == y_test).mean())
        
    return

In [7]:
def test_model(model, text='rain followed by hurricane left thousands of people starving and thirsty') -> list:
    '''
    function used to test model prediction
    
    Args:
        - model: trained model
        - text: disaster news to be predicted
    Returns: disaster response according to model prediction
    '''
    classification_labels = model.predict([text])[0]
    classification_results = dict(zip(df.columns[4:], classification_labels))
    
    return [key for key, value in classification_results.items() if value == 1]

In [8]:
test_model(pipeline)

['related', 'aid_related', 'water', 'food', 'weather_related', 'storm']

In [None]:
# report of all rows
get_classification(pipeline, X_test, y_test)

### 6. Improve your model
Use grid search to find better parameters. 

In [17]:
pipeline.get_params() # get the classifier parameters

{'memory': None,
 'steps': [('count',
   CountVectorizer(tokenizer=<function tokenize at 0x0000023D809DC550>)),
  ('tfidf', TfidfTransformer()),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1))],
 'verbose': False,
 'count': CountVectorizer(tokenizer=<function tokenize at 0x0000023D809DC550>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1),
 'count__analyzer': 'word',
 'count__binary': False,
 'count__decode_error': 'strict',
 'count__dtype': numpy.int64,
 'count__encoding': 'utf-8',
 'count__input': 'content',
 'count__lowercase': True,
 'count__max_df': 1.0,
 'count__max_features': None,
 'count__min_df': 1,
 'count__ngram_range': (1, 1),
 'count__preprocessor': None,
 'count__stop_words': None,
 'count__strip_accents': None,
 'count__token_pattern': '(?u)\\b\\w\\w+\\b',
 'count__tokenizer': <function __main__.tokenize(text: str) -> list>,
 'count__vocabulary': None,
 'tfid

In [None]:
'''
> REFERENCES:
    * Hyperparameter Tuning the Random Forest in Python [https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74];
    * Tuning the parameters of your Random Forest model [https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/] 
'''

parameters = {'clf__estimator__n_estimators' : (10, 20, 50),   
              'clf__estimator__max_features': ('auto', 'sqrt'),
              'clf__estimator__max_depth': (None, 10, 50, 100),
              'clf__estimator__min_samples_leaf': (1, 10, 50),
            }

cv = GridSearchCV(pipeline, param_grid=parameters) # operating GridSearch to find best parameters
cv.fit(X_train, y_train) # fit model on split data

In [19]:
print("BEST ESTIMATOR:")
print(cv.best_estimator_)  # best estimator found by GridSearch
print("BEST PARAMS:")
print(cv.best_params_) # best parameters of the estimator

In [9]:
# result

pipeline = Pipeline([
    ('count', CountVectorizer(tokenizer=tokenize)), # bag of words based on `tokenize` function
    ('tfidf', TfidfTransformer()), # TF-IDF transformer
    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier(
        n_estimators=20, max_features='sqrt', n_jobs=-1), n_jobs=-1)), 
])

In [10]:
pipeline.fit(X_train, y_train) # fit the data on the classifier

Pipeline(steps=[('count',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000253809AA0D0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(max_features='sqrt',
                                                                        n_estimators=20,
                                                                        n_jobs=-1),
                                       n_jobs=-1))])

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [11]:
# report of all rows
get_classification(pipeline, X_test, y_test)
test_model(pipeline)

['related',
 'aid_related',
 'water',
 'food',
 'weather_related',
 'storm',
 'direct_report']

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

#### **LGBMClassifier**

In [57]:
'''
> REFERENCES:
    * Understanding LightGBM Parameters (and How to Tune Them) [https://neptune.ai/blog/lightgbm-parameters-guide]
'''

PARAMS = {'learning_rate': 0.2,
          'max_depth': 10,
          'num_leaves': 20,
          'feature_fraction': 0.6,
          'subsample': 0.2,
          'class_weight': 'balanced',
          'n_estimators': 30
         }

pipeline2 = Pipeline([
    ('count', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=LGBMClassifier(**PARAMS), n_jobs=-1))
])

pipeline2.fit(X_train, y_train) # fit the data on the classifier

Pipeline(steps=[('count',
                 CountVectorizer(tokenizer=<function tokenize at 0x00000253809AA0D0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=LGBMClassifier(class_weight='balanced',
                                                                feature_fraction=0.6,
                                                                learning_rate=0.2,
                                                                max_depth=10,
                                                                n_estimators=30,
                                                                num_leaves=20,
                                                                subsample=0.2),
                                       n_jobs=-1))])

In [58]:
test_model(pipeline2)

['related',
 'aid_related',
 'food',
 'death',
 'weather_related',
 'storm',
 'other_weather',
 'direct_report']

In [59]:
print(test_model(pipeline, 'lightning destroys building leaving many injured'))
print(test_model(pipeline2, 'lightning destroys building leaving many injured'))

['related', 'aid_related', 'medical_help']
['related', 'medical_help', 'infrastructure_related', 'buildings', 'other_infrastructure']


#### **MultinomialNB**

In [53]:
pipeline3 = Pipeline([
    ('count', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=MultinomialNB(), n_jobs=-1)),
])

pipeline3.fit(X_train, y_train)

Pipeline(steps=[('count',
                 CountVectorizer(tokenizer=<function tokenize at 0x0000023D809DC550>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=MultinomialNB(), n_jobs=-1))])

In [54]:
test_model(pipeline3)

['related', 'aid_related', 'weather_related']

In [56]:
parameters = {
        'count__max_df': (0.5, 0.75, 1.0),
        'clf__estimator__alpha': (0.5, 0.75, 1.0),
    }

grid_search = GridSearchCV(pipeline3, parameters, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


GridSearchCV(estimator=Pipeline(steps=[('count',
                                        CountVectorizer(tokenizer=<function tokenize at 0x0000023D809DC550>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=MultinomialNB(),
                                                              n_jobs=-1))]),
             param_grid={'clf__estimator__alpha': (0.5, 0.75, 1.0),
                         'count__max_df': (0.5, 0.75, 1.0)},
             verbose=1)

In [57]:
test_model(grid_search)

['related', 'aid_related', 'weather_related', 'direct_report']

In [58]:
grid_search.best_estimator_

Pipeline(steps=[('count',
                 CountVectorizer(max_df=0.5,
                                 tokenizer=<function tokenize at 0x0000023D809DC550>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=MultinomialNB(alpha=0.5),
                                       n_jobs=-1))])

after all, the **LGBMClassifier** model seems to be better compared to the others

### 9. Export your model as a pickle file

In [60]:
# exporting the trained LGBMClassifier model as a pickle file
with open('classifier.pkl', 'wb') as f:
    dump(pipeline2, f)