# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
#!pip install git+http://github.com/scikit-learn/scikit-learn.git

In [2]:
# import libraries
import warnings
warnings.filterwarnings("ignore")
from sqlalchemy import create_engine

import os
from time import time
import copy
import json
import tqdm

import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
#matplotlib.use('nbagg')
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import Markdown, display, HTML
def printmd(string):
    display(Markdown(string))
    
import pickle

# Needed for NLP processing
import re
import nltk
#nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# import moduls for ML LEarning
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

from pipelinehelper import PipelineHelper

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


  import imp
  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-no

In [3]:
# load data from database
def load_data(database_filepath):
    """ Load data from Sqlite database into DataFrame
        INPUTS:
        ------------
            database_filepath (string) -  path to Sqlite database
        
        OUTPUTS:
        ------------
            X (numpy arrays) - input features (messages) of DataFrame df
            Y (pandas DataFrame) - categories of DataFrame df
            category_names (list) of category names (column names of Y)
        
    """
    engine = create_engine('sqlite:///'+ database_filepath)
    df = pd.read_sql("SELECT * FROM disaster", engine)
    
    # X and y
    colnames = df.columns.tolist()
    category_names = colnames[4:]

    X = df.message.values
    Y = df[category_names]
    
    # Show DataFrame result, shapes, colnames, target
    print('DATAFRAME df')
    display(df.head())
    print('Shape of df: ' + str(df.shape))
    print(' ')
   
    print('DATAFRAME Y')
    display(Y.head())
    print('Shape of Y: ' + str(Y.shape))
    print(' ')
    
    print('colnames')
    print(colnames)
    print(' ')
    
    print('category_names')
    print(category_names)
    print(' ')
    
    return X, Y, category_names, df


### 2. Write a tokenization function to process your text data

In [4]:
def tokenize(text, word_prep='lemmatize'):
    """ function that will 
        - replace urls with spaceholder
        - remove punctuation
        - remove stopwords
        - stem/lemmatize words 
        - normalize all words to lower case 
        - remove white spaces
         
        INPUTS:
        ------------
            text (string) - message text
            
        OUTPUTS:
        ------------
            clean_tokens (list) - of cleaned words
    
    """
    # Detect URLs
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # Remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [t for t in tokens if t not in stopwords.words('english')]  
    
    # Stem, normalize all words to lower case, remove white spaces 
    if word_prep == 'stem':
        clean_tokens = [PorterStemmer().stem(tok).lower().strip() for tok in tokens]
    
    # Lemmatize, normalize all words to lower case, remove white spaces 
    if word_prep == 'lemmatize':
        clean_tokens = [WordNetLemmatizer().lemmatize(tok).lower().strip() for tok in tokens]
          
    return clean_tokens

In [5]:
class CustomTokenizer(BaseEstimator, TransformerMixin):
    """ Custom Transformer class that will 
        - replace urls with spaceholder
        - remove punctuation
        - remove stopwords
        - stem/lemmatize words 
        - normalize all words to lower case 
        - remove white spaces
    """
    
    def __init__(self, word_prep):
        """ Init of CustomTokenizer
        
            INPUTS:
            ------------
                word_prep (string) - 'stem' or 'lemmatize',  to choose between stemming or lemmatization during tokenization
                                     Useful for GridSearchCV 
                        

            OUTPUTS:
            ------------
                no direct
        """
        self.word_prep = word_prep
        
    def tokenize(self, text):
        """ Function that will 
            - replace urls with placeholder
            - remove punctuation
            - remove stopwords
            - stem/lemmatize words 
            - normalize all words to lower case 
            - remove white spaces
         
        INPUTS:
        ------------
            text (string) - message text
        
        OUTPUTS:
        ------------
            clean_tokens - a list of cleaned words
    
        """
        # Detect URLs
        url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        detected_urls = re.findall(url_regex, text)
        for url in detected_urls:
            text = text.replace(url, "urlplaceholder")

        # Remove punctuation
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)

        tokens = word_tokenize(text)

        # Remove stopwords
        tokens = [t for t in tokens if t not in stopwords.words('english')]  
        
        # Stem, normalize all words to lower case, remove white spaces 
        if self.word_prep == 'stem':
            clean_tokens = [str(PorterStemmer().stem(tok)).lower().strip() for tok in tokens]

        # Lemmatize, normalize all words to lower case, remove white spaces 
        if self.word_prep == 'lemmatize':
            clean_tokens = [str(WordNetLemmatizer().lemmatize(tok)).lower().strip() for tok in tokens]

        return clean_tokens
    

    def fit(self, X, y=None):
        """ fit function for estimator (object that learns from data), 
            
            INPUTS:
            ------------
                X (numpy.ndarray) - of training (testing) features  
                y (numpy.ndarray) - with dataset target labels
            
            OUTPUTS
            ------------
                self - allows to chain methods together. This method is required to be compatible with scikit-learn
        """
        return self

    def transform(self, X):
        """ function which includes the code to transform the data
            
            INPUTS:
            ------------
                X (numpy.ndarray) - of training (testing) features  
            
            OUTPUTS:
            ------------
                df_x_tagged (pandas DataFrame) - with a list of cleaned words in each row 
        """
        X_tagged = pd.Series(X).apply(self.tokenize)
        df_x_tagged = pd.DataFrame(X_tagged)
        print(df_x_tagged)
        
        return df_x_tagged

In [6]:
# defines a custom vectorizer class
class CustomVectorizer(CountVectorizer): 
    """ A CustomVectorizer class which inherits from the CountVectorizer class. 
        Aim: switch between Porterstemmer and Lemmatization during training via GridSearchCV. 
        A CountVectorizer object converts a collection of text documents to a matrix of token counts.  
    """
    def __init__(self, X, word_prep='lemmatize', remove_stopwords=True, **kwargs):
        """ Init function that takes all arguments of CountVectorizer base class and adds two own arguments
            
            INPUTS:
            ------------
                X (numpy.ndarray) - of training (testing) features  
                word_prep (string) - 'stem' or 'lemmatize',  to choose between stemming or lemmatization during tokenization
                                     Useful for GridSearchCV 
            
            OUTPUTS:
            ------------
                no direct outputs
        """
        super().__init__(**kwargs)
        
        self.X = X
        self.word_prep = word_prep
        self.remove_stopwords = remove_stopwords
        self.lowercase=False
        

    def prepare_doc(self, text):
        #print(self.word_prep)
        """ Function that will 
            - replace urls with spaceholder
            - remove punctuation
            - remove stopwords
            - stem/lemmatize words 
            - normalize all words to lower case 
            - remove white spaces

            INPUTS:
            ------------
                text (string) - message text

            OUTPUTS:
            ------------
                clean_tokens (list) - of cleaned words
    
        """
    
        # Detect URLs
        url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        detected_urls = re.findall(url_regex, text)
        for url in detected_urls:
            text = text.replace(url, "urlplaceholder")

        # Remove punctuation
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)
        
        tokens = word_tokenize(text)
        
        # Remove stopwords
        if self.remove_stopwords == True:
            tokens = [t for t in tokens if t not in stopwords.words('english')]  
        else:
            pass

        # Stem, normalize all words to lower case, remove white spaces 
        if self.word_prep == 'stem':
            clean_tokens = [PorterStemmer().stem(tok).lower().strip() for tok in tokens]

        # Lemmatize, normalize all words to lower case, remove white spaces 
        if self.word_prep == 'lemmatize':
            clean_tokens = [WordNetLemmatizer().lemmatize(tok).lower().strip() for tok in tokens]
        #print(clean_tokens)
        return clean_tokens
        
    def get_params(self, deep=True):
        """ Overwrite get_params in CountVectorizer base class 
            create new get_params() including the new property word_prep
            
            INPUTS:
            ------------
                deep - parameter in get_params function of base class
            
            OUTPUTS:
            ------------
                params (dictionary) - new parameter dictionary
        """
        params = super().get_params(deep)
        
        # Hack to make get_params return base class params...
        cp = copy.copy(self)
        cp.__class__ = CountVectorizer
        
        params.update(CountVectorizer.get_params(cp, deep))
        return params
        
    def build_analyzer(self):
        """ Overwrite build_analyzer in CountVectorizer base class 
            
            INPUTS:
            ------------
            
            OUTPUTS:
            ------------
                call prepare_doc, transform training (testing) features, 
                return cleaned lists of word tokenized messages
        """
        preprocess = self.build_preprocessor()
        return lambda doc : preprocess(self.decode(self.prepare_doc(doc)))
        

In [7]:
def tfid_transform(X):
    """ Get weighted idfs for each word of the Bag-of-Words (CountVectorizer matrix) 
        
        INPUTS:
        ------------
            X (numpy.ndarray) - of training (testing) features 
        
        OUTPUTS:
        ------------
            df_idf.sort_values(by=['idf_weights']) (pandas DataFrame) -  with all words from Bag-of-Words as index 
                                                                         and idf_weights as one column
    """
    # Build the pipeline
    pipeline = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
            ])
    
    print('Compute the IDF values ...')
    pipeline.fit(X)
    
    # create df_idf DataFrame
    df_idf = pd.DataFrame(pipeline['tfidf'].idf_, index=pipeline['vect'].get_feature_names(),columns=["idf_weights"]) 
    
    # sort ascending 
    return df_idf.sort_values(by=['idf_weights'])
    

### 3. Custom Transformer classes
Let's create Custom Transformer classes to implement special NLP pipelines.

In [8]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """ To know the parts of speech (like nouns, verbs, pronouns) 
        can help to understand the meaning of a sentence better.
        This class checks if the first word of a sentence is a verb.
        if yes --> return True
        if no --> return False
    
    """
    def starting_verb(self, text):
        """ function that 
            - divides a text string into a list of sentences 
            - checks if the first word of a sentence is a verb
            
            INPUTS: 
            ------------
            text - a string of text
            
            OUTPUTS:
            ------------
            True - if verb
            False - if anything else than verb
        """
            
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        """ fit function for estimator (object that learns from data), 
            here estimator is an instance of StartingVerbExtractor class
            
            INPUTS:
            ------------
            x - 2d array X of the dataset features  
            y - 1d array y of the dataset target labels
            
            OUTPUTS
            ------------
            self - allows to chain methods together. This method is required to be compatible with scikit-learn
        """
        return self

    def transform(self, X):
        """ function which includes the code to transform the data
            
            INPUTS:
            ------------
            x - 2d array X of the dataset features
            
            OUTPUTS:
            ------------
            df_x_tagged - a DataFrame of X_tagged (containing a column with True and False values) 
                          this transformer object will be appendend to the pipeline object via Feature Union
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        df_x_tagged = pd.DataFrame(X_tagged)
        return df_x_tagged

### 4 Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In order to successfully predict classification results for the 36 categories a pipeline helps to organize data preporcessing and data flow based on ETL and NLP.
The used pipeline is structured in the following way:

In [9]:
def build_model(X_train, Y_train, pipeline_name='pipeline_1'):
    """ Build a ML pipelines
        Test different pipelines
        - pipeline_1: standard based on CountVectorizer, TfidfTransformer and MultiOutputClassifier
        - pipeline_2: as pipeline_1 but with a CustomVectorizer and GridSearchCV to find optimized parameters
        - pipeline_3: add the CustomTransformer 'StartingVerbExtractor' into pipeline via Feature Union
        
        INPUTS:
        ------------
        pipeline_name - string name for calling a specific pipeline
        X_train - numpy.ndarray, input features for training 
        Y_train - numpy.ndarray, target values 
        
        OUTPUTS:
        ------------
        cv  - model based on sklearn GridSearchCV and actual parameter settings
        pipeline - model based on sklearn Pipeline (including ETL and NLP processing steps) 
        parameters - dictionary of GridSearchCV parameters
        
    """
    if pipeline_name == 'pipeline_1':
        print('pipeline_1 chosen')
        pipeline = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(RandomForestClassifier())),
                #('clf', SGDClassifier()),
            ])
        return pipeline, None, None
    
    
    if pipeline_name == 'pipeline_2':
        print('pipeline_2 chosen')
        pipeline = Pipeline([
            ('nlp', Pipeline([
                #('tokenizer', CustomTokenizer(word_prep='stem')),
                ('vect', CustomVectorizer(X_train, word_prep='lemmatize')),
                ('tfidf', TfidfTransformer()),
            ])),
            ('classifier', PipelineHelper([
                ('rfc', MultiOutputClassifier(RandomForestClassifier())),
                ('abc', MultiOutputClassifier(
                        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1, class_weight='balanced'))))
            ]))
        ])
       
        # uncommenting more parameters will give better exploring power but will
        # increase processing time in a combinatorial way
        parameters = {
                #'nlp__vect__word_prep': ('stem', 'lemmatize'),
                'nlp__vect__remove_stopwords': (True, 'False'),
                #'nlp__vect__max_df': (0.5, 0.75, 1.0),
                #'nlp__vect__max_features': (None, 5000, 10000, 50000),
                #'nlp__vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
                #'nlp__tfidf__use_idf': (True, False),
                #'nlp__tfidf__norm': ('l1', 'l2'),
            
            'classifier__selected_model': pipeline.named_steps['classifier'].generate({
                #'rfc__estimator__n_estimators': [10, 20],
                #'rfc__estimator__min_samples_split': [2, 5]
                'abc__estimator__learning_rate': [0.1, 0.3],
                'abc__estimator__n_estimators': [100, 200],
            })
        }
        cv = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=1, verbose=1)
        #cv.fit(X_train, Y_train)
        return cv, pipeline, parameters

        
    if pipeline_name == 'pipeline_3':
        print('pipeline_3 chosen')
        pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier()))
        ])
        parameters = {
            #'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
            #'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
            'features__text_pipeline__vect__max_features': (None, 5000, 10000),
            'features__text_pipeline__tfidf__use_idf': (True, False),
            #'clf__estimator__n_estimators': [50, 100, 200],
            #'clf__estimator__min_samples_split': [2, 3, 4],
            'features__transformer_weights': (
                {'text_pipeline': 1, 'starting_verb': 0.5},
                {'text_pipeline': 0.5, 'starting_verb': 1},
                #{'text_pipeline': 0.8, 'starting_verb': 1},
            )
        }
        
        cv = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=1, verbose=1)
        #cv.fit(X_train, Y_train)
        return cv, pipeline, parameters
    

In [10]:
def evaluate_model(model, df, X_train, X_test, Y_test, category_names, pipeline_name):
    """ Get a classification report from testing results including precision, recall, f1-score an accuracy 
        
        INPUTS:
        ------------
            model (sklearn object) - the trained model based on the actual pipeline
            X_test (pandas DataFrame) - with test input features 
            Y_test (pandas DataFrame) - of true target values 
        
        OUTPUTS:
        ------------
            print statements for precision, recall, f1-score an accuracy for each category
            print statement for size of Bag-of-Words
            print statements and csv export for message stats
            print statements and csv export for 20 most common words
            print statements and csv export for 100 randomly chosen messages (in raw format and tokenized)
        
    """
    # Make predictions based on trained model
    Y_pred = model.predict(X_test)
    
    #print(classification_report(y_test, y_pred, target_names=y_test.keys()))
    accuracy = (Y_pred == Y_test).mean()
    df_classification_report = pd.DataFrame(classification_report(Y_test, Y_pred, target_names=Y_test.keys(),  output_dict=True))
    df_classification_report = pd.concat([df_classification_report.T, accuracy], axis=1).reindex(df_classification_report.T.index)
    df_classification_report.columns = ['f1_score', 'precision', 'recall', 'support', 'accuracy']
    print(pipeline_name)
    print(df_classification_report)
    print(' ')
    print('Total accuracy = ' + str(round(accuracy.mean(),2)))
    print(' ')
    
    # get most commomn words
    most_comon_words = tfid_transform(X_train)
    most_comon_words.to_csv('most_common_words.csv')
    print('20 most common words')
    print(list(most_comon_words.index)[:20])
    print(' ')
    print('Size of Bag-of-Words: ', len(list(most_comon_words.index)))
    print(' ')
    print('... most_common_words.csv saved!')
    print(' ')
    
    # Check 100 randomly chosen messages --- coming from X_train with and without tokenization
    rand_set = np.random.randint(df.shape[0], size=(1, 100))
    message_raw = []
    message_tok = []
    for index in rand_set[0]:
        try:
            print(X_train[index])
            print('')
            print(tokenize(X_train[index]))
            print('--------------------------------------------------------')
            message_raw.append(X_train[index])
            message_tok.append(tokenize(X_train[index]))
        except:
            pass
        
    message_set = pd.DataFrame({'message_raw': message_raw, 'message_tok': message_tok})
    message_set.to_csv('message_set.csv')
    print('... message_set.csv saved!')
    print(' ')
    
    # distribution of word counts for each genre
    # create boxplot and Histograms: What is the distribution of word-count for each genre? Are there any outliers?
    print('Tokenize messages ...')
    message_stats_direct = df[df['genre'] == 'direct']['message'].apply(lambda x: len(tokenize(x)))
    message_stats_news = df[df['genre'] == 'news']['message'].apply(lambda x: len(tokenize(x)))
    message_stats_social = df[df['genre'] == 'social']['message'].apply(lambda x: len(tokenize(x)))
    print('Median of direct message word count: ', message_stats_direct.median())
    print('Median of news message word count: ', message_stats_news.median())
    print('Median of social message word count: ', message_stats_social.median())
    print(' ')
    message_stats_direct.to_csv('message_stats_direct.csv')
    print('... message_stats_direct.csv saved!')
    message_stats_news.to_csv('message_stats_news.csv')
    print('... message_stats_news.csv saved!')
    message_stats_social.to_csv('message_stats_social.csv')
    print('... message_stats_social.csv saved!')
        
    print('... messages_stats.json saved!')
    print(' ')
    

### 5. Train pipeline
- Split data into train and test sets
- Train pipeline

In [11]:
def save_model(model, model_filepath):
    """ Save the model
        
        INPUTS:
        ------------
        model: model to be saved
        model_filepath: filepath to model
        
        OUTPUTS:
        ------------
        save model as a pickle file
    """  
    pickle.dump(model, open(model_filepath, 'wb'))

In [12]:
def start(pipeline_names):
    """ Main function to trigger model training, model evaluation and model saving
        Function that tiggers 
        - load_data
        - train_test_split
        - build_model
        - model.fit 
        - evaluate_model
        - save_model
    
        INPUTS:
        ------------
            pipeline_names (list) - of pipelines to execute, e.g. ['pipeline_2', 'pipeline_3']
                                    These pipelines are called via the pipeline_names list 
        
        OUTPUTS:
        ------------
            no odirect outputs, however the model is stored as a pickle file to disk
        
    """
   
    database_filepath = 'disaster.db'
    model_filepath = 'models/classifier.pkl'
    print('Loading data...\n    DATABASE: {}'.format(database_filepath))
    
    # load data
    X, Y, category_names, df = load_data(database_filepath)
    
    # train test split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    
    # start pipelining, build the model
    for pipeline_name in tqdm.tqdm(pipeline_names):
        print('Building model...')
        model, pipeline, parameters = build_model(X_train, Y_train, pipeline_name)
        
        if pipeline_name in ['pipeline_2', 'pipeline_3']:
            print("Performing grid search...")
            print("pipeline:", [name for name, _ in pipeline.steps])
            print("parameters:")
            print(parameters)
            
            t0 = time()
        
        # train the model
        print('Training model...')
        model.fit(X_train, Y_train)
        #print(pipeline['vect'].get_feature_names())
        
        if pipeline_name in ['pipeline_2', 'pipeline_3']:
            print("done in %0.3fs" % (time() - t0))
            print()

            print("Best score: %0.3f" % model.best_score_)
            print("Best parameters set:")
            best_parameters = model.best_estimator_.get_params()
            for param_name in sorted(parameters.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
        print('Evaluating model...')
        
        # evaluate the model
        evaluate_model(model, df, X_train, X_test, Y_test, category_names, pipeline_name)

        # save the model
        path, filename = os.path.split(model_filepath)
        base, ext  = os.path.splitext(filename)
        model_filepath = os.path.join(path, base + '_' + pipeline_name + '.pkl')
        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

start('pipeline_1')

Loading data...
    DATABASE: disaster.db
DATAFRAME df
   id                                            message  \
0   2  Weather update - a cold front from Cuba that c...   
1   7            Is the Hurricane over or is it not over   
2   8                    Looking for someone but no name   
3   9  UN reports Leogane 80-90 destroyed. Only Hospi...   
4  12  says: west side of Haiti, rest of the country ...   

                                            original   genre  related  \
0  Un front froid se retrouve sur Cuba ce matin. ...  direct        1   
1                 Cyclone nan fini osinon li pa fini  direct        1   
2  Patnm, di Maryani relem pou li banm nouvel li ...  direct        1   
3  UN reports Leogane 80-90 destroyed. Only Hospi...  direct        1   
4  facade ouest d Haiti et le reste du pays aujou...  direct        1   

   request  offer  aid_related  medical_help  medical_products  ...  \
0        0      0            0             0                 0  ...   
1  

  0%|          | 0/10 [00:00<?, ?it/s]

Building model...





TypeError: cannot unpack non-iterable NoneType object

### 6. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

### --> see build_model --> pipeline_1

### 7. Improve your model
Use grid search to find better parameters. 

### --> see build_model --> pipeline_2

### 8. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

### 9. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

### --> see build_model --> pipeline_3

### 10. Export your model as a pickle file

### see save_model

### 11. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

### 12. Evaluation 

This is the last but very important part of a CRISP-DM analysis. In this section the following impoortant business questions will be answered:

- Question 1: How are the three different 'genre' types distributed?
- Question 2: What is the distribution of letters-count for each genre? Are there any outliers?
- Question 3: What is the distribution of words-counts for each genre? Are there any outliers?

### Load the data

In [None]:
database_filepath = 'disaster.db'
model_filepath = 'models/classifier.pkl'
X, Y, category_names, df = load_data(database_filepath)

#### Question 1: How are the three different 'genre' types distributed?

In [None]:
import plotly.graph_objects as go

# as a bar chart    
graph = []
# extract data needed for visuals
genre_counts = df.groupby('genre').count()['message']
genre_names = list(genre_counts.index)

graph.append(
  go.Bar(
  x = genre_names,
  y = genre_counts,
  )
)

layout = dict(title = 'Distribution of Message genres',
            xaxis = dict(title = 'Count',),
            yaxis = dict(title = 'Genre'),
            )

# append all charts to the figures list
figures = []
figures.append(dict(data=graph, layout=layout))
#figures.append(dict(data=graph_two, layout=layout_two))

fig = go.Figure(figures[0])

fig.show()

In [None]:
# Alternative to plotly: Values counts for type of message
df['genre'].value_counts()
sns.barplot(x=df['genre'].value_counts().index, y=df['genre'].value_counts());

#### Question 2: What is the distribution of letters-count for each genre? Are there any outliers?

In [None]:
# as a Boxplot    

graph = []
# extract data needed for visuals
message_length_direct = df[df['genre'] == 'direct']['message'].apply(lambda x: len(x))
message_length_news = df[df['genre'] == 'news']['message'].apply(lambda x: len(x))
message_length_social = df[df['genre'] == 'social']['message'].apply(lambda x: len(x))
messages_box = [('direct', message_length_direct), ('news', message_length_news), ('social', message_length_social)]

graph = [go.Box(y=message_length, name=message_type) for message_type, message_length in messages_box]
    
layout = dict(title = 'Letters-count descriptive stats for each genre')

# append all charts to the figures list
figures = []
figures.append(dict(data=graph, layout=layout))

fig = go.Figure(figures[0])

fig.show()

In [None]:
# as a Histogram    

graph = []
# extract data needed for visuals
message_length_direct = df[df['genre'] == 'direct']['message'].apply(lambda x: len(x))
message_length_news = df[df['genre'] == 'news']['message'].apply(lambda x: len(x))
message_length_social = df[df['genre'] == 'social']['message'].apply(lambda x: len(x))
messages_box = [('direct', message_length_direct), ('news', message_length_news), ('social', message_length_social)]

graph = [go.Histogram(x=message_length, name=message_type) for message_type, message_length in messages_box]
    
layout = dict(title = 'Letters-count distribution for each genre',
                    xaxis = dict(title = '# letters'),
                    yaxis = dict(title = 'Count', type='log'),
                    )

# append all charts to the figures list
figures = []
figures.append(dict(data=graph, layout=layout))

fig = go.Figure(figures[0])
#fig.update_layout(yaxis_type="log") 
fig.show()

#### Question 3: What is the distribution of words-counts for each genre? Are there any outliers?

In [None]:
# as a Boxplot    

graph = []
# extract data needed for visuals

messages_stats_direct = pd.read_csv(open('message_stats_direct.csv'))
messages_stats_news = pd.read_csv(open('message_stats_news.csv'))
messages_stats_social = pd.read_csv(open('message_stats_social.csv'))
messages_stats = {'direct': messages_stats_direct, 'news': messages_stats_news, 'social': messages_stats_social}


graph = [go.Box(y=message_length.iloc[:,1], name=message_type) for message_type, message_length in messages_stats.items()]
    
layout = dict(title = 'Word-count descriptive stats for each genre')

# append all charts to the figures list
figures = []
figures.append(dict(data=graph, layout=layout))

fig = go.Figure(figures[0])

fig.show()

In [None]:
# as a Histogram    

graph = []
# extract data needed for visuals
# 
graph = [go.Histogram(x=message_length.iloc[:,1], name=message_type) for message_type, message_length in messages_stats.items()]
    
layout = dict(title = 'Word-count distribution for each genre',
                    xaxis = dict(title = '# words'),
                    yaxis = dict(title = 'Count', type='log'),
                    )

# append all charts to the figures list
figures = []
figures.append(dict(data=graph, layout=layout))

fig = go.Figure(figures[0])
#fig.update_layout(yaxis_type="log") 
fig.show()

#### Question 4: What are the 20 most common words in the training set?

In [None]:
# Get the dataset
database_filepath = 'disaster.db'
model_filepath = 'models/classifier.pkl'
print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names, df = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [None]:
# Check 100 randomly chosen messages --- coming from X_train with and without tokenization
rand_set = np.random.randint(df.shape[0], size=(1, 100))
rand_set
message_raw = []
message_tok = []
for index in rand_set[0]:
    try:
        print(X_train[index])
        print('')
        print(tokenize(X_train[index]))
        print('--------------------------------------------------------')
        message_raw.append(X_train[index])
        message_tok.append(tokenize(X_train[index]))
    except:
        pass
message_set = pd.DataFrame(zip(message_raw,message_tok), columns=['message_raw', 'message_tok'])
message_set.to_csv('message_set.csv')

In [None]:
most_comon_words = tfid_transform(X_train)

In [None]:
most_comon_words[:20]

In [None]:
import plotly.graph_objects as go

# as a bar chart    
graph = []
# extract data needed for visuals
most_comon_words = pd.read_csv('most_common_words.csv', index_col=[0])
print(most_comon_words.head())
y = list(most_comon_words.index)[:20]
print(y)
x = most_comon_words['idf_weights']
graph.append(
  go.Bar(
  x = x,
  y = y,
  orientation='h'
  )
)

layout = dict(title = 'Most common words after tokenization',
            xaxis = dict(title = 'Most common words',),
            yaxis = dict(title = 'idf_weights'),
            )

# append all charts to the figures list
figures = []
figures.append(dict(data=graph, layout=layout))
#figures.append(dict(data=graph_two, layout=layout_two))

fig = go.Figure(figures[0])

fig.show()

##### Result:
Notice for words with lowest IDF values it is expected that these words appear more often. For idf_weights=1 they would appear in each and every document in our collection. The lower the IDF value of a word, the less unique it is to any particular document.

#### Question 5: Are there any significant correlations between the categories?

In [None]:
# as a bar Corrlation plot    

graph = []
# extract data needed for visuals
corr_x = df.corr().index
corr_y = df.corr().index

graph.append(go.Heatmap(
                    x=corr_x,
                    y=corr_y,
                    z=df.corr().values,
                    type = 'heatmap',
                    colorscale = 'Viridis')
            )
    
layout = dict(title = 'Correlation matrix',
                    xaxis = dict(automargin=True),
                    yaxis = dict(automargin=True),
             )

# append all charts to the figures list
figures = []
figures.append(dict(data=graph, layout=layout))

fig = go.Figure(figures[0])
#fig.update_layout(yaxis_type="log") 
fig.show()