In [5]:
# import libraries
import sys
import numpy as np
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords'])

import sqlalchemy as sqla
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
import pickle

AttributeError: module 'numpy' has no attribute 'testing'

In [None]:
def load_data(database_filepath):
    '''
    INPUT:  
        database_filepath (str): database with table name "Messages" having processed messages
    OUTPUT: 
        X (pandas dataframe): messages column
        Y (pandas dataframe): category columns marked as 1 if the message belongs to that category 
        category_names (list of strings): list of category names
    DESCRIPTION:
            read table named "Messages" from the given database
            and select 'message' as X and all ccategories columns as Y
            and get list of catefories as category_names
    '''

    engine = sqla.create_engine('sqlite:///'+database_filepath)
    df = pd.read_sql('SELECT * FROM DisasterMessages', engine)
    X = df['message']
    #Y = df.iloc[:,4:]
    Y = df.drop(['id', 'message', 'original', 'genre'], axis = 1)

    category_names = Y.columns.values

    return X, Y, category_names


In [None]:
def tokenize(text):
    '''
        The function is to process the sentence, token the words and lower it.
        arg: str text
        return:list
        '''
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # tokenize text
    word_list = word_tokenize(text)
    '''
        # remove stop words
    tokens = [w for w in word_list if w not in stopwords.words("english")]

    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens
    '''
    
    # Remove stop words
    stemmer = PorterStemmer()
    stop_words = stopwords.words("english")
    
    stemmed = [stemmer.stem(word) for word in word_list if word not in stop_words]
    
    return stemmed
    

In [2]:
def build_model():
    '''
        The function is to build a pipeline and using gridsearch to training model.
        The pipeline including countVectorizer, TfidfTransformer to process the text and using
        RandomForestClassifier to fit the dataset
    '''

    # create ML pipeline
    '''
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier())])
   
    
    pipeline = Pipeline([
                            ('vect', CountVectorizer(tokenizer = tokenize, min_df = 5)),
                            ('tfidf', TfidfTransformer(use_idf = True)),
                            ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators = 5, min_samples_split = 10)))
                        ])

    # specify parameters for grid search
    ''' 
       parameters = {
            'vect__ngram_range': ((1, 1), (1, 2)),
            'vect__max_df': (0.5, 0.75, 1.0),
            'vect__max_features': (None, 5000, 10000),
            'tfidf__use_idf': (True, False),
            'clf__n_estimators': [50, 100, 200],
            'clf__min_samples_split': [2, 3, 4]
        }

   
    parameters = {'vect__ngram_range': ((1, 1), (1, 2)),
                 'tfidf__use_idf': (True, False),
                 'clf__estimator__n_estimators':[50, 100, 200],
                 'clf__estimator__max_depth':[50, 500, 1000, 5000],
                 'clf__estimator__max_features': [None, 5000, 10000],
                 'clf__estimator__min_samples_split':[3, 5, 9]} 
   
    parameters = {"clf__estimator__n_estimators": [50, 100],
                "clf__estimator__learning_rate": [0.5,1]
                }  
    parameters = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000),
        'tfidf__use_idf': (True, False),
        'clf__n_estimators': [50, 100, 200],
        'clf__min_samples_split': [2, 5, 10]
    }

    
  
    parameters = {'vect__min_df': [1, 5],
                  'tfidf__use_idf':[True, False],
                  'clf__estimator__n_estimators':[10, 25], 
                  'clf__estimator__min_samples_split':[2, 5, 10]}
    
    model = GridSearchCV(pipeline, param_grid=parameters, verbose = 10)
    '''
     pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultiOutputClassifier(
                            OneVsRestClassifier(LinearSVC())))])

    # hyper-parameter grid
    parameters = {'vect__ngram_range': ((1, 1), (1, 2)),
                  'vect__max_df': (0.75, 1.0)
                  }

    # create model
    model = GridSearchCV(estimator=pipeline,
            param_grid=parameters,
            verbose=3,
            cv=3)
    
    return model

IndentationError: unexpected indent (<ipython-input-2-634ed0427397>, line 23)

In [5]:
def evaluate_model(model, X_test, Y_test, category_names):
    '''
        The function is to return the results of prediction on test dataset, including precision socre,
        f1-score and recall score.
        args: model, test dataset and category names
        return: dict - the classification report of category names
    

    y_pred = model.predict(X_test)  # prediction
    prediction = pd.DataFrame(y_pred.reshape(-1, 36), columns=category_names)  # transform list to dataframe
    report = dict()
    for i in category_names:
        # iterate the category names and add its classification scores to dictionary
        classification = classification_report(Y_test[i], prediction[i])
        report[i] = classification
      '''  
        
    """
    Evaluate the model against a test dataset
    Args:
        model: Trained model
        X_test: Test features
        Y_test: Test labels
        category_names: String array of category names
    """
    y_preds = model.predict(X_test)
    print(classification_report(y_preds, Y_test.values, target_names=category_names))
    print("**** Accuracy scores for each category *****\n")
    for i in range(36):
        print("Accuracy score for " + Y_test.columns[i], accuracy_score(Y_test.values[:,i],y_preds[:,i]))

    #return report

In [6]:
def save_model(model, model_filepath):
    '''
       INPUT:
           model (str): trained model
           model_filepath (str): pickle file path to save the model
       OUTPUT:
       DESCRIPTION:
               save the model passed as the path given as input
       '''

    pickle.dump(model, open(model_filepath, "wb"))

In [None]:

X, Y, category_names = load_data('DisasterResponse.db')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

category_names

print('Building model...')
model = build_model()

print('Training model...')
model.fit(X_train, Y_train)

print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)

print('Saving model...\n ')
save_model(model, model_filepath)

print('Trained model saved!')