In [1]:
# import libraries
import sys
import numpy as np
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords'])

import sqlalchemy as sqla
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_data(database_filepath):
    '''
    INPUT:
        database_filepath (str): path to the SQLite database file holding the
            "DisasterMessages" table
    OUTPUT:
        X (pandas Series): the 'message' column
        Y (pandas DataFrame): every column from position 4 onward (the category columns),
            with any value of 2 in 'related' replaced by 1
        category_names (numpy array): column labels of Y
    DESCRIPTION:
        Read the whole "DisasterMessages" table from the given database,
        select 'message' as X and the category columns as Y,
        and return the category column labels as category_names.
    '''

    engine = sqla.create_engine('sqlite:///' + database_filepath)
    try:
        df = pd.read_sql('SELECT * FROM DisasterMessages', engine)
    finally:
        # Release the engine's pooled connection(s) once the table is in memory.
        engine.dispose()

    X = df['message']

    # Columns 0-3 are skipped; presumably id, message, original, genre -- confirm
    # against the table schema. The explicit copy makes Y an independent frame so the
    # assignment below neither raises SettingWithCopyWarning nor depends on whether
    # the slice happens to be a view of df.
    Y = df.iloc[:, 4:].copy()

    # mapping extra values to `1`
    Y.loc[Y['related'] == 2, 'related'] = 1

    category_names = Y.columns.values

    return X, Y, category_names


In [8]:
def tokenize(text):
    '''
    Normalize a message and split it into lemmatized word tokens.

    Steps: lower-case the text, replace every non-alphanumeric character with a
    space, tokenize, drop English stop words, then lemmatize each remaining
    token as a verb.

    arg: str text
    return: list of str
    '''
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # tokenize text
    words = word_tokenize(text)

    # A set gives constant-time membership tests instead of scanning a list per token.
    stopwords_ = set(stopwords.words("english"))

    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()

    # remove stop words and extract root form of the remaining words
    return [lemmatizer.lemmatize(word, pos='v')
            for word in words
            if word not in stopwords_]
    

In [4]:
def build_model():
    '''
        Assemble the text-classification pipeline and wrap it in a grid search.

        Pipeline stages:
            vect  - CountVectorizer using the notebook's `tokenize` function
            tfidf - TfidfTransformer
            clf   - MultiOutputClassifier around OneVsRestClassifier(LinearSVC())

        The grid search tries two n-gram ranges and two max_df values for the
        vectorizer with 3-fold cross-validation.

        return: an unfitted GridSearchCV object
    '''
    # One linear SVM per output column.
    classifier = MultiOutputClassifier(OneVsRestClassifier(LinearSVC()))

    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ]
    text_pipeline = Pipeline(steps)

    # Candidate vectorizer settings explored by the search.
    search_space = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.75, 1.0),
    }

    return GridSearchCV(estimator=text_pipeline,
                        param_grid=search_space,
                        verbose=3,
                        cv=3)

In [12]:
def evaluate_model(model, X_test, Y_test, category_names):
    '''
    Evaluate the model against a test dataset and print the results.

    Prints the classification report (precision, recall, f1-score and support
    per category) followed by an accuracy figure computed as the fraction of
    individual label cells predicted correctly, i.e. averaged over every
    (sample, category) pair.

    Args:
        model: Trained model exposing predict()
        X_test: Test features
        Y_test: Test labels (pandas DataFrame or array-like, one column per category)
        category_names: String array of category names, used as report row labels
    Returns:
        None; the results are only printed.
    '''
    # np.asarray accepts a DataFrame as well as a plain array.
    y_true = np.asarray(Y_test)
    y_preds = model.predict(X_test)

    # print classification report
    print(classification_report(y_true, y_preds, target_names=category_names))

    # print accuracy score
    print('Accuracy: {}'.format(np.mean(y_true == y_preds)))

In [6]:
def save_model(model, model_filepath):
    '''
       INPUT:
           model: trained model (any picklable object)
           model_filepath (str): pickle file path to save the model
       OUTPUT:
           None
       DESCRIPTION:
               Serialize the given model with pickle to the given path,
               overwriting any existing file.
       '''

    # The context manager guarantees the file is flushed and closed even if
    # pickling raises part-way through.
    with open(model_filepath, "wb") as model_file:
        pickle.dump(model, model_file)

In [13]:

X, Y, category_names = load_data('DisasterResponse.db')

# Fixed seed so the train/test split, and therefore the reported metrics,
# are reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

print('Building model...')
model = build_model()

print('Training model...')
model.fit(X_train, Y_train)

print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)



Building model...
Training model...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] vect__max_df=0.75, vect__ngram_range=(1, 1) .....................


  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 1), score=0.28136175082248605, total=  15.6s
[CV] vect__max_df=0.75, vect__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.6s remaining:    0.0s
  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 1), score=0.2785009297668431, total=  19.7s
[CV] vect__max_df=0.75, vect__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   56.2s remaining:    0.0s
  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 1), score=0.2842632331902718, total=  16.8s
[CV] vect__max_df=0.75, vect__ngram_range=(1, 2) .....................


  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 2), score=0.28493777714203977, total=  21.1s
[CV] vect__max_df=0.75, vect__ngram_range=(1, 2) .....................


  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 2), score=0.28365040766700045, total=  18.9s
[CV] vect__max_df=0.75, vect__ngram_range=(1, 2) .....................


  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 2), score=0.2815450643776824, total=  20.0s
[CV] vect__max_df=1.0, vect__ngram_range=(1, 1) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 1), score=0.28136175082248605, total=  19.7s
[CV] vect__max_df=1.0, vect__ngram_range=(1, 1) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 1), score=0.2785009297668431, total=  19.1s
[CV] vect__max_df=1.0, vect__ngram_range=(1, 1) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 1), score=0.2842632331902718, total=  17.5s
[CV] vect__max_df=1.0, vect__ngram_range=(1, 2) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 2), score=0.28493777714203977, total=  20.2s
[CV] vect__max_df=1.0, vect__ngram_range=(1, 2) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 2), score=0.28365040766700045, total=  18.8s
[CV] vect__max_df=1.0, vect__ngram_range=(1, 2) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 2), score=0.2815450643776824, total=  21.8s


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  5.9min finished
  str(classes[c]))


Evaluating model...
                        precision    recall  f1-score   support

               related       0.86      0.94      0.90      4041
               request       0.74      0.66      0.69       911
                 offer       0.00      0.00      0.00        24
           aid_related       0.72      0.78      0.75      2216
          medical_help       0.64      0.35      0.45       420
      medical_products       0.60      0.30      0.40       273
     search_and_rescue       0.42      0.13      0.20       137
              security       0.20      0.01      0.02        83
              military       0.64      0.41      0.50       152
           child_alone       0.00      0.00      0.00         0
                 water       0.73      0.75      0.74       314
                  food       0.81      0.79      0.80       602
               shelter       0.75      0.61      0.68       476
              clothing       0.81      0.63      0.71        81
                 mo

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


NameError: name 'model_filepath' is not defined

In [14]:
# Persist the fitted grid-search model so it can be reloaded without retraining.
print('Saving model...\n ')
pickle_path = 'models/classifier.pkl'
save_model(model, pickle_path)

print('Trained model saved!')

Saving model...
 
Trained model saved!
