In [1]:
import sys
import pandas as pd
import string
import pickle
import re
from sqlalchemy import create_engine
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
def load_data(database_filepath):
    '''
    Loads the data from the sql database in database_filepath
    return feature as X and results as y

    Args:
        database_filepath: path to the sql database

    returns:
        X: panda dataframe : feature: list of messages
        y: panda dataframe : result: classification of each message
        category_names: list of categories
    '''
    # load data from database
    engine = create_engine('sqlite:///{}'.format(database_filepath))
    df = pd.read_sql_table('disaster_messages', engine)

    # As feature is used the messages (translated in english)
    X = df['message']
    # classification for each message
    y = df.iloc[:, 3:]

    return X, y, y.columns

In [3]:
def tokenize(text, stop_words=set(stopwords.words('english'))):
    ''' Remove punctuation, strip unncessary spaces, lematize,
    remove stopwords and tokenize text

    Args:
        text: str
    returns:
        clean_tokens: list of tokens
    '''
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[0-9]+', '', text)
    tokens = word_tokenize(text)

    filtered_tokens = [w for w in tokens if not w.lower() in stop_words]

    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for token in filtered_tokens:
        clean_token = lemmatizer.lemmatize(token).lower().strip()
        clean_tokens.append(clean_token)

    return clean_tokens

In [4]:
def build_model():
    '''
    build the model

    Args: None

    returns:
        pipeline
    '''
    pipeline = Pipeline([
                    ('vect', CountVectorizer(tokenizer=tokenize,  ngram_range= (1, 2))),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultiOutputClassifier(RandomForestClassifier(min_samples_split=2, n_estimators=20)))
                    ])
    
    # specify parameters for grid search
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'clf__estimator__n_estimators': [5, 10, 20],
                  'clf__estimator__min_samples_split': [2, 5]
                 }

    # create grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv

In [5]:
def evaluate_model(model, X_test, y_test, category_names):
    ''' calculate the f1 score for each category and overall average
    Args:
        model: trained model
        X_test: test data
        y_test: expected output for X_test
        category_names: list of categories / features
    Return:
        print f1 score for categories and average over all categories
        df_results: pandas dataframe with all results
    '''
    # make model predictions
    y_pred = model.predict(X_test)

    # Creating a dictionary to store results
    results = {}

    # looping though all categories an storing their precision, recall and f1_score
    for i in range(0, 36):
        precision, recall, fscore, support = score(y_test.iloc[:,i],y_pred[:,i],average='weighted')
        # adding results to the dictionary
        results[category_names[i]] = [precision, recall, fscore]

    # Creating pandas dataframe with results
    df_results = pd.DataFrame.from_dict(results, orient='index', columns=['precision', 'recall', 'fscore'])
    
    # printing overall modell performance
    print('overall f1_score: {}\n'.format(df_results.fscore.mean()))
    
    # printing category results
    print(df_results)

    return df_results

In [6]:
def save_model(model, model_filepath):
    ''' save the model to disk
    Arg:
        model: Machine learning model
        model_filepath: str path to file
    returns: None
    '''
    # save the model to dish to model_filepath
    pickle.dump(model, open(model_filepath, 'wb'))

    pass


In [7]:
database_filepath = '../data/DisasterResponse.db'
model_filepath = '../models/classifier.pkl'

In [8]:
print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

Loading data...
    DATABASE: ../data/DisasterResponse.db


In [9]:
print('Building model...')
model = build_model()

Building model...


In [10]:
print('Training model...')
model.fit(X_train, Y_train)

Training model...


In [12]:
print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)

Evaluating model...
overall f1_score: 0.9352162818167749

                        precision    recall    fscore
related                  0.796063  0.809651  0.796328
request                  0.889799  0.894335  0.882908
offer                    0.989348  0.994660  0.991996
aid_related              0.766568  0.767690  0.764845
medical_help             0.903776  0.922182  0.893064
medical_products         0.944599  0.950601  0.933168
search_and_rescue        0.970046  0.974442  0.962726
security                 0.983313  0.983025  0.974797
military                 0.960113  0.969483  0.956847
child_alone              1.000000  1.000000  1.000000
water                    0.957066  0.959374  0.953488
food                     0.938276  0.941255  0.936543
shelter                  0.928904  0.935152  0.923599
clothing                 0.983737  0.985504  0.980336
money                    0.971466  0.978066  0.968286
missing_people           0.982304  0.987984  0.982381
refugees                

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,fscore
related,0.796063,0.809651,0.796328
request,0.889799,0.894335,0.882908
offer,0.989348,0.99466,0.991996
aid_related,0.766568,0.76769,0.764845
medical_help,0.903776,0.922182,0.893064
medical_products,0.944599,0.950601,0.933168
search_and_rescue,0.970046,0.974442,0.962726
security,0.983313,0.983025,0.974797
military,0.960113,0.969483,0.956847
child_alone,1.0,1.0,1.0


In [13]:
print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

Saving model...
    MODEL: ../models/classifier.pkl


In [8]:
def main():
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

        print('Building model...')
        model = build_model()

        print('Training model...')
        model.fit(X_train, Y_train)

        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


if __name__ == '__main__':
    main()


Loading data...
    DATABASE: -f


ValueError: Table disaster_messages not found