In [1]:
# import libraries
import os
import re

import pandas as pd
from sqlalchemy import create_engine

import nltk
from nltk import pos_tag, ne_chunk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.metrics import classification_report

In [2]:
# connect to the SQLite database and read the 'messages' table into a DataFrame
engine = create_engine('sqlite:///./data/disaster_response.db')
df = pd.read_sql_table(table_name='messages', con=engine)

# target columns: every column that is not one of the excluded names below
labels = [
    column
    for column in df.columns
    if column not in ('id', 'message', 'original', 'genre', 'related')
]

# model input (the 'message' column) and the matrix of label columns
X = df.loc[:, 'message'].values
y = df.loc[:, labels].values

In [4]:
# tokenization function
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # stopwords.words() rebuilds the list on every call; cache it so the
        # corpus is read once and membership tests are O(1).
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def tokenize(text):
    """Normalize a message and split it into stemmed, lemmatized tokens.

    Steps, in order:
      1. lower-case the text,
      2. replace every non-alphanumeric character with a space,
      3. collapse runs of spaces,
      4. split into words with ``word_tokenize``,
      5. drop English stop words,
      6. apply ``PorterStemmer`` and then ``WordNetLemmatizer`` to each word.

    Args:
        text (str): the raw message.

    Returns:
        list of str: the cleaned tokens, in their original order.
    """
    clean_text = text.lower()  # convert all chars to lower case
    clean_text = re.sub(r"[^a-zA-Z0-9]", " ", clean_text)  # remove non alpha-numeric characters
    clean_text = re.sub(' +', ' ', clean_text)  # remove duplicate spaces

    # tokenize text and filter stop words against the cached set
    stop_words = _english_stop_words()
    words = [w for w in word_tokenize(clean_text) if w not in stop_words]

    # reduce words to their stems
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(w) for w in words]

    # reduce words to root form
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(w) for w in stemmed]

    return clean_tokens

In [5]:
# text classification pipeline:
#   token counts (using the custom tokenizer) -> TF-IDF weighting
#   -> a random forest wrapped in MultiOutputClassifier for the multi-label target
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        MultiOutputClassifier(
            estimator=RandomForestClassifier(n_jobs=-1, random_state=42)
        ),
    ),
])

In [6]:
# hold out 20% of the samples for evaluation; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# train the whole pipeline on the training portion
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x000001F66638D670>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=-1,
                                                                        random_state=42)))])

In [7]:
# predict on test data
y_pred = pipeline.predict(X_test)

# print model results
# Some labels have no predicted (or no true) samples in the test split, which
# makes precision/recall undefined. zero_division=0 reports those scores as 0
# explicitly instead of emitting an UndefinedMetricWarning for each case.
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0))

                        precision    recall  f1-score   support

               request       0.83      0.72      0.77       733
                 offer       0.00      0.00      0.00         1
           aid_related       0.82      0.75      0.78       801
          medical_help       0.71      0.11      0.19       111
      medical_products       0.83      0.07      0.13        71
     search_and_rescue       1.00      0.02      0.04        46
              security       0.00      0.00      0.00        24
              military       0.00      0.00      0.00        12
           child_alone       0.00      0.00      0.00         0
                 water       0.93      0.79      0.86       153
                  food       0.92      0.82      0.87       312
               shelter       0.87      0.46      0.60       211
              clothing       0.00      0.00      0.00        15
                 money       0.00      0.00      0.00        22
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
