In [1]:
# import libraries
import joblib
import pandas as pd
import pickle
import nltk
nltk.download(['stopwords','wordnet'])

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import sys


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yangyinghui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yangyinghui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# params
database_filepath = "sqlite:///DisasterResponse.db"

# functions
def load_data(database_filepath: str):
    """loads data from database_filepath
    input:
        database_filepath: path to the database
    output:
        X: numpy array of texts
        y: numpy array of target categories
        category_names: list of target category names
    """

    engine = create_engine(database_filepath)
    df = pd.read_sql_table("tbl_yyh_disaster_response_clean_data", engine)
    X = df[['message', 'genre']].head(2000)
    y = df.head(2000).drop(['message', 'genre'], axis=1).values
    category_names = list(df.columns)[2:]
    return X, y, category_names


def tokenize(text):
    tokens = word_tokenize(text)
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()

    clean_tokens = [
        lemmatizer.lemmatize(word).lower().strip()
        for word in tokens
        if word not in stop_words
    ]

    return clean_tokens


In [3]:
# load data and train test split
X, Y, category_names = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X.message.values, Y, test_size=0.2)

In [4]:
# I would like to select the best model out of a few algorithms
# use gridsearchcv to select the best one
def choose_model():
    """
    input: None
    Output: model pipeline
    """

    # build pipeline
    pipeline = Pipeline([
        ("vect", CountVectorizer(
            tokenizer=tokenize, 
            ngram_range=(1,1), 
            max_df=0.5, 
            max_features=None)
        ),
        ("tfidf", TfidfTransformer(use_idf=True)),
        ("clf", MultiOutputClassifier(LogisticRegression()))
    ])
    parameters = [
        {
            "clf": [MultiOutputClassifier(LogisticRegression(class_weight='balanced', random_state=42))]
        },{
            "clf": [MultiOutputClassifier(KNeighborsClassifier(n_neighbors=4))]
        },{
            "clf": [MultiOutputClassifier(RandomForestClassifier(n_estimators=200, random_state=42))]
        },{
            "clf": [MultiOutputClassifier(DecisionTreeClassifier(criterion='entropy', random_state=42))]
        }
    ]

    cv = GridSearchCV(pipeline, parameters)

    return cv

In [5]:
model_selection = choose_model()
model_selection.fit(X_train, Y_train)
print(model_selection.best_params_)

{'clf': MultiOutputClassifier(estimator=RandomForestClassifier(n_estimators=200,
                                                       random_state=42))}


The random forest classifier outperforms the other ones so we will use random forest for the project.

In [6]:
# build the random forest model to get best parameters with gridsearchcv
def build_rf_model():
    """
    input: None
    Output: model pipeline for random forest model
    """

    # build pipeline
    pipeline = Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize, max_df=0.5)),
        ("tfidf", TfidfTransformer()),
        ("clf", MultiOutputClassifier(
            estimator=RandomForestClassifier(random_state=42)
        ))
    ])
    parameters = {
        'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__max_features': ["sqrt", "log2", None],
        "clf__estimator__min_samples_split": [2, 3, 4]
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv


In [7]:
model_rf = build_rf_model()
model_rf.fit(X_train, Y_train)
model_rf.best_estimator_

In [8]:
# evaluate model with precision, recall and f1 score for each category
def evaluate_model(model, X_test, y_test, category_names: list):
    """prints precision, recall, F1 score of the model on the test set
    input:
        model: trained model object to make predictions
        X_test: numpy array
        Y_test: numpy array
        category_names: list of category names
    Output:
        None
    """
    y_pred = model.predict(X_test)
    y_test = pd.DataFrame(y_test, columns=category_names)
    y_pred = pd.DataFrame(y_pred, columns=category_names)

    for column in y_test:
        print(column)
        print(classification_report(
            y_test[column], y_pred[column], zero_division=0
        ))


In [10]:
evaluate_model(model_rf, X_test, Y_test, category_names)

related
              precision    recall  f1-score   support

           0       0.89      0.96      0.92        93
           1       0.99      0.97      0.98       303
           2       1.00      0.75      0.86         4

    accuracy                           0.96       400
   macro avg       0.96      0.89      0.92       400
weighted avg       0.96      0.96      0.96       400

request
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       202
           1       0.96      0.96      0.96       198

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400

offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       400

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       40

With random forest we achieve near to 1 weighted accuracy together with very high precision, recall and f1 scores. Just out of curiosity I want to see how it performs adding a new external feature 'genre'. 

In [11]:
# now I would like to see whether adding a new feature 'genre' would make it better
def build_model_with_genre():
    """
    input: None
    Output: model pipeline
    """
    text_features = 'message'
    text_transformer = Pipeline([
        ("vect", CountVectorizer(
            tokenizer=tokenize,
            ngram_range=(1, 1),
            max_features=None,
            max_df=0.5
        )),
        ("tfidf", TfidfTransformer(use_idf=True))
    ])

    categorical_features = ['genre']
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('text', text_transformer, text_features),
            ('genre', categorical_transformer, categorical_features)
        ]
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ("clf", MultiOutputClassifier(
            estimator=RandomForestClassifier(
                n_estimators=200,
                max_features=None,
                random_state=42
            )
        ))
    ])

    parameters = {
        'preprocessor__transformer_weights': (
            {'text': 1, 'genre': 0},
            {'text': 1, 'genre': 0.5},
            None
        )
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv


In [12]:
X

Unnamed: 0,message,genre
0,Weather update - a cold front from Cuba that c...,direct
1,Is the Hurricane over or is it not over,direct
2,Looking for someone but no name,direct
3,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,"says: west side of Haiti, rest of the country ...",direct
...,...,...
1995,"Good evening people of Boulevard 15 Octobre, w...",direct
1996,the aftershocks. that could help keep people f...,direct
1997,"In the city of Marchand Dessalines, we need fo...",direct
1998,"Good Morning, we are located around the runway...",direct


In [13]:
# add genre to train and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
model_with_genre = build_model_with_genre()
model_with_genre.fit(X_train, Y_train)
model_with_genre.best_estimator_

In [14]:
evaluate_model(model_with_genre, X_test, Y_test, category_names)

related
              precision    recall  f1-score   support

           0       0.91      0.90      0.91        90
           1       0.97      0.97      0.97       308
           2       1.00      1.00      1.00         2

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400

request
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       204
           1       0.97      0.96      0.97       196

    accuracy                           0.97       400
   macro avg       0.97      0.97      0.97       400
weighted avg       0.97      0.97      0.97       400

offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       399
           1       1.00      1.00      1.00         1

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       40

It recommends adding the genre feature, however to better demonstrate the classification with a text message only in the web app, I will use the random forest model without the genre feature. 

In [13]:
# finally save model
def save_model(model, model_filepath: str):
    """saves model as pickle file to filepath
    """
    pickle.dump(model, open(model_filepath, "wb"))
