In [1]:
# import libraries

import pandas as pd
import numpy as np
from sqlalchemy import create_engine

import pickle
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
def load_data(database_filepath):
    '''
    Reads in the table from the sqlite database as a dataframe and returns feature and response as two different dataframes
    
    Arguments:
        database_filepath -> path of the sqlite database
    
    Output:
        X -> dataframe containg the features
        y -> dataframe containg the response
        category_names -> list of the response columns
    '''
    engine = create_engine(f'sqlite:///{database_filepath}')
    df = pd.read_sql(database_filepath.split('/')[-1].replace(".db",""), con = engine)
    
    X = df.iloc[:,1]
    y = df.iloc[:,4:]
    category_names = y.columns
    
    return X,y,category_names

In [3]:
def tokenize(text):
    '''
    Normalize, remove punctuation, tokenize the words, remove stop words and lemmatize
    
    Arguments:
        text -> 
        
    Output:
        lemmed -> transformed text
    '''
    # Normalize the text
    clean_text = text.lower()
    # Remove punctuation
    clean_text = re.sub(r"[^a-zA-Z0-9]", " ", clean_text)
    # Tokenize the sentence
    words = word_tokenize(clean_text)
    # remove stop words
    words = [w for w in words if w not in stopwords.words('english')]
    # lemmatize or stem the words
    lemmed = [WordNetLemmatizer().lemmatize(w).strip() for w in words]
    
    return lemmed

In [7]:
def build_model():
    '''
    Instanciate the model
    
    Output:
        cv -> cross validation model
    '''
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    
    
    # hyperparameters for the Random forrest classifier 
    parameters = {
        "clf__estimator__n_estimators": [10, 50, 100],
        "clf__estimator__max_depth": [3, 8]
    }
    
    # run cross validation
    cv = GridSearchCV(pipeline, param_grid = parameters, cv = 5)

    return cv

In [8]:
X,Y,category_name = load_data('../data/DisasterResponse.db')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5)

In [9]:
model = build_model()
model.fit(X_train, Y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7fd37ad1af70>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             param_grid={'clf__estimator__max_depth': [3, 8],
                         'clf__estimator__n_estimators': [10, 50, 100]})

In [10]:
model.best_estimator_

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7fd37ad1af70>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(max_depth=3,
                                                                        n_estimators=50)))])

In [8]:
y_pred = model.predict(X_test)
classification_report(y_true = Y_test.values[:,1], y_pred = y_pred[:,1])

'              precision    recall  f1-score   support\n\n           0       0.90      0.98      0.94     10847\n           1       0.84      0.48      0.61      2261\n\n    accuracy                           0.89     13108\n   macro avg       0.87      0.73      0.77     13108\nweighted avg       0.89      0.89      0.88     13108\n'

In [9]:
classification_report(y_true = Y_test.values[:,1], y_pred = y_pred[:,1], output_dict=True)

{'0': {'precision': 0.9001185034704587,
  'recall': 0.9803632340739374,
  'f1-score': 0.9385287498345174,
  'support': 10847},
 '1': {'precision': 0.8353941267387944,
  'recall': 0.47810703228659884,
  'f1-score': 0.6081575246132208,
  'support': 2261},
 'accuracy': 0.8937290204455295,
 'macro avg': {'precision': 0.8677563151046266,
  'recall': 0.7292351331802681,
  'f1-score': 0.773343137223869,
  'support': 13108},
 'weighted avg': {'precision': 0.8889541903952152,
  'recall': 0.8937290204455295,
  'f1-score': 0.8815429899760072,
  'support': 13108}}

In [11]:
f1_score = classification_report(y_true = Y_test.values[:,1], y_pred = y_pred[:,1], output_dict=True)['weighted avg']['f1-score']
precision = classification_report(y_true = Y_test.values[:,1], y_pred = y_pred[:,1], output_dict=True)['weighted avg']['precision']
recall = classification_report(y_true = Y_test.values[:,1], y_pred = y_pred[:,1], output_dict=True)['weighted avg']['recall']

# Model Evaluation

In [11]:
def evaluate_model(model, X_test, Y_test, category_names):
    best_model = model.best_estimator_
    Y_pred = best_model.predict(X_test)
    for i in range(Y_test.shape[1]):
        clf_result = classification_report(y_true = Y_test.values[:,i], y_pred = Y_pred[:,i], output_dict=True)['weighted avg']
        f1_score = clf_result['f1-score']
        precision = clf_result['precision']
        recall = clf_result['recall']
        print(f"For the column {category_names[i]}, f1_score is {f1_score} ,precison is {precision}, recall is {recall}")
    



In [12]:
evaluate_model(model, X_test, Y_test, category_name)

  _warn_prf(average, modifier, msg_start, len(result))


For the column related, f1_score is 0.6560454262548845 ,precison is 0.5772428992760689, recall is 0.7597650289899298
For the column request, f1_score is 0.7533628084552239 ,precison is 0.6894522619948785, recall is 0.8303326212999694
For the column offer, f1_score is 0.9932534742527778 ,precison is 0.9910181235134996, recall is 0.9954989319499542
For the column aid_related, f1_score is 0.4350794307617319 ,precison is 0.7316757923654328, recall is 0.5855965822398536
For the column medical_help, f1_score is 0.880292622117604 ,precison is 0.8446658876017823, recall is 0.9190570643881599
For the column medical_products, f1_score is 0.9275981779142335 ,precison is 0.905023885973843, recall is 0.9513274336283186
For the column search_and_rescue, f1_score is 0.9595618388804907 ,precison is 0.9465680773301942, recall is 0.9729173024107415
For the column security, f1_score is 0.9748955532841066 ,precison is 0.9667144035388173, recall is 0.9832163564235581
For the column military, f1_score is 0.

In [14]:
pickle.dump(model.best_estimator_, open('test_model.pkl', 'wb'))

In [15]:
pickled_model = pickle.load(open('test_model.pkl', 'rb'))
pickled_model.predict(X_test)

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])