In [16]:
# import libraries
import os
import re

import pandas as pd
from sqlalchemy import create_engine
import sqlite3 as sql

import nltk
from nltk import pos_tag, ne_chunk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.metrics import classification_report

In [17]:
# load data from database
conn = create_engine('sqlite:///C:/Users/gradi/Documents/projects/machine_learning/disaster_response_model/data/DisasterResponse.db')
df = pd.read_sql_table('messages', conn)

X = df['message'].values
y = df.iloc[:, 4:].values

In [18]:
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# tokenization function
def tokenize(text):
    clean_text = text.lower() # convert all chars to lower case
    clean_text = re.sub(r"[^a-zA-Z0-9]", " ", clean_text) # remove non alpha-numeric characters
    clean_text = re.sub(' +', ' ', clean_text) # remove duplicate spaces
    
    # tokenize text
    words = word_tokenize(clean_text)
    words = [w for w in words if w not in stopwords.words("english")]
    
    # reduce words to their stems
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(w) for w in words]
    
    # reduce words to root form
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(w) for w in stemmed]
    
    return clean_tokens

In [20]:
def build_model():
    # build pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer(smooth_idf=False)),
        ('clf', RandomForestClassifier())
    ])
    
    # define parameters
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (2, 3)],
        'clf__max_depth': [None, 4, 8],
        'clf__n_estimators': [50, 100, 200]
    }

    # create grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters)
    
    return cv

In [None]:
# instantiate model and fit
model = build_model()
model.fit(X_train, y_train)

In [None]:
# predict on test data
y_pred = model.predict(X_test)

# print model results
print(classification_report(y_test, y_pred, target_names=labels))

In [None]:
model.best_params_

In [None]:
model.best_score_

In [None]:
class QuestionMarkCount(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # code here to transform data
        X_qcount = pd.Series(X).apply(lambda x: x.count('?'))
        
        return pd.DataFrame(X_qcount)

In [None]:
class ExclamationPointCount(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # code here to transform data
        X_expointcount = pd.Series(X).apply(lambda x: x.count('!'))
        
        return pd.DataFrame(X_expointcount)

In [None]:
class CapitalCount(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # code here to transform data
        X_capitalcount = pd.Series(X).apply(lambda text: sum(1 for c in text if c.isupper()))
        
        return pd.DataFrame(X_capitalcount)

In [None]:
class WordCount(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # code here to transform data
        X_wordcount = pd.Series(X).apply(lambda x: len(x.split()))
        
        return pd.DataFrame(X_wordcount)

In [72]:
def build_model_v2():
    # build pipeline
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('textpipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize, ngram_range=(1,2))),
                ('tfidf', TfidfTransformer(smooth_idf=False)),
            ])),
            ('qmark_count', QuestionMarkCount()),
            ('expoint_count', ExclamationPointCount()),
            ('capital_count', CapitalCount()),
            ('word_count', WordCount())
        ])),
        ('clf', RandomForestClassifier(n_estimators=200))
    ])
    
    # define parameters
    parameters = {
        'features__transformer_weights': (
            {'text_pipeline': 0.6, 'word_count': 0.1, 'qmark_count': 0.1, 'expoint_count': 0.1, 'capital_count': 0.1},
            {'text_pipeline': 0.8, 'word_count': 0.05, 'qmark_count': 0.05, 'expoint_count': 0.05, 'capital_count': 0.05},
            {'text_pipeline': 0.95, 'word_count': 0.0125, 'qmark_count': 0.0125, 'expoint_count': 0.0125, 'capital_count': 0.0125}
        )
    }

    # create grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters)
    
    return cv

In [73]:
# instantiate model and fit
print('Building model...')
model_v2 = build_model_v2()

print('Fitting model...')
model_v2.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('textpipeline',
                                                                        Pipeline(steps=[('vect',
                                                                                         CountVectorizer(ngram_range=(1,
                                                                                                                      2),
                                                                                                         tokenizer=<function tokenize at 0x000001F66638D670>)),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer(smooth_idf=False))])),
                                                                       ('qmark_count',
                                              

In [74]:
print('Validating model...')
# predict on test data
y_pred = model_v2.predict(X_test)

# print model results
print(classification_report(y_test, y_pred, target_names=labels))

                        precision    recall  f1-score   support

               request       0.87      0.67      0.75       733
                 offer       0.00      0.00      0.00         1
           aid_related       0.86      0.65      0.74       801
          medical_help       0.33      0.01      0.02       111
      medical_products       0.00      0.00      0.00        71
     search_and_rescue       0.00      0.00      0.00        46
              security       0.00      0.00      0.00        24
              military       0.00      0.00      0.00        12
           child_alone       0.00      0.00      0.00         0
                 water       0.96      0.62      0.75       153
                  food       0.94      0.68      0.79       312
               shelter       0.87      0.33      0.48       211
              clothing       0.00      0.00      0.00        15
                 money       0.00      0.00      0.00        22
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
model_v2.best_params_

{'features__transformer_weights': {'text_pipeline': 0.8,
  'word_count': 0.05,
  'qmark_count': 0.05,
  'expoint_count': 0.05,
  'capital_count': 0.05}}

In [None]:
model_v2.best_score_

Trying with an AdaBoostClassifier

In [96]:
def build_model_v3():
    # build pipeline
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('textpipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize, ngram_range=(1,2))),
                ('tfidf', TfidfTransformer(smooth_idf=False)),
            ])),
            ('qmark_count', QuestionMarkCount()),
            ('expoint_count', ExclamationPointCount()),
            ('capital_count', CapitalCount()),
            ('word_count', WordCount())
        ])),
        ('clf', MultiOutputClassifier(AdaBoostClassifier()))
    ])
    
    # define parameters
    parameters = {
        'clf__estimator__learning_rate': [0.8, 1.0, 1.4],
        'clf__estimator__n_estimators': [50, 100, 200],
        'features__transformer_weights': [{'text_pipeline': 0.8, 'word_count': 0.05, 'qmark_count': 0.05, 'expoint_count': 0.05, 'capital_count': 0.05}]
    }

    # create grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters)
    
    return cv

In [97]:
# instantiate model and fit
print('Building model v3...')
model_v3 = build_model_v3()

print('Fitting model v3...')
model_v3.fit(X_train, y_train)

Building model v3...
Fitting model v3...


GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('textpipeline',
                                                                        Pipeline(steps=[('vect',
                                                                                         CountVectorizer(ngram_range=(1,
                                                                                                                      2),
                                                                                                         tokenizer=<function tokenize at 0x000001F66638D670>)),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer(smooth_idf=False))])),
                                                                       ('qmark_count',
                                              

In [98]:
print('Validating model...')
# predict on test data
y_pred = model_v3.predict(X_test)

# print model results
print(classification_report(y_test, y_pred, target_names=labels))

Validating model...
                        precision    recall  f1-score   support

               request       0.83      0.67      0.74       733
                 offer       0.00      0.00      0.00         1
           aid_related       0.82      0.69      0.75       801
          medical_help       0.72      0.26      0.38       111
      medical_products       0.75      0.42      0.54        71
     search_and_rescue       0.29      0.04      0.08        46
              security       0.00      0.00      0.00        24
              military       0.50      0.17      0.25        12
           child_alone       0.00      0.00      0.00         0
                 water       0.92      0.88      0.90       153
                  food       0.90      0.87      0.88       312
               shelter       0.78      0.69      0.73       211
              clothing       0.38      0.20      0.26        15
                 money       0.29      0.18      0.22        22
        missing_peo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [99]:
print('Best model parameters...')
model_v3.best_params_

Best model parameters...


{'clf__estimator__learning_rate': 0.8,
 'clf__estimator__n_estimators': 50,
 'features__transformer_weights': {'text_pipeline': 0.8,
  'word_count': 0.05,
  'qmark_count': 0.05,
  'expoint_count': 0.05,
  'capital_count': 0.05}}

In [12]:
def build_model_v4():
    # build pipeline
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('textpipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize, ngram_range=(1,2))),
                ('tfidf', TfidfTransformer(smooth_idf=False)),
            ])),
            ('qmark_count', QuestionMarkCount()),
            ('expoint_count', ExclamationPointCount()),
            ('capital_count', CapitalCount()),
            ('word_count', WordCount())
        ])),
        ('clf', MultiOutputClassifier(GradientBoostingClassifier()))
    ])
    
    # define parameters
    parameters = {
        'clf__estimator__max_depth': [3, 5, 8],
        'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__learning_rate': [0.08, 0.1, 0.2],
        'features__transformer_weights': [{'text_pipeline': 0.8, 'word_count': 0.05, 'qmark_count': 0.05, 'expoint_count': 0.05, 'capital_count': 0.05}]
    }

    # create grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters)
    
    return cv

In [13]:
# instantiate model and fit
print('Building model v4...')
model_v4 = build_model_v4()

print('Fitting model v4...')
model_v4.fit(X_train, y_train)

Building model v4...
Fitting model v4...


GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('textpipeline',
                                                                        Pipeline(steps=[('vect',
                                                                                         CountVectorizer(ngram_range=(1,
                                                                                                                      2),
                                                                                                         tokenizer=<function tokenize at 0x000002019B0B7E50>)),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer(smooth_idf=False))])),
                                                                       ('qmark_count',
                                              

In [15]:
print('Best model v4 parameters...')
model_v4.best_params_

Best model parameters...


{'clf__estimator__learning_rate': 0.08,
 'clf__estimator__max_depth': 5,
 'clf__estimator__n_estimators': 50,
 'features__transformer_weights': {'text_pipeline': 0.8,
  'word_count': 0.05,
  'qmark_count': 0.05,
  'expoint_count': 0.05,
  'capital_count': 0.05}}

In [14]:
print('Validating model v4...')
# predict on test data
y_pred = model_v4.predict(X_test)

# print model results
print(classification_report(y_test, y_pred, target_names=labels))

Validating model v4...
                        precision    recall  f1-score   support

               request       0.83      0.67      0.74       733
                 offer       0.00      0.00      0.00         1
           aid_related       0.84      0.68      0.76       801
          medical_help       0.74      0.25      0.38       111
      medical_products       0.68      0.42      0.52        71
     search_and_rescue       0.19      0.07      0.10        46
              security       0.08      0.08      0.08        24
              military       0.00      0.00      0.00        12
                 water       0.89      0.91      0.90       153
                  food       0.88      0.92      0.90       312
               shelter       0.82      0.67      0.74       211
              clothing       0.82      0.60      0.69        15
                 money       0.30      0.32      0.31        22
        missing_people       0.13      0.12      0.13        16
              re

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Measure Reported: weighted averages

| Model | Fitting | Precision | Recall | F1 Precision |
| :---- | :---- | :-------: | :----: | :----------: |
| Random Forrest Classifier | single fit | 0.76 | 0.55 | 0.61 |
| Random Forrest Classifier | grid search cross-validation | 0.74 | 0.46 | 0.55 |
| Random Forrest Classifier | previous + 4 extra features | 0.74 | 0.46 | 0.54 |
| AdaBoost Classifier | as previous | 0.74 | 0.58 | 0.64 |
| Gradient Boosting Classifier | as previous | 0.75 | 0.59 | 0.65 |

In [16]:
def build_model_final():
    # build pipeline
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('textpipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize, ngram_range=(1,2))),
                ('tfidf', TfidfTransformer(smooth_idf=False)),
            ])),
            ('qmark_count', QuestionMarkCount()),
            ('expoint_count', ExclamationPointCount()),
            ('capital_count', CapitalCount()),
            ('word_count', WordCount())
        ])),
        ('clf', MultiOutputClassifier(GradientBoostingClassifier(max_depth=5, n_estimators=50, learning_rate=0.08)))
    ])
    
    # define parameters
    parameters = {
        'features__transformer_weights': [{'text_pipeline': 0.8, 'word_count': 0.05, 'qmark_count': 0.05, 'expoint_count': 0.05, 'capital_count': 0.05}]
    }

    # create grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters)
    
    return cv

In [17]:
import pickle

In [19]:
# instantiate model and fit
print('Building finsl model...')
final_model = build_model_final()

print('Fitting final model...')
final_model.fit(X_train, y_train)

# save model to disk
print('Saving final model to disk...')
filename = 'disaster_response_model.sav'
pickle.dump(final_model, open(filename, 'wb'))

Building finsl model...
Fitting final model...
Saving final model to disk...


In [20]:
os.getcwd()

'C:\\Users\\gradi\\Documents\\projects\\machine_learning\\disaster_response'

In [40]:
for label in labels:
    category_rows_mask = df[label] == 1
    category_df = df[category_rows_mask]
    
    category_size = category_df.shape[0]
    
    if category_size > 0:
        
        sample_size = 5
        if category_size < 20:
            sample_size = int(category_size / 4)
            if sample_size < 1:
                sample_size = 1

        print("{} ({}) \n----".format(label, category_size))

        sample = category_df['message'].sample(sample_size)
        
        for index, text in sample.iteritems():
            print("{}:\t{}".format(index, text))
              
        print('\n\n')

request (3607) 
----
2683:	i m still waiting for your help. .. i'm starving, please bring me food
4686:	THERE IS A MISTAKE IN THE FOOD DISTRIBUTION,SOME PEOPLE GIVE CARDS TO ONLY TO PEOPLE THEY KNOW..!! I HAVE TO BEG OTHER PEOPLE SO THEY CAN EAT,IT'S NOT FAIR. 
2618:	hello we are in ile a vache. in the trou milieu area. we have 13 people 2 babys among them
879:	SORRY I GOT NOTHING TO HEAR NO POWER NO RADIO ONLY MY CELLPHONE PLEASE WRITE ME OR CALL ME I NEED YOUR HELP
3928:	Oh my Gosh, we are dying with hunger and thirst in LIlavois 47. 



offer (10) 
----
255:	How can we help the victims at Les Cayes?
3573:	i want to give blood where do I go 



aid_related (3931) 
----
4759:	Carrefour Feuilles needs food, drinking water and tents. 
2465:	We did not find any help in La Grenade, we still have people under the Rubble. We have no food and water.
570:	IN MY CITY. WE WANTED YOUR HELP PLEASE WE NEED OF THE FOODS, WATERS. WEARS. HOUSES. BEACAUSE OURS HOUSES IS DESTROYED BY THE CATASTROPH. We