## Initial setup
Set up imports and read in data

In [1]:
# Libraries
from collections import namedtuple
import nltk
import pandas as pd
import pickle
import re
from sqlalchemy import create_engine

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder

# Fetch the NLTK resources the tokenizer cell relies on. When a package is
# already present locally, nltk just logs "already up-to-date".
nltk.download('punkt')      # presumably the models behind nltk.word_tokenize — confirm
nltk.download('stopwords')  # word lists read via nltk.corpus.stopwords in tokenize()
nltk.download('wordnet')    # data used by WordNetLemmatizer in tokenize()
nltk.download('words')      # NOTE(review): not referenced in the visible cells — confirm it is still needed

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\clone\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\clone\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\clone\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\clone\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
# Data preparation
# Read every row of the scored_messages table from the local SQLite file.
# Rows are ordered by message text, presumably so the row order (and therefore
# the seeded train/test split further down) is repeatable between runs.
engine = create_engine('sqlite:///data/DisasterResponse.db')
qry = "SELECT * FROM scored_messages ORDER BY message"
try:
    df = pd.read_sql(qry, engine)
finally:
    # Release the engine's connections even when the read fails.
    engine.dispose()
print("Data ingested from database")

# Positional split: the first three columns are the model inputs (the pipeline
# selects 'message', 'genre' and 'translated' from them by name) and every
# remaining column is treated as a target label.
# NOTE(review): assumes the table keeps the three input columns first — confirm.
X = df.iloc[:, :3]
y = df.iloc[:, 3:]

Data ingestd from database


In [3]:
# Tokenizer to use in pipeline
def tokenize(text):
    """Turn a string into lemmatized tokens.

    Args:
        text (str): raw message text.

    Returns:
        list[str]: lower-cased alphanumeric tokens with English stopwords
        removed, each lemmatized as a noun, then a verb, then an adjective.
    """

    # Basic cleaning: lower-case, then replace every character that is not
    # a-z or 0-9 with a space so only plain alphanumeric words remain.
    text = text.lower()
    text = re.sub(r"[^a-z0-9]", " ", text)

    # Get word tokens and remove stopwords. A frozenset gives constant-time
    # membership checks instead of scanning the stopword list for every token.
    # Tokens need no strip(): the cleaned text contains no other whitespace.
    swrds = frozenset(nltk.corpus.stopwords.words('english'))
    tokens = [tk for tk in nltk.word_tokenize(text) if tk not in swrds]

    # Lemmatize thoroughly: feed each token through the lemmatizer once per
    # part of speech ('n' is the lemmatizer's default, made explicit here).
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    lem_tok = []
    for tok in tokens:
        for pos in ('n', 'v', 'a'):
            tok = lem.lemmatize(tok, pos=pos)
        lem_tok.append(tok)

    return lem_tok


In [None]:
# Note, this ends up not being needed
class ColSelect(BaseEstimator, TransformerMixin):
    """Transformer for splitting data by type (column selection).

    Not used by the pipeline built in text_gscv, which selects columns with
    ColumnTransformer instead.

    Args:
        cols (list): column labels to select from the input.
        vect (bool): when True, return only the first listed column using
            single-label indexing (e.g. a Series when X is a pandas
            DataFrame), the shape a text vectorizer expects; when False,
            return all listed columns.
    """
    def __init__(self, cols, vect=False):
        # Keep constructor arguments under their own names and unmodified:
        # BaseEstimator.get_params() and clone() look parameters up by name,
        # so private aliases (_cols/_vect) break cloning inside GridSearchCV.
        self.cols = cols
        self.vect = vect

    def fit(self, X, y=None):
        """Nothing to learn; return self so the transformer can be chained."""
        return self

    def transform(self, X, y=None):
        """Return the configured column(s) of X."""
        # NOTE: This may not be stable, but works here
        if self.vect:
            return X[self.cols[0]]
        return X[self.cols]

In [4]:
def text_gscv(random_state=2290):
    """Generate a GridSearchCV object for a text classification pipeline.

    Args:
        random_state (int or None): seed handed to the random forest so
            repeated fits on the same data give the same model. The default
            matches the seed used for the train/test split; pass None to
            get unseeded forests.

    Returns:
        GridSearchCV: an unfitted 5-fold grid search wrapping the pipeline.
    """

    # Text branch: bag-of-words counts from the custom tokenizer, then TF-IDF.
    msgs = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer())
    ])

    pipeline = Pipeline([
        ('feats', ColumnTransformer([
            ('msgs', msgs, 'message'),
            # Ignore categories not seen during fit instead of raising, so a
            # CV fold or new data with an unseen value can still be scored.
            ('ohe', OneHotEncoder(handle_unknown='ignore'),
             ['genre', 'translated'])
        ], remainder='drop')),
        # One random forest is fitted per target column.
        ('clss', MultiOutputClassifier(
            RandomForestClassifier(random_state=random_state)))
    ])

    # Each entry currently lists a single value, so the search fits just one
    # configuration; the wider ranges are kept in the trailing comments.
    parameters = {
        'clss__estimator__min_samples_leaf': range(1,2), #range(1,4),
        'clss__estimator__min_samples_split': range(2,3), #range(2,5),
        'clss__estimator__n_estimators': [10], #[10, 25, 50, 100, 120],
        'feats__msgs__tfidf__smooth_idf': [False], #[False, True],
        'feats__msgs__vect__ngram_range': [(1,1)], #[(1,1), (1,2)],
    }

    gscv = GridSearchCV(pipeline, parameters, cv=5)

    return gscv

In [5]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2290,
)

# Build the grid search and fit it on the training portion only. fit() is the
# cell's last expression, so the fitted estimator's repr is displayed.
model = text_gscv()
model.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('feats',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('msgs',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('vect',
                                                                                          CountVectorizer(analyzer='word',
                                                                                                          binary=False,
                          

In [7]:
# Predict every target label for the held-out rows; the scoring cell indexes
# the result one label at a time as y_pred[:, col].
y_pred = model.predict(X_test)

In [8]:
# Collect one row of summary metrics per target label.
scor_lst = []
for idx, label in enumerate(y_test.columns):
    # Compare the true and predicted values for this label only.
    report = classification_report(
        y_test.iloc[:, idx], y_pred[:, idx], output_dict=True)
    macro = report['macro avg']
    scor_lst.append([
        label,
        report['accuracy'],
        macro['precision'],
        macro['recall'],
        macro['f1-score'],
    ])

scor_df = pd.DataFrame(
    scor_lst,
    columns=['class', 'accuracy', 'precision', 'recall', 'f1-score'],
)
# Display only: the sorted view is not assigned back, so scor_df keeps its order.
scor_df.sort_values('f1-score', ascending=False)

  'precision', 'predicted', average, warn_for)


Unnamed: 0,class,accuracy,precision,recall,f1-score
31,earthquake,0.963484,0.933575,0.84601,0.883603
27,weather_related,0.860199,0.845413,0.790049,0.810972
1,request,0.893659,0.84901,0.735279,0.774578
10,food,0.924064,0.876825,0.69873,0.752342
29,storm,0.933995,0.8658,0.690741,0.744383
3,aid_related,0.75508,0.753242,0.73349,0.738501
28,floods,0.943774,0.912952,0.66249,0.724519
11,shelter,0.933843,0.859636,0.658484,0.712095
9,water,0.950497,0.901776,0.649739,0.710594
34,direct_report,0.85424,0.788482,0.665032,0.698086


In [18]:
# Bundle the per-label score table with the fitted grid search and write the
# pair to a single pickle file in the working directory.
ModelOutput = namedtuple('ModelOutput', ['class_rept', 'mod_obj'])
mod_out = ModelOutput(class_rept=scor_df, mod_obj=model)

with open('text_scorer.pkl', 'wb') as pkl_file:
    pickle.dump(mod_out, pkl_file)