In [1]:
# import libraries
import sys
import re
import time
import pickle

import pandas as pd
from sqlalchemy import create_engine

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used for tokenizing, stop-word removal and
# lemmatizing; the download order matches the original cell.
for nltk_package in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

# English stop-word list, kept at module level for later cells.
stop_words = stopwords.words("english")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Create a pipeline with a different model: a support vector machine (SVM)

# Load the full DisasterResponse table from the SQLite database.
engine = create_engine("sqlite:///DisasterResponse.db")
query = "SELECT * FROM DisasterResponse"  ## limit 1000 WHERE related <> 2
# The context manager closes the connection even when the read raises,
# so a failed query no longer leaks an open connection.
with engine.connect() as connection:
    df = pd.read_sql(query, connection)

# Keep only the predictors in the X
predictors = ["message"]
X = df[predictors].message.values
print("Dimensions of X are:", X.ndim)
print("Shape of X is", X.shape)
print("Size of X is", X.size)

# keep only the 35 response variables in y; dropped child_alone since all values are 0
y = df.loc[:, ~df.columns.
           isin(['id', 'message', 'original', 'genre', 'child_alone'])]
# (A bare `y.head()` used to sit here; mid-cell its result was discarded,
# so it displayed nothing and has been removed.)
print("Dimensions of y are:", y.ndim)
print("Shape of y is", y.shape)
print("Size of y is", y.size)


def tokenize(text):
    """Turn a raw message into a list of cleaned, lemmatized tokens.

    Steps:
      1. Lower-case the text and replace every character that is not an
         ASCII letter or digit with a space.
      2. Split into tokens with ``word_tokenize``.
      3. Drop English stop words.
      4. Lemmatize each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Build the stop-word collection ONCE per call. The previous version
    # evaluated stopwords.words("english") inside the filter, i.e. once per
    # token, and tested membership against a list. A set gives the same
    # filtering result with constant-time lookups.
    english_stop_words = set(stopwords.words("english"))

    lemmatizer = WordNetLemmatizer()

    # Remove stop words and lemmatize; lower()/strip() are retained from the
    # original so the produced tokens are unchanged.
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
        if tok not in english_stop_words
    ]


# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=42)

# Report dimensionality, shape and element count of every partition.
split_report = (
    ("X_train Dim are:", X_train),
    ("y_train Dim are:", y_train),
    ("X_test  Dim are:", X_test),
    ("y_test  Dim are:", y_test),
)
for split_label, split_part in split_report:
    print(split_label, split_part.ndim, "Shape=", split_part.shape, "Size =",
          split_part.size)

# Define pipeline for SVC: bag-of-words counts -> TF-IDF -> scaling ->
# one SVC per output column (MultiOutputClassifier).
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    # The TF-IDF step emits a sparse matrix; StandardScaler cannot center
    # sparse input, so with_mean=False is set at construction. Previously the
    # pipeline only worked because the parameter grid overrode the default.
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', MultiOutputClassifier(SVC(gamma='auto'))),
])

# List every tunable parameter name exposed by the pipeline.
print(pipeline.get_params().keys())

# Parameters for the pipeline with the SVM model
parameters = {
    # Kept for backward compatibility with the original grid; it matches the
    # value now set on the scaler itself.
    'scaler__with_mean': [False],
    'svc__estimator__C': [1.0],  # Regularization parameter
#    'svc__n_jobs': [2],  # Number of CPU cores used when parallelizing over classes
    # verbose=1 makes each underlying SVC fit emit a "[LibSVM]" progress line.
    'svc__estimator__verbose': [1]
}
# define GridSearchCV
cv = GridSearchCV(pipeline, param_grid=parameters)

# Fit the pipeline with the SVM model and measure the wall-clock time.
starttm = time.time()
cv.fit(X_train, y_train)
endtm = time.time()
# time.time() differences are already in seconds. The previous version
# multiplied by 10**6 (microseconds) while still labelling the value
# "seconds", overstating the fit time by a factor of one million.
execTmsec = endtm - starttm
print("execution time for the fit=", execTmsec, "seconds")

# What are the best parameters? A bare expression in the middle of a cell
# is evaluated and discarded, so the value has to be printed to be shown.
print("\n best parameters:", cv.best_params_)

# What are the overall cross-validation results from the new model?
# Transposed so each metric is a row, which is readable in text output.
print("\n cross-validation results:\n", pd.DataFrame(cv.cv_results_).T)

# Use the pipeline with GridSearch to make predictions on test data
y_pred = cv.predict(X_test)

# score on training dataset
print("\n score on training dataset:", cv.score(X_train, y_train))

# score on test dataset
print("\n score on test dataset:", cv.score(X_test, y_test))

Dimensions of X are: 1
Shape of X is (26216,)
Size of X is 26216
Dimensions of y are: 2
Shape of y is (26216, 35)
Size of y is 917560
X_train Dim are: 1 Shape= (19662,) Size = 19662
y_train Dim are: 2 Shape= (19662, 35) Size = 688170
X_test  Dim are: 1 Shape= (6554,) Size = 6554
y_test  Dim are: 2 Shape= (6554, 35) Size = 229390
dict_keys(['memory', 'steps', 'vect', 'tfidf', 'scaler', 'svc', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'svc__estimator__C', 'svc__estimator__cache_size', 'svc__estimator__class_weight', 'svc__estimator__coef0', 'svc__estimator__decision_function_sha



[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]