# ML Pipeline Preparation


In [28]:
import os
import re
import pickle

import pandas as pd
from sqlalchemy import create_engine

import nltk

nltk.download(['punkt', 'wordnet', 'stopwords'])
    
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize,sent_tokenize

from sklearn.base import clone
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
def build_pipeline(**kwargs):
    """
    Builds machine learning pipeline, sets parameters for components if given.
    :param text: free text
    :return tokenized words
    """
    
    tf_args = {}
    clf_args = {}
    vect_args = {}
    
    classifier = kwargs.get('classifier', DecisionTreeClassifier())

    for key, value in kwargs.items() :
        if key.startswith('tf'):
            tf_args.update({key.split('__')[-1] :  value})
        elif key.startswith('clf'):
            clf_args.update({key.split('__')[-1] :  value})
        elif key.startswith('vect'):
            vect_args.update({key.split('__')[-1] :  value})            
            
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize, **vect_args)),
                         ('tfidf', TfidfTransformer(**tf_args)),
                         ('clf', MultiOutputClassifier(classifier.set_params(**clf_args)))
                        ])

    return pipeline


In [41]:
def load_model(filename):
    """
    Returns model dumped in pickle file
    :param filename: model pickle file
    :return model in pickle
    """
    return pickle.load(open(filename, 'rb'))

In [42]:
# load data from database
def load_data(dbname, tablename='messages'):
    """
    Loads data saved in db.table
    :param dbname: sql db name
    :param tablename: sql table name
    :return input and label data as tuple
    """
    engine = create_engine('sqlite:///{}'.format(dbname))
    df = pd.read_sql_table(tablename, con=engine)
    X = df['message']
    Y = df.drop(['id','original','message', 'genre'], axis=1)
    return X, Y


In [43]:
def tokenize(text):
    """
    Removes special characters, stopwords.First lemmatizes and then stems the tokens
    :param text: free text
    :return tokenized words
    """
    
    text = re.sub('[^(a-zA-Z0-9)]',' ',text.lower())
    words = word_tokenize(text)
    clean= [word for word in words if word not in stopwords.words("english") ]
    lemmed = [WordNetLemmatizer().lemmatize(w) for w in clean]
    stemmed = [PorterStemmer().stem(w) for w in lemmed]
    
    return stemmed

In [44]:
def get_best_estimator(pipeline, param_grid, X_train, Y_train):
    """
    Implements grid search for given param_grid on pipeline
    :param pipeline: sklearn pipeline
    :param param_grid: Dictionary with parameters names and values
    :param X_train: Training Input Data
    :param Y_train: Training Label Data
    :return best estimator
    """

    cv = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)
    cv.fit(X_train, Y_train)

    print("Grid Search results for pipeline:\n best parameters : {}".format(cv.best_params_))
    return cv.best_estimator_


In [45]:
def report_scores(pipeline, X_test, Y_test):
    """
    Runs the model on test dataset
    Calculates f1 score, precision and recall for each category
    :param pipeline: trained ML pipeline
    :param X_test: Test Input Data
    :param Y_test: Test Label Data
    :returns pandas dataframe with metrics for each class of each category
    
    """
    
    Y_preds = pipeline.predict(X_test)
    Y_preds = pd.DataFrame(Y_preds, columns=Y_test.columns)
    
    report = []
    for col in Y_test.columns.tolist():
        report.append({
            'category': col,
            'precision': precision_score(Y_test[col], Y_preds[col], average='micro'),
            'recall': recall_score(Y_test[col], Y_preds[col], average='micro'),
            'f1_score': f1_score(Y_test[col], Y_preds[col], average='micro')})

    df_report = pd.DataFrame(report) 
    return df_report

In [35]:
def create_pickle(model, filename):
#     filename = 'final_model.sav'
    pickle.dump(model, open(filename, 'wb'))


In [36]:
#  main 
X, Y = load_data('data/disaster.db', 'messages')

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=23)

In [46]:
# grid search takes significant time to find best estimator
# parameters = {
#         'tfidf__norm': ['l1', 'l2'],
#         'clf__estimator__max_depth': [12, None]  
# }

# pipeline_0 = build_pipeline()

# pipeline = get_best_estimator(pipeline=pipeline_0, param_grid=parameters, X_train=X_train, Y_train=Y_train)

In [25]:
best_params = {'clf__estimator__max_depth': 12, 'tfidf__norm': 'l2'}
pipeline = build_pipeline(**best_params)

pipeline.fit(X_train, Y_train)
df_report_dtree = report_scores(pipeline, X_test, Y_test)

In [14]:
# saves report as csv
# df_report_dtree.to_csv('decision_tree_improved_report.csv')

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [15]:
pipeline_2 = clone(pipeline)
pipeline_2.steps.pop(2)
pipeline_2.steps.append(['clf_new',MultiOutputClassifier(RandomForestClassifier(random_state=13, n_estimators=50))])

pipeline_2.fit(X_train, Y_train)
df_report_rf = report_scores(pipeline_2, X_test, Y_test)

In [19]:
# read previously generated report for improved pipeline with decision tree classifier
df_report_dtree = pd.read_csv('decision_tree_improved_report.csv')

In [20]:
merged = df_report_dtree.merge(df_report_rf, on='category')
merged.drop('Unnamed: 0' , axis=1, inplace=True)

In [21]:
merged['diff_f1'] = merged.f1_score_y-merged.f1_score_x
merged['diff_precision'] = merged.precision_y-merged.precision_x
merged['diff_recall'] = merged.recall_y-merged.recall_x


In [22]:
merged.sum()

category          relatedrequestofferaid_relatedmedical_helpmedi...
f1_score_x                                                  34.0337
precision_x                                                 34.0337
recall_x                                                    34.0337
f1_score_y                                                   34.126
precision_y                                                  34.126
recall_y                                                     34.126
diff_f1                                                   0.0923077
diff_precision                                            0.0923077
diff_recall                                               0.0923077
dtype: object

In [24]:
create_pickle(pipeline_2, 'model.pickle')