# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [2]:
# import necessary libraries to load data from the database
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

def load_data(sqlite_path, table_name='response_table'):
    """
    Load the messages and their category labels from a SQL database.

    Input:
    sqlite_path (str): SQLAlchemy database URL, e.g. 'sqlite:///DisasterResponse.db'
    table_name (str): name of the table to read (defaults to 'response_table')

    Output:
    X: numpy array holding the 'message' column (the text to classify)
    Y: dataframe holding every column from the fifth one onwards (the targets)
    """
    engine = create_engine(sqlite_path)
    try:
        df = pd.read_sql_table(table_name, con=engine)
    finally:
        # the table is fully loaded in memory, so release the engine's connections
        engine.dispose()
    X = df.loc[:, 'message'].values
    Y = df.iloc[:, 4:]

    return X, Y

In [3]:
sqlite_path = 'sqlite:///DisasterResponse.db'
X,Y = load_data(sqlite_path)

### 2. Write a tokenization function to process your text data

Look up a complicated text to work on:
- create a dictionary keeping all urls
- look up a complicated text
- test our code on this text

In [4]:
# import necessary libraries
import re
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download(['punkt', 'words', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
def tokenize(text):
    """
    Input:
    a text string found in each record (str)
    Output:
    a list of stems (list of str)

    Description:
    Function that creates stems - word tokens
    1. replaces urls with the 'url' string
    2. lower-cases the text and replaces every non-alphanumeric character with a white space
    3. splits the text into words and drops the English stop words
    4. reduces words to their root form, lemmatizing every word as a verb
    5. reduces words to their stems - not necessarily words to be understood by humans

    Note: an earlier version also ran a part-of-speech tagger over the words, but
    the tags were never used (only the word itself was lemmatized), so that pass
    has been dropped; the returned stems are the same.
    """
    # regex pattern to identify an url (raw string, so '\(' is not read as a string escape)
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # replace urls with a 'url' string
    text = re.sub(url_regex, 'url', text)
    # text normalization - remove punctuation and lower case
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # load the stop words once per call instead of once per token
    stop_words = set(stopwords.words("english"))
    # tokenize text to words
    words = [w for w in word_tokenize(text) if w not in stop_words]
    # build the lemmatizer and the stemmer once and reuse them for every word
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # lemmatize each word as a verb, then reduce it to its stem
    stemmed = [stemmer.stem(lemmatizer.lemmatize(w, pos='v')) for w in words]

    return stemmed

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [6]:
#import necessary libraries
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score, make_scorer

import time

In [8]:
# Text classification pipeline: token counts -> TF-IDF weights -> random forest.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [7]:
def split_n_train(X, Y, test_size=0.4, random_state=None):
    """
    input:
    X: numpy array keeping the text to be classified
    Y: multi-label dataframe holding one column per category to classify
    test_size: fraction of the rows held out for testing (default 0.4)
    random_state: optional seed forwarded to train_test_split so that the
        split can be reproduced (default None keeps the split random)

    output:
    fitted: the pipeline trained on the training split
    X_train: part of the X array used for training (60% with the default test_size)
    X_test: remaining part of the X array, kept for testing
    y_train: rows of the Y dataframe matching X_train
    y_test: rows of the Y dataframe matching X_test

    Description
    splits the data and trains a CountVectorizer -> TF-IDF -> RandomForest
    pipeline on the training part, printing the training time in minutes
    """
    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier())
    ])
    # perf_counter is monotonic, so the measurement is not affected by clock adjustments
    start = time.perf_counter()
    # train classifier
    fitted = pipeline.fit(X_train, y_train)
    elapsed_minutes = (time.perf_counter() - start) / 60
    print(f"Model calculation time: {round(elapsed_minutes)} minutes")

    return fitted, X_train, X_test, y_train, y_test

In [8]:
fitted_model, X_train, X_test, y_train, y_test = split_n_train(X, Y)

Model calculation time: 2 minutes


In [9]:
# get the params of the fitted model
[key for key in fitted_model.get_params().keys()]

['memory',
 'steps',
 'vect',
 'tfidf',
 'clf',
 'vect__analyzer',
 'vect__binary',
 'vect__decode_error',
 'vect__dtype',
 'vect__encoding',
 'vect__input',
 'vect__lowercase',
 'vect__max_df',
 'vect__max_features',
 'vect__min_df',
 'vect__ngram_range',
 'vect__preprocessor',
 'vect__stop_words',
 'vect__strip_accents',
 'vect__token_pattern',
 'vect__tokenizer',
 'vect__vocabulary',
 'tfidf__norm',
 'tfidf__smooth_idf',
 'tfidf__sublinear_tf',
 'tfidf__use_idf',
 'clf__bootstrap',
 'clf__class_weight',
 'clf__criterion',
 'clf__max_depth',
 'clf__max_features',
 'clf__max_leaf_nodes',
 'clf__min_impurity_decrease',
 'clf__min_impurity_split',
 'clf__min_samples_leaf',
 'clf__min_samples_split',
 'clf__min_weight_fraction_leaf',
 'clf__n_estimators',
 'clf__n_jobs',
 'clf__oob_score',
 'clf__random_state',
 'clf__verbose',
 'clf__warm_start']

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [64]:
# Predict with the pipeline returned by split_n_train: the module-level
# `pipeline` object defined above is never fitted, only the copy built
# inside split_n_train is.
start = time.time()/60
y_pred = fitted_model.predict(X_test)
stop = time.time()/60
print(f"Predicting time: {stop - start}")

Predicting time: 0.6064376123249531


In [10]:
def eval_model(X_test, fitted_model):
    """
    input:
    X_test: the inputs to run the model on
    fitted_model: an already trained model exposing a predict method

    output:
    y_pred: whatever fitted_model.predict returns for X_test

    Description:
    runs the trained model on the test data and prints how long
    the prediction took, rounded to whole minutes
    """
    began = time.time() / 60
    predictions = fitted_model.predict(X_test)
    minutes_taken = round(time.time() / 60 - began)
    print(f"Model evaluation time: {minutes_taken} minutes")

    return predictions

In [11]:
y_pred = eval_model(X_test, fitted_model)

Model evaluation time: 1 minutes


In [50]:
def display_results(y_test, y_pred, fitted_model, *cvd):
    '''
    input:
    y_test = the true labels of the test set
    y_pred = the labels predicted for the test set
    fitted_model = the model whose parameter names are listed
    *cvd = optional searched models exposing a best_params_ attribute

    output:
    nothing is returned; the function prints
    - the mean of (y_pred == y_test), i.e. the accuracy of each predictor
    - the names of the parameters of fitted_model
    - the best parameters of every model passed in cvd
    '''
    per_label_accuracy = (y_pred == y_test).mean()

    print("Accuracy for each predictor:")
    print(per_label_accuracy)

    # list the parameter names of the fitted model
    print("Get Pipeline parameters")
    for param_name in fitted_model.get_params():
        print(param_name)

    # report the winning parameters of each cross validated model
    for searched in cvd:
        print("\nBest Parameters:", searched.best_params_)

In [43]:
display_results(y_test, y_pred, fitted_model)

Accuracy for each predictor:
request                   0.880330
offer                     0.995390
aid_related               0.726373
medical_help              0.922301
medical_products          0.952363
search_and_rescue         0.973588
security                  0.982328
military                  0.966769
water                     0.953323
food                      0.928064
shelter                   0.928064
clothing                  0.987706
money                     0.975605
missing_people            0.987803
refugees                  0.966673
death                     0.958125
other_aid                 0.867172
infrastructure_related    0.935363
transport                 0.956685
buildings                 0.951498
electricity               0.978294
tools                     0.993853
hospitals                 0.988283
shops                     0.995582
aid_centers               0.988859
other_infrastructure      0.956685
weather_related           0.845947
floods                    

### 6. Improve your model
Use grid search to find better parameters. 

In [44]:
def build_model():
    """
    Input:
    no input

    output:
    a GridSearchCV object wrapping the text classification pipeline

    Description:
    builds the CountVectorizer -> TF-IDF -> RandomForest pipeline and wraps it
    in a grid search over the number of trees in the forest (10, 50 or 100).
    The returned object is not fitted yet; call fit on it to run the search.
    """
    search_space = {'clf__n_estimators': [10, 50, 100]}

    text_clf = Pipeline(steps=[
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier()),
    ])

    return GridSearchCV(text_clf, param_grid=search_space)

In [46]:
start = time.time()/60
cross_validated = build_model()
cross_validated.fit(X_train, y_train)
stop = time.time()/60
print(f"Cross validated fitted model time: {round(stop - start)} minutes") 

Cross validated fitted model time: 35 minutes


### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [47]:
y_pred_cv = eval_model(X_test, cross_validated)

Model evaluation time: 1 minutes


In [51]:
display_results(y_test, y_pred_cv, fitted_model, cross_validated)

Accuracy for each predictor:
request                   0.891663
offer                     0.995390
aid_related               0.756339
medical_help              0.922397
medical_products          0.951306
search_and_rescue         0.973684
security                  0.982424
military                  0.967249
water                     0.950442
food                      0.928544
shelter                   0.929793
clothing                  0.986650
money                     0.975797
missing_people            0.987899
refugees                  0.966673
death                     0.958317
other_aid                 0.867941
infrastructure_related    0.935075
transport                 0.956589
buildings                 0.950730
electricity               0.978390
tools                     0.993853
hospitals                 0.988283
shops                     0.995486
aid_centers               0.988859
other_infrastructure      0.956589
weather_related           0.860642
floods                    

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [52]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier

In [53]:
# AdaBoost variant: the TF-IDF text features sit inside a FeatureUnion, and
# one AdaBoost classifier is trained per output column.
pipeline_ada = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('text_pipeline', Pipeline(steps=[
            ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
            ('tfidf_transformer', TfidfTransformer()),
        ])),
    ])),
    ('classifier', MultiOutputClassifier(AdaBoostClassifier())),
])

In [54]:
start = time.time()/60
pipeline_ada.fit(X_train, y_train)
stop = time.time()/60
print(f"Model calculation time: {round(stop - start)} minutes") 

Model calculation time: 3 minutes


In [55]:
pipeline_ada.get_params().get('classifier__estimator__learning_rate')

1.0

In [59]:
y_pred_ada = eval_model(X_test, pipeline_ada)

Model evaluation time: 1 minutes


In [60]:
display_results(y_test, y_pred_ada, pipeline_ada)

Accuracy for each predictor:
request                   0.889262
offer                     0.993853
aid_related               0.764983
medical_help              0.931041
medical_products          0.953995
search_and_rescue         0.974453
security                  0.980215
military                  0.969842
water                     0.961679
food                      0.944679
shelter                   0.944295
clothing                  0.989147
money                     0.978582
missing_people            0.986938
refugees                  0.969554
death                     0.966673
other_aid                 0.866884
infrastructure_related    0.934595
transport                 0.960334
buildings                 0.960334
electricity               0.979735
tools                     0.993085
hospitals                 0.986650
shops                     0.994814
aid_centers               0.987514
other_infrastructure      0.953515
weather_related           0.880811
floods                    

### 9. Export your model as a pickle file

In [56]:
model_filepath = 'classifier'

In [61]:
import pickle
# Write the fitted AdaBoost pipeline to `model_filepath` as a pickle file.
with open(model_filepath, 'wb') as f:
    pickle.dump(pipeline_ada, f)

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [64]:
# import necessary libraries to load data from the database
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# import necessary libraries
import re
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download(['punkt', 'words', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'])

#import necessary libraries
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score, make_scorer

import time
import pickle

def load_data(sqlite_path, table_name='response_table'):
    '''
    Input:
    sqlite_path (str): SQLAlchemy database URL, e.g. 'sqlite:///DisasterResponse.db'
    table_name (str): name of the table to read (defaults to 'response_table')
    Output:
    X: a numpy array keeping the text to be classified (the 'message' column)
    Y: a multi-label dataframe holding every column from the fifth one onwards

    Description:
    Reads the curated data from the sql database and returns it as the
    X numpy array and the Y dataframe of target variables
    '''
    engine = create_engine(sqlite_path)
    try:
        df = pd.read_sql_table(table_name, con=engine)
    finally:
        # the table is fully loaded in memory, so release the engine's connections
        engine.dispose()
    X = df.loc[:, 'message'].values
    Y = df.iloc[:, 4:]

    return X, Y

def tokenize(text):
    """
    Input:
    a text string found in each record (str)
    Output:
    a list of stems (list of str)

    Description:
    Function that creates stems - word tokens
    1. replaces urls with the 'url' string
    2. lower-cases the text and replaces every non-alphanumeric character with a white space
    3. splits the text into words and drops the English stop words
    4. reduces words to their root form, lemmatizing every word as a verb
    5. reduces words to their stems - not necessarily words to be understood by humans

    Note: an earlier version also ran a part-of-speech tagger over the words, but
    the tags were never used (only the word itself was lemmatized), so that pass
    has been dropped; the returned stems are the same.
    """
    # regex pattern to identify an url (raw string, so '\(' is not read as a string escape)
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # replace urls with a 'url' string
    text = re.sub(url_regex, 'url', text)
    # text normalization - remove punctuation and lower case
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # load the stop words once per call instead of once per token
    stop_words = set(stopwords.words("english"))
    # tokenize text to words
    words = [w for w in word_tokenize(text) if w not in stop_words]
    # build the lemmatizer and the stemmer once and reuse them for every word
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # lemmatize each word as a verb, then reduce it to its stem
    stemmed = [stemmer.stem(lemmatizer.lemmatize(w, pos='v')) for w in words]

    return stemmed

def split_n_train(X, Y, test_size=0.4, random_state=None):
    """
    input:
    X: numpy array keeping the text to be classified
    Y: multi-label dataframe holding one column per category to classify
    test_size: fraction of the rows held out for testing (default 0.4)
    random_state: optional seed forwarded to train_test_split so that the
        split can be reproduced (default None keeps the split random)

    output:
    fitted: the pipeline trained on the training split
    X_train: part of the X array used for training (60% with the default test_size)
    X_test: remaining part of the X array, kept for testing
    y_train: rows of the Y dataframe matching X_train
    y_test: rows of the Y dataframe matching X_test

    Description
    splits the data and trains a TF-IDF -> MultiOutput AdaBoost pipeline on
    the training part, printing the training time in minutes
    """
    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
                ('tfidf_transformer', TfidfTransformer())
            ]))

        ])),

        ('classifier', MultiOutputClassifier(AdaBoostClassifier()))
    ])
    # perf_counter is monotonic, so the measurement is not affected by clock adjustments
    start = time.perf_counter()
    # train classifier
    fitted = pipeline.fit(X_train, y_train)
    elapsed_minutes = (time.perf_counter() - start) / 60
    print(f"Model calculation time: {round(elapsed_minutes)} minutes")

    return fitted, X_train, X_test, y_train, y_test

def eval_model(X_test, fitted_model):
    """
    input:
    X_test: the inputs to run the model on
    fitted_model: an already trained model exposing a predict method

    output:
    y_pred: whatever fitted_model.predict returns for X_test

    Description:
    applies the trained model to the test data and prints the time
    the prediction took, rounded to whole minutes
    """
    t_begin = time.time() / 60
    outcome = fitted_model.predict(X_test)
    whole_minutes = round(time.time() / 60 - t_begin)
    print(f"Model evaluation time: {whole_minutes} minutes")

    return outcome

def save_model(fitted_model, model_filepath):
    '''
    Pickle the fitted model to disk.

    Input:
    fitted_model: the object to serialize
    model_filepath (str): path of the pickle file to write (overwritten if present)
    '''
    with open(model_filepath, 'wb') as handle:
        pickle.dump(fitted_model, handle)
        
        
def main(sqlite_path='sqlite:///DisasterResponse.db', model_filepath='classifier'):
    """
    Run the whole training workflow: load, split and train, evaluate, report, save.

    Input:
    sqlite_path (str): SQLAlchemy URL of the database holding the messages
    model_filepath (str): path of the pickle file the fitted model is written to
        (taken as a parameter so the function no longer relies on a global
        variable of the same name being defined beforehand)
    """
    X, Y = load_data(sqlite_path)
    fitted_model, X_train, X_test, y_train, y_test = split_n_train(X, Y)
    y_pred = eval_model(X_test, fitted_model)
    # NOTE(review): display_results is defined in an earlier notebook cell, not in
    # this script section - copy it over when moving this code into train.py
    display_results(y_test, y_pred, fitted_model)
    save_model(fitted_model, model_filepath)


# only run the workflow when executed as a script / notebook cell, not on import
if __name__ == '__main__':
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Model calculation time: 3 minutes
Model evaluation time: 1 minutes
Accuracy for each predictor:
request                   0.888206
offer                     0.994333
aid_related               0.759124
medical_help              0.923838
medical_products          0.953035
search_and_rescue         0.973780
security                  0.980599
military       

Notes

In [None]:
# import libraries
import pandas as pd
import re
from sqlalchemy import create_engine

In [None]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('response_table',con=engine)

In [None]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('response_table',con=engine)
X = df.loc[:,'message'].values
Y = df.iloc[:,4:]

In [None]:
# regex pattern to identify a url
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# A dictionary comprehension mapping a dataframe index to the list of urls found
# in that message, kept in case we wish to process them further later.
# Only messages containing more than one url are kept (len(...) > 1), and the key
# is the index of the first row whose message equals the text.
# NOTE(review): the dataframe is filtered once per message, which presumably gets
# slow on a large table - confirm before reusing this outside of exploration.
urls_dict = \
{df[df.message==text].index[0]:re.findall(url_regex, text) for text in df.message if len(re.findall(url_regex, text))>1}

In [None]:
# lookup check out a complicated text to work on
text = df.message[12409]
text

In [None]:
# We use the text shown above to look up its index in the data frame, which will
# later help us identify the urls and where they are located in the dataframe.
df[df.message == 'Wind 16.0 mph NNE. Barometer 982.09 mb, gust 31.0, Temp 56.9 &amp;deg;F. Rain 0.00 in. Humidity 95% hurricane cam http://t.co/Sq2ekENuWind 16.0 mph NNE. Barometer 981.68 mb, gust 26.0, Temp 57.1 &amp;deg;F. Rain 0.00 in. Humidity 95% hurricane cam http://t.co/Sq2ekENu'].index

In [None]:
# Index in pandas is basic object storing axis labels for all pandas objects - we need the index value
type(df[df.message == 'Wind 16.0 mph NNE. Barometer 982.09 mb, gust 31.0, Temp 56.9 &amp;deg;F. Rain 0.00 in. Humidity 95% hurricane cam http://t.co/Sq2ekENuWind 16.0 mph NNE. Barometer 981.68 mb, gust 26.0, Temp 57.1 &amp;deg;F. Rain 0.00 in. Humidity 95% hurricane cam http://t.co/Sq2ekENu'].index[0])

In [None]:
# replace all url found in each text with the string "url"
text = re.sub(url_regex, 'url', text)
text

In [None]:
# normalize with lower case
text = text.lower()
text

In [None]:
# remove punctuation
text = re.sub(r"[^a-zA-Z0-9]", " ", text)
text

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
nltk.download(['punkt', 'words', 'stopwords', 'averaged_perceptron_tagger'])

In [None]:
from nltk.corpus import stopwords
print(stopwords.words("english"))

In [None]:
# remove stopwords
words = [w for w in word_tokenize(text) if w not in stopwords.words("english")]
words

In [None]:
# import "Parts of Speech": POS and Name Entity Recognition NER
from nltk import pos_tag, ne_chunk
words = pos_tag(words)
words

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
# Reduce words to their root form
lemmed = [WordNetLemmatizer().lemmatize(w[0], pos = 'v') for w in words]
lemmed

In [None]:
from nltk.stem.porter import PorterStemmer

# Reduce words to their stems
stemmed = [PorterStemmer().stem(w) for w in lemmed]
stemmed

In [None]:
# credit to https://github.com/iris-theof/Disaster_response_pipeline
def tokenize(text):
    '''
    Function that splits text into words and returns the root form of the words
    after removing the stop words

    Input: text(str): the message
    Output: lemm(list of str): a list of the root form of the message words
    '''
    # Regex to find urls (raw string, so '\(' is not read as a string escape)
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Replaces every url found in the provided text with the "urlplaceholder"
    for url in re.findall(url_regex, text):
        text = text.replace(url, "urlplaceholder")

    # Normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Extracts the word tokens from the provided text
    tokens = word_tokenize(text)

    # Stop words as a set, so each membership test is a constant-time lookup
    stop = set(stopwords.words("english"))

    # Lemmatizer to remove inflectional and derivationally related forms of a word
    lemmatizer = WordNetLemmatizer()

    # Makes a list of clean tokens, skipping the stop words
    clean_tokens = [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in tokens
        if tok not in stop
    ]

    return clean_tokens


In [None]:
import re
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download(['punkt', 'words', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'])
def tokenize(text):
    """
    Input:
    a text string found in each record (str)
    Output:
    a list of stems (list of str)

    Description:
    Function that creates stems - word tokens
    1. replaces urls with the 'url' string
    2. lower-cases the text and replaces every non-alphanumeric character with a white space
    3. splits the text into words and drops the English stop words
    4. reduces words to their root form, lemmatizing every word as a verb
    5. reduces words to their stems - not necessarily words to be understood by humans

    Note: an earlier version also ran a part-of-speech tagger over the words, but
    the tags were never used (only the word itself was lemmatized), so that pass
    has been dropped; the returned stems are the same.
    """
    # regex pattern to identify an url (raw string, so '\(' is not read as a string escape)
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # replace urls with a 'url' string
    text = re.sub(url_regex, 'url', text)
    # text normalization - remove punctuation and lower case
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # load the stop words once per call instead of once per token
    stop_words = set(stopwords.words("english"))
    # tokenize text to words
    words = [w for w in word_tokenize(text) if w not in stop_words]
    # build the lemmatizer and the stemmer once and reuse them for every word
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # lemmatize each word as a verb, then reduce it to its stem
    stemmed = [stemmer.stem(lemmatizer.lemmatize(w, pos='v')) for w in words]

    return stemmed

In [None]:
text = df.message[12409]
text

In [None]:
text = tokenize(text)
text

 `CountVectorizer` (Bag of Words)

In [None]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
clf = RandomForestClassifier()

# Fit and/or transform each to the data
X_train_counts = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_counts)
# the training labels returned by split_n_train are bound to `y_train`
# (lower case); `Y_train` is not defined anywhere in this notebook
clf.fit(X_train_tfidf, y_train)

In [None]:
# convert sparse matrix to numpy array to view
X_train_counts.toarray()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# initialize tf-idf vectorizer object
vectorizer = TfidfVectorizer()

In [None]:
X_test_vect = vectorizer.fit_transform(X_test)

In [None]:
# Transform test data
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

# Predict test labels
y_pred = clf.predict(X_test_tfidf)

`TfidfTransformer`

In [None]:
# initialize tf-idf transformer object
transformer = TfidfTransformer(smooth_idf=False)

In [None]:
# use counts from count vectorizer results to compute tf-idf values
# NOTE(review): `X_count` is not defined anywhere in this notebook; presumably the
# CountVectorizer output `X_train_counts` was intended - confirm before running.
tfidf = transformer.fit_transform(X_count)

In [None]:
# convert sparse matrix to numpy array to view
tfidf.toarray()

`TfidfVectorizer` = `CountVectorizer` + `TfidfTransformer`

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# initialize tf-idf vectorizer object
vectorizer = TfidfVectorizer()

In [None]:
# compute bag of word counts and tf-idf values
X_vect = vectorizer.fit_transform(X_train)

In [None]:
# convert sparse matrix to numpy array to view
X_vect.toarray()

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
# the training labels are bound to `y_train` (lower case); `Y_train` is never defined
clf.fit(X_vect, y_train)

In [None]:
start = time.time()/60
# train classifier
pipeline.fit(X_train, y_train)
stop = time.time()/60
print(f"Model calculation time: {round(stop - start)} minutes") 

In [None]:
pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(GradientBoostingClassifier(max_depth=6)))
    ])

In [None]:
# choose parameters
parameters = {'clf__estimator__n_estimators': [100, 140]}

    # create grid search object
model = GridSearchCV(pipeline, param_grid=parameters, scoring='recall_micro', cv=4)

In [29]:
from sklearn.base import BaseEstimator,TransformerMixin

# Build a custom transformer which will extract the starting verb of a sentence
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """
    Starting Verb Extractor class

    This class flags the messages in which at least one sentence starts
    with a verb, creating a new boolean feature for the ML classifier
    """

    def starting_verb(self, text):
        """
        Input: text (str): one message
        Output: True if the first token of any sentence is tagged 'VB'/'VBP'
        or is the token 'rt', False otherwise
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            if not pos_tags:
                # the sentence held nothing but stop words / punctuation, so
                # there is no first token to inspect (indexing would raise)
                continue
            first_word, first_tag = pos_tags[0]
            # tokenize() lower-cases the text, so the 'RT' marker (presumably a
            # retweet prefix) has to be matched case-insensitively to ever fire
            if first_tag in ['VB', 'VBP'] or first_word.lower() == 'rt':
                return True
        return False

    # Nothing is learned from the data, so fitting simply returns self
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Return a one-column dataframe with the starting_verb flag of every message in X."""
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [None]:
start = time.time()/60
# train classifier
pipeline.fit(X_train, y_train)
stop = time.time()/60
print(f"Model calculation time: {round(stop - start)} minutes") 