# ML Pipeline Preparation
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [25]:
# import libraries
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
import pandas as pd
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import multioutput
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import re
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# load data from database
engine = create_engine('sqlite:///InsertDatabaseName.db')
df = pd.read_sql ('SELECT * FROM InsertTableName', engine)
X = df ['message']
y = df.iloc[:,4:]
display (y.head (n=3))

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Write a tokenization function to process your text data

In [14]:
def tokenize(text):
    #normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # stopword list 
    stop_words = stopwords.words("english")
    
    #tokenize
    words = word_tokenize(text)
    
    #stemming
    stemmed = [PorterStemmer().stem(w) for w in words]
    
    #lemmatizing
    words_lemmed = [WordNetLemmatizer().lemmatize(w) for w in stemmed if w not in stop_words]
   
    return words_lemmed

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. 

In [15]:
pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier (RandomForestClassifier()))
        ])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 22)
# train classifier
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [17]:
y_pred = pipeline.predict(X_test)
# Get results and add them to a dataframe.
def get_results(y_test, y_pred):
    results = pd.DataFrame(columns=['Category', 'f_score', 'precision', 'recall'])
    num = 0
    for cat in y_test.columns:
        precision, recall, f_score, support = precision_recall_fscore_support(y_test[cat], y_pred[:,num], average='weighted')
        results.set_value(num+1, 'Category', cat)
        results.set_value(num+1, 'f_score', f_score)
        results.set_value(num+1, 'precision', precision)
        results.set_value(num+1, 'recall', recall)
        num += 1
    print('Aggregated f_score:', results['f_score'].mean())
    print('Aggregated precision:', results['precision'].mean())
    print('Aggregated recall:', results['recall'].mean())
    return results

In [18]:
results = get_results(y_test, y_pred)
results

Aggregated f_score: 0.932966807433
Aggregated precision: 0.93445859964
Aggregated recall: 0.944864035534


  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  'precision', 'predicted', average, warn_for)


Unnamed: 0,Category,f_score,precision,recall
1,related,0.797554,0.79432,0.807598
2,request,0.868622,0.875084,0.882972
3,offer,0.994053,0.992082,0.996033
4,aid_related,0.743114,0.748337,0.748551
5,medical_help,0.895275,0.895609,0.921422
6,medical_products,0.93364,0.948917,0.952701
7,search_and_rescue,0.963121,0.962147,0.97162
8,security,0.972468,0.963711,0.981385
9,military,0.95596,0.959599,0.969179
10,child_alone,1.0,1.0,1.0


### 6. Improving the model
Use grid search to find better parameters. 

In [19]:
parameters = {'clf__estimator__max_depth': [1, 2, None]}

cv = GridSearchCV(pipeline, parameters)

In [20]:
model = cv
model.fit(X_train.as_matrix(), y_train.as_matrix())
y_pred_tuned = model.predict(X_test)
#converting to a dataframe
#y_pred_tuned = pd.DataFrame(y_pred_tuned, columns = y_test.columns)



  


### 7. Test your model
Shows the accuracy, precision, and recall of the tuned model.  


In [21]:
results_tuned = get_results(y_test, y_pred_tuned)
results_tuned

Aggregated f_score: 0.932282535943
Aggregated precision: 0.933908247539
Aggregated recall: 0.944491065677


  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  'precision', 'predicted', average, warn_for)


Unnamed: 0,Category,f_score,precision,recall
1,related,0.793698,0.790201,0.803937
2,request,0.862909,0.870098,0.878853
3,offer,0.994053,0.992082,0.996033
4,aid_related,0.739279,0.747529,0.746262
5,medical_help,0.897521,0.899631,0.922643
6,medical_products,0.934899,0.943556,0.952548
7,search_and_rescue,0.962419,0.963266,0.972078
8,security,0.972239,0.963703,0.980928
9,military,0.958753,0.959427,0.969484
10,child_alone,1.0,1.0,1.0


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [27]:
new_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', multioutput.MultiOutputClassifier (AdaBoostClassifier()))
        ])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 22)
# train classifier
new_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...mator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
           n_jobs=1))])

In [29]:
y_pred_tuned_ada = new_pipeline.predict(X_test)
results_tuned = get_results(y_test, y_pred_tuned_ada)
results_tuned

Aggregated f_score: 0.940272096531
Aggregated precision: 0.94022097111
Aggregated recall: 0.947979181501


  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,Category,f_score,precision,recall
1,related,0.721412,0.742689,0.774794
2,request,0.882093,0.883332,0.890906
3,offer,0.993519,0.992077,0.994965
4,aid_related,0.757429,0.762564,0.762283
5,medical_help,0.918307,0.917655,0.930729
6,medical_products,0.947268,0.946316,0.955447
7,search_and_rescue,0.964356,0.963521,0.972078
8,security,0.974579,0.973006,0.981385
9,military,0.968348,0.967079,0.972383
10,child_alone,1.0,1.0,1.0


### 9. Export your model as a pickle file

In [30]:
pickle.dump(new_pipeline, open('final_model.sav', 'wb'))