# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [23]:
# import libraries
import nltk
nltk.download(['punkt', 'wordnet'])

import os
import re
import numpy as np
import pandas as pd
import sqlite3
from sqlalchemy import create_engine,inspect

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import hamming_loss, confusion_matrix, classification_report, precision_recall_fscore_support, accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sinde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sinde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Move to datasets folder
original_directory = os.getcwd()
dataset_directory = './dataset'
os.chdir(dataset_directory)

In [3]:
#Look for the tables name in the SQL database
engine = create_engine('sqlite:///DisasterResponse.db')

# Create an inspector
inspector = inspect(engine)

# Get the list of table names
table_names = inspector.get_table_names()

table_names

['messages']

In [4]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql("SELECT * FROM messages", engine)
X = df['message']
Y = df.drop(['id','message','original','genre'], axis=1)

In [5]:
X.head()

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

In [6]:
Y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Write a tokenization function to process your text data

In [7]:
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### 3. Build a machine learning pipeline
I´ve used the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html)

In [8]:
pipeline = Pipeline([
    ('vect',  CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf',   MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline

In [50]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Train the model
pipeline.fit(X_train, Y_train)

# Predict on test data
Y_pred = pipeline.predict(X_test)

### 5. Test your model

In [60]:
# # Classification report for each label
for i, column in enumerate(Y.columns):
    print(f'\nClassification Report for {column}:')
    print(classification_report(Y_test[column], Y_pred[:, i]))


Classification Report for related:
              precision    recall  f1-score   support

           0       0.73      0.26      0.39      1873
           1       0.80      0.97      0.88      5934
           2       0.60      0.05      0.10        58

    accuracy                           0.79      7865
   macro avg       0.71      0.43      0.45      7865
weighted avg       0.78      0.79      0.75      7865


Classification Report for request:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      6533
           1       0.90      0.41      0.56      1332

    accuracy                           0.89      7865
   macro avg       0.89      0.70      0.75      7865
weighted avg       0.89      0.89      0.87      7865


Classification Report for offer:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7829
           1       0.00      0.00      0.00        36

    accuracy          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7787
           1       0.00      0.00      0.00        78

    accuracy                           0.99      7865
   macro avg       0.50      0.50      0.50      7865
weighted avg       0.98      0.99      0.99      7865


Classification Report for shops:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7837
           1       0.00      0.00      0.00        28

    accuracy                           1.00      7865
   macro avg       0.50      0.50      0.50      7865
weighted avg       0.99      1.00      0.99      7865


Classification Report for aid_centers:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      7762
           1       0.00      0.00      0.00       103

    accuracy                           0.99      7865
   macro avg       0.49      0.50      0.50      7865


In [61]:
# Initialize lists to store the precision, recall, and f1-score for each label
precision_list = []
recall_list = []
f1_list = []

# Calculate precision, recall, and f1-score for each label
for i, column in enumerate(Y.columns):
    precision = precision_score(Y_test[column], Y_pred[:, i], average='weighted', zero_division=0)
    recall = recall_score(Y_test[column], Y_pred[:, i], average='weighted', zero_division=0)
    f1 = f1_score(Y_test[column], Y_pred[:, i], average='weighted', zero_division=0)
    
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Compute macro averages
precision_macro = np.mean(precision_list)
recall_macro = np.mean(recall_list)
f1_macro = np.mean(f1_list)

# Overall metrics
overall_accuracy = (Y_pred == Y_test).mean().mean()

print(f'Overall Accuracy: {overall_accuracy:.4f}')
print(f'Macro Average Precision: {precision_macro:.4f}')
print(f'Macro Average Recall: {recall_macro:.4f}')
print(f'Macro Average F1 Score: {f1_macro:.4f}')

Overall Accuracy: 0.9452
Macro Average Precision: 0.9364
Macro Average Recall: 0.9452
Macro Average F1 Score: 0.9307


### 6. Improve model

### 7. Test your model

### 9. Export model as a pickle file

# ML Pipeline Preparation - One Function