# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [63]:
# import libraries
import nltk
nltk.download(['punkt', 'wordnet'])

import os
import re
import numpy as np
import pandas as pd
import sqlite3
from sqlalchemy import create_engine,inspect

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import hamming_loss, confusion_matrix, classification_report, precision_recall_fscore_support, accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sinde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sinde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Move to datasets folder
original_directory = os.getcwd()
dataset_directory = './dataset'
os.chdir(dataset_directory)

In [3]:
#Look for the tables name in the SQL database
engine = create_engine('sqlite:///DisasterResponse.db')

# Create an inspector
inspector = inspect(engine)

# Get the list of table names
table_names = inspector.get_table_names()

table_names

['messages']

In [4]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql("SELECT * FROM messages", engine)
X = df['message']
Y = df.drop(['id','message','original','genre'], axis=1)

In [5]:
X.head()

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

In [6]:
Y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Write a tokenization function to process your text data

In [7]:
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### 3. Build a machine learning pipeline
I´ve used the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html)

In [8]:
pipeline = Pipeline([
    ('vect',  CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf',   MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline

In [50]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Train the model
pipeline.fit(X_train, Y_train)

# Predict on test data
Y_pred = pipeline.predict(X_test)

### 5. Test your model

In [61]:
# Initialize lists to store the precision, recall, and f1-score for each label
precision_list = []
recall_list = []
f1_list = []

# Calculate precision, recall, and f1-score for each label
for i, column in enumerate(Y.columns):
    precision = precision_score(Y_test[column], Y_pred[:, i], average='weighted', zero_division=0)
    recall = recall_score(Y_test[column], Y_pred[:, i], average='weighted', zero_division=0)
    f1 = f1_score(Y_test[column], Y_pred[:, i], average='weighted', zero_division=0)
    
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Compute macro averages
precision_macro = np.mean(precision_list)
recall_macro = np.mean(recall_list)
f1_macro = np.mean(f1_list)

# Overall metrics
overall_accuracy = (Y_pred == Y_test).mean().mean()

print(f'Overall Accuracy: {overall_accuracy:.4f}')
print(f'Macro Average Precision: {precision_macro:.4f}')
print(f'Macro Average Recall: {recall_macro:.4f}')
print(f'Macro Average F1 Score: {f1_macro:.4f}')

Overall Accuracy: 0.9452
Macro Average Precision: 0.9364
Macro Average Recall: 0.9452
Macro Average F1 Score: 0.9307


### 6. Improve model

In [72]:
parameters_initial = {
    'clf__estimator__n_estimators': [50, 100, 200],
    'clf__estimator__min_samples_split': [2, 4, 6]
}

cv = GridSearchCV(pipeline, param_grid=parameters_initial, cv=2, verbose=3)
cv.fit(X_train, Y_train)

print("Initial Best Parameters:", cv.best_params_)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV 1/2] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50;, score=0.234 total time=  50.9s
[CV 2/2] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50;, score=0.233 total time=  57.7s
[CV 1/2] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100;, score=0.233 total time= 1.9min
[CV 2/2] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100;, score=0.232 total time= 2.0min
[CV 1/2] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=200;, score=0.237 total time= 3.7min
[CV 2/2] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=200;, score=0.234 total time=12.5min
[CV 1/2] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50;, score=0.230 total time=  54.0s
[CV 2/2] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50;, score=0.229 total time=  57.2s
[CV 1/2] END clf

In [None]:
#Considering performance (time and criteria 'gini index' for RandomForestClassifier Machine Learning Model)
pipeline.set_params(clf__estimator__n_estimators=50,clf__estimator__min_samples_split=2)

# Train the model
pipeline.fit(X_train, Y_train)

# Predict on test data
Y_pred = pipeline.predict(X_test)

# Initialize lists to store the precision, recall, and f1-score for each label
precision_list = []
recall_list = []
f1_list = []

# Calculate precision, recall, and f1-score for each label
for i, column in enumerate(Y.columns):
    precision = precision_score(Y_test[column], Y_pred[:, i], average='weighted', zero_division=0)
    recall = recall_score(Y_test[column], Y_pred[:, i], average='weighted', zero_division=0)
    f1 = f1_score(Y_test[column], Y_pred[:, i], average='weighted', zero_division=0)
    
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Compute macro averages
precision_macro = np.mean(precision_list)
recall_macro = np.mean(recall_list)
f1_macro = np.mean(f1_list)

# Overall metrics
overall_accuracy = (Y_pred == Y_test).mean().mean()

print(f'Overall Accuracy: {overall_accuracy:.4f}')
print(f'Macro Average Precision: {precision_macro:.4f}')
print(f'Macro Average Recall: {recall_macro:.4f}')
print(f'Macro Average F1 Score: {f1_macro:.4f}')

### 7. Test your model - Text the model effectiveness using a new data

In [124]:
# import datsets
twitter = pd.read_csv('Twitter-sentiment-self-drive-DFE.csv')

In [128]:
#Lets look the data
for i in twitter.text.head(10):
    print(i)
#It looks like there are many situations where text doesn't seem related to disasters. We should have considered this before in our ML

Two places I'd invest all my money if I could: 3D printing and Self-driving cars!!!
Awesome! Google driverless cars will help the blind travel more often; https://t.co/QWuXR0FrBpv
If Google maps can't keep up with road construction, how am I supposed to trust a driverless car to get around here?
Autonomous cars seem way overhyped given the technology challenges; pilotless planes seem much more doable and needed.
Just saw Google self-driving car on I-34. It was painted green and blue.
Will driverless cars eventually replace taxi drivers in cities?
Chicago metro expected to be fully autonomous by 2020
I love the infotainment system in my new car. This thing can almost drive itself.
Autonomous vehicles could reduce traffic fatalities by 90%...I'm in!
Driverless cars are not worth the risk.  Don't want to be on the highway when the server crashes #SadMacFace #BlueScreenofDeath


In [129]:
# Predict on new data
new_predictions = pipeline.predict(twitter.text)
teste = pd.concat([twitter.text,pd.DataFrame(new_predictions)],axis=1)
teste.columns = teste.columns = ['text'] + list(Y.columns)
analysis = teste.iloc[:,1:].sum() / teste.shape[0]
analysis

# As we can see, most of the data is considered related, but we have seen that this is not the case

related                   0.975266
request                   0.000419
offer                     0.000000
aid_related               0.005450
medical_help              0.000000
medical_products          0.000000
search_and_rescue         0.000000
security                  0.000000
military                  0.000279
child_alone               0.000000
water                     0.000000
food                      0.000140
shelter                   0.000000
clothing                  0.000000
money                     0.000000
missing_people            0.000000
refugees                  0.000000
death                     0.000000
other_aid                 0.000000
infrastructure_related    0.000000
transport                 0.000838
buildings                 0.000000
electricity               0.000000
tools                     0.000000
hospitals                 0.000000
shops                     0.000000
aid_centers               0.000000
other_infrastructure      0.000000
weather_related     

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

### 9. Export model as a pickle file

# ML Pipeline Preparation - One Function