In [13]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gabrielpilao1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabrielpilao1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gabrielpilao1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

In [27]:
engine = create_engine('sqlite:///../data/DisasterResponse.db')

In [28]:
df = pd.read_sql_table('mess', engine)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
df['genre'].value_counts()

news      13128
direct    10852
social     2406
Name: genre, dtype: int64

In [30]:
Y = df.iloc[:,4:]

In [31]:
X = df['message']

In [32]:
def tokenize(text):
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize andremove stop words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]

    return tokens

In [33]:
pipeline = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator = RandomForestClassifier())),
                    ])

In [34]:
from sklearn.model_selection import train_test_split

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# train classifier
pipeline.fit(X_train, y_train)

# predict on test data
y_pred = pipeline.predict(X_test)

# display results
accuracy = (y_pred == y_test).mean()

In [35]:
category_names = Y.columns

In [36]:
y_test_np = y_test.values

In [39]:
print(X_test)

18018    Leaflets have been handed out, with pictures o...
428      Hello, we thank every organization, every inst...
4189     I would like to get information on the earthqu...
5325     When do inscriptions start for the Fullbright ...
17743    **Project sites difficult to reach** Cordaid p...
                               ...                        
7585     if I have a peste where I will find a doctor f...
18305    On the day of our visit, preparations were und...
3529              A new friend says hi to all my friends. 
4420     What will we get from solidarity with 4636, wh...
604                 How to find help and what kind of help
Name: message, Length: 5278, dtype: object


In [40]:
print(y_pred)

[['1' '0' '0' ... '0' '0' '0']
 ['1' '0' '0' ... '0' '0' '0']
 ['1' '0' '0' ... '0' '0' '0']
 ...
 ['1' '0' '0' ... '0' '0' '0']
 ['1' '0' '0' ... '0' '0' '0']
 ['1' '1' '0' ... '0' '0' '0']]


In [41]:
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.metrics import classification_report

In [42]:
for i, c in enumerate(category_names): 
    print(c) 
    print(classification_report(y_test_np[i], y_pred[i]))

related
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        33
           1       1.00      0.33      0.50         3

    accuracy                           0.94        36
   macro avg       0.97      0.67      0.74        36
weighted avg       0.95      0.94      0.93        36

request
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        31
           1       1.00      0.40      0.57         5

    accuracy                           0.92        36
   macro avg       0.96      0.70      0.76        36
weighted avg       0.92      0.92      0.90        36

offer
              precision    recall  f1-score   support

           0       0.87      0.96      0.92        28
           1       0.80      0.50      0.62         8

    accuracy                           0.86        36
   macro avg       0.84      0.73      0.77        36
weighted avg       0.86      0.86      0.85        3

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.91      1.00      0.96        32
           1       1.00      0.25      0.40         4

    accuracy                           0.92        36
   macro avg       0.96      0.62      0.68        36
weighted avg       0.92      0.92      0.89        36

food
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00         1

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

shelter
              precision    recall  f1-score   support

           0       0.88      1.00      0.93        28
           1       1.00      0.50      0.67         8

    accuracy                           0.89        36
   macro avg       0.94      0.75      0.80        36
weighted avg       0.90      0.89      0.87        36

clothi

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.99        35
           1       0.00      0.00      0.00         1

    accuracy                           0.97        36
   macro avg       0.49      0.50      0.49        36
weighted avg       0.95      0.97      0.96        36

storm
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        33
           1       1.00      0.67      0.80         3

    accuracy                           0.97        36
   macro avg       0.99      0.83      0.89        36
weighted avg       0.97      0.97      0.97        36

fire
              precision    recall  f1-score   support

           0       0.86      1.00      0.92        30
           1       1.00      0.17      0.29         6

    accuracy                           0.86        36
   macro avg       0.93      0.58      0.60        36
weighted avg       0.88      0.86      0.82        36

earthqua

In [36]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'clf__estimator__n_estimators': [50, 100, 150, 200],
    'clf__estimator__min_samples_leaf': [5, 10, 20, 50],
    'clf__estimator__max_features': [0.5, 1, 2]
}

cv = GridSearchCV(pipeline, parameters)

In [37]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [38]:
pipeline_adaboost = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator = AdaBoostClassifier())),
                    ])

# train classifier
pipeline_adaboost.fit(X_train, y_train)

# predict on test data
y_pred_adaboost = pipeline_adaboost.predict(X_test)

In [39]:
for i, c in enumerate(category_names): 
    print(c) 
    print(classification_report(y_test_np[i], y_pred_adaboost[i]))

related
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        36
           1       0.00      0.00      0.00         0

    accuracy                           0.97        36
   macro avg       0.50      0.49      0.49        36
weighted avg       1.00      0.97      0.99        36

request
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00         1

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

offer
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        36
           1       0.00      0.00      0.00         0

    accuracy                           0.97        36
   macro avg       0.50      0.49      0.49        36
weighted avg       1.00      0.97      0.99        3

  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
import pickle
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)