In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gabrielpilao1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabrielpilao1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gabrielpilao1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

In [5]:
engine = create_engine('sqlite:///etl_figure_eight.db')

In [6]:
df = pd.read_sql_table('etl_figure_eight', 'sqlite:///etl_figure_eight.db')
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df['genre'].value_counts()

news      13036
direct    10747
social     2394
Name: genre, dtype: int64

In [8]:
Y = df.iloc[:,4:]

In [9]:
X = df['message']

In [10]:
def tokenize(text):
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize andremove stop words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]

    return tokens

In [11]:
pipeline = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator = RandomForestClassifier())),
                    ])

In [12]:
from sklearn.model_selection import train_test_split

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

#spliting the training set again to reduce training time
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(X_train, y_train, train_size = 6000)

# train classifier
pipeline.fit(X_train_sub, y_train_sub)

# predict on test data
y_pred = pipeline.predict(X_test_sub)

# display results
accuracy = (y_pred == y_test_sub).mean()

In [13]:
category_names = Y.columns

In [14]:
y_pred[1]

array(['1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], dtype=object)

In [15]:
y_test_np = y_test_sub.values

In [16]:
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.metrics import classification_report

In [17]:
for i, c in enumerate(category_names): 
    print(c) 
    print(classification_report(y_test_np[i], y_pred[i]))

related
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        32
           1       1.00      0.50      0.67         4

    accuracy                           0.94        36
   macro avg       0.97      0.75      0.82        36
weighted avg       0.95      0.94      0.94        36

request
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        33
           1       1.00      0.33      0.50         3

    accuracy                           0.94        36
   macro avg       0.97      0.67      0.74        36
weighted avg       0.95      0.94      0.93        36

offer
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        32
           1       1.00      0.75      0.86         4

    accuracy                           0.97        36
   macro avg       0.98      0.88      0.92        36
weighted avg       0.97      0.97      0.97        3

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00         1

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'clf__estimator__n_estimators': [50, 100, 150, 200],
    'clf__estimator__min_samples_leaf': [5, 10, 20, 50],
    'clf__estimator__max_features': [0.5, 1, 2]
}

cv = GridSearchCV(pipeline, parameters)

In [None]:
cv.fit(X_train_sub, y_train_sub)

In [None]:
y_pred_grid_sub = cv.predict(X_test_sub)

In [None]:
for i, c in enumerate(category_names): 
    print(c) 
    print(classification_report(y_test_np[i], y_pred_grid_sub[i]))

In [None]:
pipeline_svc = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator = SVC())),
                    ])

# train classifier
pipeline_svc.fit(X_train_sub, y_train_sub)

# predict on test data
y_pred_svc = pipeline_svc.predict(X_test_sub)

In [None]:
for i, c in enumerate(category_names): 
    print(c) 
    print(classification_report(y_test_np[i], y_pred_svc[i]))

In [None]:
pipeline_adaboost = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator = AdaBoostClassifier())),
                    ])

# train classifier
pipeline_adaboost.fit(X_train_sub, y_train_sub)

# predict on test data
y_pred_adaboost = pipeline_adaboost.predict(X_test_sub)


In [None]:
for i, c in enumerate(category_names): 
    print(c) 
    print(classification_report(y_test_np[i], y_pred_adaboost[i]))