# Import Libraries

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import string
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import pandas as pd
import json

# Load the Data

In [None]:
FEATURE_COLUMNS =['message']
TARGET_COLUMNS = [
    'related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 
    'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 
    'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 
    'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 
    'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
    'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 
    'other_weather', 'direct_report']

STOPWORDS_SET = set(stopwords.words('english'))
URL_REGEX = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
URL_PLACE_HOLDER = "urlplaceholder"

def load_data(db_filepath, table_name, feature_columns, target_columns):
    engine = create_engine(db_filepath)
    # Create a dataframe from the engine
    df = pd.read_sql_table(table_name, engine)
    X = df.message.values
    y = df[target_columns].values
    return X, y

#Load the data
X, y = load_data(
    'sqlite:///data/02_stg//stg_disaster_response.db',
    'stg_disaster_response',
    FEATURE_COLUMNS, 
    TARGET_COLUMNS
)

# Write a Function to Tokenize the Text

In [None]:
def tokenize(text):
    """
    This function is designed to tokenize the message data
    """
    # Detect and replace URLs
    text = re.sub(URL_REGEX, URL_PLACE_HOLDER, text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token not in STOPWORDS_SET]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = [lemmatizer.lemmatize(token.lower().strip()) for token in tokens]
    return cleaned_tokens


# Build a Machine Learning Pipeline

In [None]:
# Instantiate and configure the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)), # Tokenize and vectorize text
    ('tfidf', TfidfTransformer(smooth_idf=False)), # Apply TF-IDF transformation
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=9))) # Use MultiOutputClassifier with RandomForest, n_jobs specifies cores
])

# Train the Pipeline

In [None]:
#Note this code takes about a minute and 30 seconds to run with 9 cores

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Test the Model and Export the Test Results

In [None]:
# Assuming y_test and y_pred are your test labels and predicted labels respectively
y_pred = pipeline.predict(X_test)

# Create an empty list to store the results
results = []

for i, col in enumerate(TARGET_COLUMNS):
    report = classification_report(y_test[:, i], y_pred[:, i], output_dict=True, zero_division=0)
    for output_class, metrics in report.items():
        if isinstance(metrics, dict):  # Ensure metrics is a dictionary
            temp = metrics.copy()  # Create a copy of metrics to avoid modifying the original dictionary
            temp['output_class'] = output_class
            temp['category'] = col
            results.append(temp)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

#Re-arrange the columns so that category and output_class are the first two columns
results_df = results_df[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]
results_df.to_csv('data\\04_fct\\fct_prediction_results.csv', index=False)
results_df.head()

# Improve the Model

## Use GridSearch to find the best parameters based on the weighted f-1 score

In [None]:
from models.train_classifier import tokenize #This needs to be here otherwise there will be a memory error

pipeline = Pipeline([
    #The default ngram_range is (1, 1)
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer(smooth_idf=False)),
    # This defaults to n_estimators=100 and min_samples_split=2
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=9)))
])

#Set up the grid search parameters
parameters = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'clf__estimator__n_estimators': [50, 100, 200],
    'clf__estimator__min_samples_split': [2, 3, 4]
}

cv = GridSearchCV(pipeline, param_grid=parameters, scoring='f1_weighted', n_jobs=9)

#Note: this takes 164 minutes to run with 6 cores
cv.fit(X_train, y_train)

# Get the results of the grid search
cv_results = cv.cv_results_

# Prepare a list to store the results
results = []

# For each set of parameters, store the parameters and the associated mean test score
for params, mean_score in zip(cv_results['params'], cv_results['mean_test_score']):
    results.append({'params': params, 'score': mean_score})

# Write the results to a .json file
with open('data\\04_fct\\fct_weighted_f1_paramters.json', 'w') as f:
    json.dump(results, f)

## Inspect the json file and identify the best parameters

In [22]:
#will need to input the best parameters into the pipeline
pipeline_v2 = Pipeline([
    #The improved model considers 1-grams and 2-grams instead of just 1-grams
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(smooth_idf=False)),
    # It also increases the n_estimators from 100 to 200. The min_samples_split remains the same
    ('clf', MultiOutputClassifier(RandomForestClassifier(min_samples_split=2, n_estimators=200, n_jobs=9)))
])

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the pipeline to the training data - This takes about 9 minutes to run with 9 cores
pipeline_v2.fit(X_train, y_train)

# Assuming y_test and y_pred are your test labels and predicted labels respectively
y_pred_opt = pipeline_v2.predict(X_test)

# Create an empty list to store the results
results = []

for i, col in enumerate(TARGET_COLUMNS):
    report = classification_report(y_test[:, i], y_pred_opt[:, i], output_dict=True, zero_division=0)
    for output_class, metrics in report.items():
        if isinstance(metrics, dict):  # Ensure metrics is a dictionary
            temp = metrics.copy()  # Create a copy of metrics to avoid modifying the original dictionary
            temp['output_class'] = output_class
            temp['category'] = col
            results.append(temp)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

#Re-arrange the columns so that category and output_class are the first two columns
results_df = results_df[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]
results_df.to_csv('data\\04_fct\\fct_prediction_results_optimized.csv', index=False)
results_df.head()

Unnamed: 0,category,output_class,precision,recall,f1-score,support
0,related,0,0.713208,0.276047,0.398034,2054.0
1,related,1,0.806018,0.963422,0.877719,6534.0
2,related,2,0.574468,0.421875,0.486486,64.0
3,related,macro avg,0.697898,0.553781,0.587413,8652.0
4,related,weighted avg,0.782272,0.796232,0.760947,8652.0


# Compare the original parameters to the new parameters

In [54]:
#Import fct_prediction_results
df_results = pd.read_csv('data\\04_fct\\fct_prediction_results.csv')

#Import fct_prediction_results_optimized
df_results_optimized = pd.read_csv('data\\04_fct\\fct_prediction_results_optimized.csv')

# Set 'category' and 'output_class' as index
df_results.set_index(['category', 'output_class', 'support'], inplace=True)
df_results_optimized.set_index(['category', 'output_class'], inplace=True)

# Calculate percent change for specific columns
columns = ['precision', 'recall', 'f1-score']
epsilon = 1e-7
percent_change = round((df_results_optimized[columns] - df_results[columns]) / (df_results[columns] + epsilon) * 100, 2)

# Reset index
percent_change.reset_index(inplace=True)

#Re-arrange the columns so that support is the last column
percent_change = percent_change[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]

#Inspect the first few rows of the DataFrame
# output_class should be read as the values for 'related' plus the macro and weighted averages
percent_change.head()

Unnamed: 0,category,output_class,precision,recall,f1-score,support
0,aid_centers,0,-0.0,-0.01,-0.01,8537.0
1,aid_centers,1,0.0,0.0,0.0,115.0
2,aid_centers,macro avg,-0.0,-0.01,-0.01,8652.0
3,aid_centers,weighted avg,-0.0,-0.01,-0.01,8652.0
4,aid_related,0,-4.77,7.08,0.67,5107.0


A few notes on the percent change table...
- Support is the count of messages that fall into the category. Therefore, the sum of the support values for 0, 1, 2 should be equal to the total number of messages for the macro avg and weighted avg categories
- The output class contains two different categories of values. 
    - The values 0, 1 and 2 carry the same meaning as the 0, 1 and 2 values in the 'related' column in the original dataset (i.e. 0 is unrelated, 1, is related, 2 is ambiguous).
    - The values macro avg and weighted avg are the macro and weighted averages of the precision, recall and f1-score metrics for the category
- f1-score: F1-score is the harmonic mean of precision and recall. It provides a balance between precision and recall. It should typically range from 0 to 1.
- The weighted avg provides the weighted average of precision, recall, and F1-score across all classes. Weighted average takes into account the support for each class, giving higher weight to classes with more samples.
- The macro avg provides the unweighted average of precision, recall, and F1-score across all classes. It treats all classes equally, regardless of their support.

In [55]:
# From the percent_change DataFrame, filter where output_class = 'weighted avg'
percent_change_weighted_avg = percent_change[percent_change['output_class'] == 'weighted avg'].copy()

# Drop the precision and recall columns
percent_change_weighted_avg.drop(columns=['precision', 'recall'], inplace=True)

# Export the percent_change_weighted_avg DataFrame to a CSV file
percent_change_weighted_avg.to_csv('data\\04_fct\\fct_percent_change_weighted_avg.csv', index=False)

# Inspect the first few rows of the DataFrame
percent_change_weighted_avg.head()


Unnamed: 0,category,output_class,f1-score,support
3,aid_centers,weighted avg,-0.01,8652.0
7,aid_related,weighted avg,-1.82,8652.0
11,buildings,weighted avg,-0.14,8652.0
14,child_alone,weighted avg,0.0,8652.0
18,clothing,weighted avg,-0.24,8652.0


# Inspect the Results

In [58]:
# Calculate the metrics
performance_data = {
    "Metric": ["New Model Better Count", "Original Model Better Count", "No Difference Count", "Weighted Avg F1-Score"],
    "Value": [
        round((percent_change_weighted_avg['f1-score'] > 0).sum()),
        round((percent_change_weighted_avg['f1-score'] < 0).sum()),
        round((percent_change_weighted_avg['f1-score'] == 0).sum()),
        round((percent_change_weighted_avg['f1-score'] * percent_change_weighted_avg['support']).sum() / percent_change_weighted_avg['support'].sum(),2)
    ]
}

# Create a DataFrame for the performance
performance_df = pd.DataFrame(performance_data)

# Adding interpretation for weighted avg f1-score
performance_df.loc[performance_df['Metric'] == 'Weighted Avg F1-Score', 'Interpretation'] = performance_df['Value'].apply(
    lambda x: "New model performs better" if x > 0 else ("Original model performs better" if x < 0 else "No overall difference in performance")
)

performance_df

Unnamed: 0,Metric,Value,Interpretation
0,New Model Better Count,8.0,
1,Original Model Better Count,23.0,
2,No Difference Count,5.0,
3,Weighted Avg F1-Score,-0.4,Original model performs better


The above results are confusing to me. Why would the original model be better? Shouldn't the optimization result in a better score?

The problem is with the grid search. Apparently grid search defaults to 'accuracy' instead of the weighted average. I need to change my code so that my reports are in line with what I am searching for. 

Original paramters
{"clf__estimator__min_samples_split": 2, "clf__estimator__n_estimators": 100, "vect__ngram_range": [1, 1]}
Best parameters
{"clf__estimator__min_samples_split": 2, "clf__estimator__n_estimators": 200, "vect__ngram_range": [1, 2]}