# Import Libraries

In [1]:
#nltk downloads
# import nltk
# nltk.download('words')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('wordnet')

# Standard library imports
import gzip
import json
import os
import pickle
import re
import string
import time

# Third-party imports
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError

from IPython.display import display, Markdown

# Load the Data

In [8]:
FEATURE_COLUMNS =['message']
TARGET_COLUMNS = [
    'related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 
    'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 
    'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 
    'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 
    'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
    'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 
    'other_weather', 'direct_report']

STOPWORDS_SET = set(stopwords.words('english'))
URL_REGEX = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
URL_PLACE_HOLDER = "urlplaceholder"

def load_data(db_filepath, table_name, feature_columns, target_columns):
    engine = create_engine(db_filepath)
    # Create a dataframe from the engine
    df = pd.read_sql_table(table_name, engine)
    X = df.message.values
    y = df[target_columns].values
    return X, y

#Load the data
X, y = load_data(
    'sqlite:///data/02_stg//stg_disaster_response.db',
    'stg_disaster_response',
    FEATURE_COLUMNS, 
    TARGET_COLUMNS
)

_Note_: This is a multi-label classification problem because a message can belong to 0, 1 or multiple categories

# Write a Function to Tokenize the Text

In [9]:
def tokenize(text):
    """
    This function is designed to tokenize the message data
    """
    # Detect and replace URLs
    text = re.sub(URL_REGEX, URL_PLACE_HOLDER, text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token not in STOPWORDS_SET]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = [lemmatizer.lemmatize(token.lower().strip()) for token in tokens]
    return cleaned_tokens


# Build a Machine Learning Pipeline

In [10]:
# Instantiate and configure the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)), # Tokenize and vectorize text
    ('tfidf', TfidfTransformer(smooth_idf=False)), # Apply TF-IDF transformation
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=9))) # Use MultiOutputClassifier with RandomForest, n_jobs specifies cores
])

# Train the Pipeline

In [11]:
# Start the timer
start_time = time.time()

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# End the timer and calculate the runtime
end_time = time.time()
pipeline_runtime = end_time - start_time



# Test the Model and Export the Test Results

In [12]:
# Assuming y_test and y_pred are your test labels and predicted labels respectively
y_pred = pipeline.predict(X_test)

# Create an empty list to store the results
results = []

for i, col in enumerate(TARGET_COLUMNS):
    report = classification_report(y_test[:, i], y_pred[:, i], output_dict=True, zero_division=0)
    for output_class, metrics in report.items():
        if isinstance(metrics, dict):  # Ensure metrics is a dictionary
            temp = metrics.copy()  # Create a copy of metrics to avoid modifying the original dictionary
            temp['output_class'] = output_class
            temp['category'] = col
            results.append(temp)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

#Re-arrange the columns so that category and output_class are the first two columns
results_df = results_df[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]
results_df.to_csv('data\\04_fct\\fct_prediction_results.csv', index=False)
results_df.head()

Unnamed: 0,category,output_class,precision,recall,f1-score,support
0,related,0,0.702391,0.388155,0.5,2043.0
1,related,1,0.832462,0.948679,0.886779,6547.0
2,related,macro avg,0.767427,0.668417,0.693389,8590.0
3,related,weighted avg,0.801527,0.815367,0.79479,8590.0
4,request,0,0.905425,0.976705,0.939715,7126.0


# Improve the Model

## Use GridSearch to find the best parameters based on the Accuracy Score (the default for GridSearch)

In [13]:
# from models.train_classifier import tokenize #This needs to be here otherwise there will be a memory error

# pipeline = Pipeline([
#     #The default ngram_range is (1, 1)
#     ('vect', CountVectorizer(tokenizer=tokenize)),
#     ('tfidf', TfidfTransformer(smooth_idf=False)),
#     # This defaults to n_estimators=100 and min_samples_split=2
#     ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=9)))
# ])

# #Set up the grid search parameters
# parameters = {
#     'vect__ngram_range': ((1, 1), (1, 2)),
#     'clf__estimator__n_estimators': [50, 100, 200],
#     'clf__estimator__min_samples_split': [2, 3, 4]
# }

# cv = GridSearchCV(pipeline, param_grid=parameters, scoring='accuracy', n_jobs=9)

# # Fit the GridSearchCV object to the full training data
# cv.fit(X_train, y_train)

# # Get the results of the grid search
# cv_results = cv.cv_results_

# # Prepare a list to store the results
# results = []

# # For each set of parameters, store the parameters and the associated mean test score
# for params, mean_score in zip(cv_results['params'], cv_results['mean_test_score']):
#     results.append({'params': params, 'score': mean_score})

# for result in results:
#     print(result)
    
# # Write the results to a .json file
# with open('data\\04_fct\\fct_accurancy_parameters.json', 'w') as f:
#     json.dump(results, f)

## Accuracy Scoring Results
Here are the best parameters if we score by accuracy:
 - clf__estimator__min_samples_split: 2
 - clf__estimator__n_estimators: 200 
 - vect__ngram_range: [1, 2]

## Generate a Classification Report Based Upon the Best Parameters for Accuracy Scoring

#### Train a New Pipeline with the Optimized Parameters

In [14]:
# Start the timer
start_time = time.time()

#will need to input the best parameters into the pipeline
pipeline_v2 = Pipeline([
    #The improved model considers 1-grams and 2-grams instead of just 1-grams
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(smooth_idf=False)),
    # It also increases the n_estimators from 100 to 200. The min_samples_split remains the same
    ('clf', MultiOutputClassifier(RandomForestClassifier(min_samples_split=2, n_estimators=200, n_jobs=9)))
])

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the pipeline to the training data - This takes about 9 minutes to run with 9 cores
pipeline_v2.fit(X_train, y_train)

# End the timer and calculate the runtime
end_time = time.time()
pipeline_v2_runtime = end_time - start_time

### Test the Optimized Model and Export the Results

In [15]:
# Assuming y_test and y_pred are your test labels and predicted labels respectively
y_pred_opt = pipeline_v2.predict(X_test)

# Create an empty list to store the results
results = []

for i, col in enumerate(TARGET_COLUMNS):
    report = classification_report(y_test[:, i], y_pred_opt[:, i], output_dict=True, zero_division=0)
    for output_class, metrics in report.items():
        if isinstance(metrics, dict):  # Ensure metrics is a dictionary
            temp = metrics.copy()  # Create a copy of metrics to avoid modifying the original dictionary
            temp['output_class'] = output_class
            temp['category'] = col
            results.append(temp)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

#Re-arrange the columns so that category and output_class are the first two columns
results_df = results_df[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]
results_df.to_csv('data\\04_fct\\fct_prediction_results_optimized.csv', index=False)
results_df.head()

Unnamed: 0,category,output_class,precision,recall,f1-score,support
0,related,0,0.741655,0.250122,0.374085,2043.0
1,related,1,0.8061,0.972812,0.881645,6547.0
2,related,macro avg,0.773878,0.611467,0.627865,8590.0
3,related,weighted avg,0.790773,0.800931,0.760929,8590.0
4,request,0,0.903836,0.985265,0.942796,7126.0


## Compare the Original Accuracy Parameters to the New Accuracy Parameters

### Calculate the Percent Change per Category

In [36]:
#Import fct_prediction_results
df_results = pd.read_csv('data\\04_fct\\fct_prediction_results.csv')

#Import fct_prediction_results_optimized
df_results_optimized = pd.read_csv('data\\04_fct\\fct_prediction_results_optimized.csv')

# Set 'category' and 'output_class' as index
df_results.set_index(['category', 'output_class', 'support'], inplace=True)
df_results_optimized.set_index(['category', 'output_class'], inplace=True)

# Calculate percent change for specific columns
columns = ['precision', 'recall', 'f1-score']
epsilon = 1e-7
percent_change = round((df_results_optimized[columns] - df_results[columns]) / (df_results[columns] + epsilon) * 100, 2)

# Reset index
percent_change.reset_index(inplace=True)

#Re-arrange the columns so that support is the last column
percent_change = percent_change[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]

#Export the DataFrame to a .csv file
percent_change.to_csv('data\\04_fct\\fct_percent_change_results.csv', index=False)

#Inspect the first few rows of the DataFrame
percent_change.head()

Unnamed: 0,category,output_class,precision,recall,f1-score,support
0,aid_centers,0,0.21,0.0,0.1,5139.0
1,aid_centers,1,0.0,0.0,0.0,67.0
2,aid_centers,macro avg,0.21,0.0,0.1,5206.0
3,aid_centers,weighted avg,0.41,0.21,0.31,5206.0
4,aid_related,0,-7.08,6.76,-0.79,3085.0


### Calculate the Output Class Medians

In [37]:
def calculate_output_class_medians(df, output_file):
    """
    Calculate the median precision, recall, and f1-score by output class.

    Args:
    df (DataFrame): The DataFrame containing the results.
    output_file (str): The path to the file where the results will be saved.

    Returns:
    DataFrame: The DataFrame containing the median metrics by output class.
    """
    # Calculate the median precision, recall, and f1-score by output class
    output_class_medians = df.groupby('output_class')[['precision', 'recall', 'f1-score']].median().round(2)

    # Convert output_class_medians to a DataFrame
    output_class_medians = output_class_medians.reset_index()

    # Multiply precision, recall, and f1-score by 100 to get percentages and round to 2 decimal places.
    output_class_medians[['precision', 'recall', 'f1-score']] = output_class_medians[['precision', 'recall', 'f1-score']] * 100
    output_class_medians[['precision', 'recall', 'f1-score']] = output_class_medians[['precision', 'recall', 'f1-score']].round(2)

    # Save output_class_medians to a .csv file
    output_class_medians.to_csv(output_file, index=False)

    return output_class_medians

### Calculate the Percent Difference between the base and the optimized models

In [40]:
#Calculate the percent difference between the base and optimized models
def calculate_percent_difference(base_df, optimized_df, output_file):
    # Set 'output_class' as the index for calculation
    base_df.set_index('output_class', inplace=True)
    optimized_df.set_index('output_class', inplace=True)

    # Calculate percent change for specific columns
    columns = ['precision', 'recall', 'f1-score']
    epsilon = 1e-7
    percent_change = round((optimized_df[columns] - base_df[columns]) / (base_df[columns] + epsilon) * 100, 2)

    # Reset index to make 'output_class' a column again
    percent_change.reset_index(inplace=True)

    # Export the DataFrame to a .csv file
    percent_change.to_csv(output_file, index=False)

    return percent_change

In [39]:
base_median_metrics_df = calculate_output_class_medians(df_results, 'data\\04_fct\\fct_median_metrics_by_output_class_base.csv')
optimized_median_metrics_df = calculate_output_class_medians(df_results_optimized, 'data\\04_fct\\fct_median_metrics_by_output_class_optimized.csv')
percent_difference_df = calculate_percent_difference(base_median_metrics_df, optimized_median_metrics_df, 'data\\04_fct\\fct_percent_difference.csv')
percent_difference_df

Unnamed: 0,output_class,precision,recall,f1-score
0,0,0.0,0.0,0.0
1,1,4.0,-50.0,-50.0
2,macro avg,0.0,-3.7,-7.02
3,weighted avg,-1.04,0.0,-1.05


In [37]:
# Calculate the difference in training time between the original and optimized pipelines
training_time_difference = pipeline_v2_runtime - pipeline_runtime

# Convert the times to minutes and seconds
pipeline_runtime_minutes, pipeline_runtime_seconds = divmod(pipeline_runtime, 60)
pipeline_v2_runtime_minutes, pipeline_v2_runtime_seconds = divmod(pipeline_v2_runtime, 60)
training_time_difference_minutes, training_time_difference_seconds = divmod(training_time_difference, 60)

# Convert the minutes to integers
pipeline_runtime_minutes = int(pipeline_runtime_minutes)
pipeline_v2_runtime_minutes = int(pipeline_v2_runtime_minutes)
training_time_difference_minutes = int(training_time_difference_minutes)

display(Markdown(
    f'''<p style="font-size:16px">
    The optimized pipeline took {training_time_difference_minutes} minutes and {training_time_difference_seconds:.2f} seconds longer to train than the original pipeline
    (from {pipeline_runtime_minutes} minutes and {pipeline_runtime_seconds:.2f} seconds to {pipeline_v2_runtime_minutes} minutes and {pipeline_v2_runtime_seconds:.2f} seconds).
    </p>
    '''
))

<p style="font-size:16px">
    The optimized pipeline took 5 minutes and 49.84 seconds longer to train than the original pipeline
    (from 1 minutes and 20.10 seconds to 7 minutes and 9.94 seconds).
    </p>
    

# Export the original model as a pickle file

In [42]:
#Note: compressing the files takes significantly longer than pickling them
# Start the timer
start_time = time.time()

# Save the model
with open('models/original_accuracy_model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
    
end_time = time.time()
original_accuracy_model_compression_time = end_time - start_time
minutes, seconds = divmod(original_accuracy_model_compression_time, 60)

# Print the size of the original model file
print("Size of original model file, compressed:", os.path.getsize('models/original_accuracy_model.pkl') / (1024 * 1024), "MB")
print("Pickling time: {} minutes and {} seconds".format(minutes, round(seconds)))

# Start the timer
start_time = time.time()

# # Save the optimized model
with open('models/gs_optimized_accuracy_model.pkl', 'wb') as f:
    pickle.dump(pipeline_v2, f)
    
end_time = time.time()
optimized_accuracy_model_compression_time = end_time - start_time
minutes, seconds = divmod(optimized_accuracy_model_compression_time, 60)

# Print the size of the optimized model file
print("Size of optimized model file, compressed:", os.path.getsize('models/gs_optimized_accuracy_model.pkl') / (1024 * 1024), "MB")
print("Pickling time: {} minutes and {} seconds".format(minutes, round(seconds)))


Size of original model file, compressed: 862.134449005127 MB
Pickling time: 0.0 minutes and 3 seconds
Size of optimized model file, compressed: 1423.9854917526245 MB
Pickling time: 0.0 minutes and 5 seconds


In [46]:
#Calculate the difference in model file size
original_model_size = os.path.getsize('models/original_accuracy_model.pkl')
optimized_model_size = os.path.getsize('models/gs_optimized_accuracy_model.pkl')
model_size_difference = optimized_model_size - original_model_size
model_size_difference = round(model_size_difference / (1024 * 1024), 2)
print("The optimized model is {} MB larger than the original model".format(model_size_difference))

The optimized model is 561.85 MB larger than the original model


In [21]:
#Test to make sure the model is working
# Load the original trained model
with open('models/original_accuracy_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Assuming X_test is your test features
y_pred = loaded_model.predict(X_test[:10])

# Now y_pred contains the predictions from the loaded model
print(y_pred)

[[1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0]]
