# Import Libraries

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import string
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import pandas as pd
import json

# Load the Data

In [None]:
FEATURE_COLUMNS =['message']
TARGET_COLUMNS = [
    'related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 
    'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 
    'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 
    'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 
    'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
    'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 
    'other_weather', 'direct_report']

STOPWORDS_SET = set(stopwords.words('english'))
URL_REGEX = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
URL_PLACE_HOLDER = "urlplaceholder"

def load_data(db_filepath, table_name, feature_columns, target_columns):
    engine = create_engine(db_filepath)
    # Create a dataframe from the engine
    df = pd.read_sql_table(table_name, engine)
    X = df.message.values
    y = df[target_columns].values
    return X, y

#Load the data
X, y = load_data(
    'sqlite:///data/02_stg//stg_disaster_response.db',
    'stg_disaster_response',
    FEATURE_COLUMNS, 
    TARGET_COLUMNS
)

_Note_: This is a multi-label classification problem because a message can belong to 0, 1 or multiple categories

# Check the Data

In [85]:
# I need to check for a few things here:
# - Check if X contains empty strings
# - Check if y contains missing or infinite values
# - Check if y is a 2D numpy array
# - Check if x is a 1D numpy array
# If all of these checks passs, then the rest of the code should run


# Write a Function to Tokenize the Text

In [None]:
def tokenize(text):
    """
    This function is designed to tokenize the message data
    """
    # Detect and replace URLs
    text = re.sub(URL_REGEX, URL_PLACE_HOLDER, text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token not in STOPWORDS_SET]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = [lemmatizer.lemmatize(token.lower().strip()) for token in tokens]
    return cleaned_tokens


# Build a Machine Learning Pipeline

In [None]:
# Instantiate and configure the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)), # Tokenize and vectorize text
    ('tfidf', TfidfTransformer(smooth_idf=False)), # Apply TF-IDF transformation
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=9))) # Use MultiOutputClassifier with RandomForest, n_jobs specifies cores
])

# Train the Pipeline

In [None]:
# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Test the Model and Export the Test Results

In [None]:
# Assuming y_test and y_pred are your test labels and predicted labels respectively
y_pred = pipeline.predict(X_test)

# Create an empty list to store the results
results = []

for i, col in enumerate(TARGET_COLUMNS):
    report = classification_report(y_test[:, i], y_pred[:, i], output_dict=True, zero_division=0)
    for output_class, metrics in report.items():
        if isinstance(metrics, dict):  # Ensure metrics is a dictionary
            temp = metrics.copy()  # Create a copy of metrics to avoid modifying the original dictionary
            temp['output_class'] = output_class
            temp['category'] = col
            results.append(temp)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

#Re-arrange the columns so that category and output_class are the first two columns
results_df = results_df[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]
results_df.to_csv('data\\04_fct\\fct_prediction_results.csv', index=False)
results_df.head()

# Improve the Model

## Use GridSearch to find the best parameters based on the Accuracy Score (the default for GridSearch)

In [None]:
from models.train_classifier import tokenize #This needs to be here otherwise there will be a memory error

pipeline = Pipeline([
    #The default ngram_range is (1, 1)
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer(smooth_idf=False)),
    # This defaults to n_estimators=100 and min_samples_split=2
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=9)))
])

#Set up the grid search parameters
parameters = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'clf__estimator__n_estimators': [50, 100, 200],
    'clf__estimator__min_samples_split': [2, 3, 4]
}

cv = GridSearchCV(pipeline, param_grid=parameters, scoring='accuracy', n_jobs=9)

# Fit the GridSearchCV object to the full training data
cv.fit(X_train, y_train)

# Get the results of the grid search
cv_results = cv.cv_results_

# Prepare a list to store the results
results = []

# For each set of parameters, store the parameters and the associated mean test score
for params, mean_score in zip(cv_results['params'], cv_results['mean_test_score']):
    results.append({'params': params, 'score': mean_score})

for result in results:
    print(result)
    
# Write the results to a .json file
with open('data\\04_fct\\fct_accurancy_parameters.json', 'w') as f:
    json.dump(results, f)

## Accuracy Scoring Results
Here are the best parameters if we score by accuracy:
 - clf__estimator__min_samples_split: 2
 - clf__estimator__n_estimators: 200 
 - vect__ngram_range: [1, 2]

## Generate a Classification Report Based Upon the Best Parameters for Accuracy Scoring

In [None]:
#will need to input the best parameters into the pipeline
pipeline_v2 = Pipeline([
    #The improved model considers 1-grams and 2-grams instead of just 1-grams
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(smooth_idf=False)),
    # It also increases the n_estimators from 100 to 200. The min_samples_split remains the same
    ('clf', MultiOutputClassifier(RandomForestClassifier(min_samples_split=2, n_estimators=200, n_jobs=9)))
])

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the pipeline to the training data - This takes about 9 minutes to run with 9 cores
pipeline_v2.fit(X_train, y_train)

# Assuming y_test and y_pred are your test labels and predicted labels respectively
y_pred_opt = pipeline_v2.predict(X_test)

# Create an empty list to store the results
results = []

for i, col in enumerate(TARGET_COLUMNS):
    report = classification_report(y_test[:, i], y_pred_opt[:, i], output_dict=True, zero_division=0)
    for output_class, metrics in report.items():
        if isinstance(metrics, dict):  # Ensure metrics is a dictionary
            temp = metrics.copy()  # Create a copy of metrics to avoid modifying the original dictionary
            temp['output_class'] = output_class
            temp['category'] = col
            results.append(temp)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

#Re-arrange the columns so that category and output_class are the first two columns
results_df = results_df[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]
results_df.to_csv('data\\04_fct\\fct_prediction_results_optimized.csv', index=False)
results_df.head()

## Compare the Original Accuracy Parameters to the New Accuracy Parameters

### Compare Scores per Category

In [90]:
#Import fct_prediction_results
df_results = pd.read_csv('data\\04_fct\\fct_prediction_results.csv')

#Import fct_prediction_results_optimized
df_results_optimized = pd.read_csv('data\\04_fct\\fct_prediction_results_optimized.csv')

# Set 'category' and 'output_class' as index
df_results.set_index(['category', 'output_class', 'support'], inplace=True)
df_results_optimized.set_index(['category', 'output_class'], inplace=True)

# Calculate percent change for specific columns
columns = ['precision', 'recall', 'f1-score']
epsilon = 1e-7
percent_change = round((df_results_optimized[columns] - df_results[columns]) / (df_results[columns] + epsilon) * 100, 2)

# Reset index
percent_change.reset_index(inplace=True)

#Re-arrange the columns so that support is the last column
percent_change = percent_change[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]

#Export the DataFrame to a .csv file
percent_change.to_csv('data\\04_fct\\fct_percent_change_results.csv', index=False)

#Inspect the first few rows of the DataFrame
percent_change.head()

Unnamed: 0,category,output_class,precision,recall,f1-score,support
0,aid_centers,0,-0.0,-0.01,-0.01,8537.0
1,aid_centers,1,0.0,0.0,0.0,115.0
2,aid_centers,macro avg,-0.0,-0.01,-0.01,8652.0
3,aid_centers,weighted avg,-0.0,-0.01,-0.01,8652.0
4,aid_related,0,-4.77,7.08,0.67,5107.0


### Compare Median Scores per Output Class

In [103]:
# Calculate the median percent change for precision, recall, and f1-score by output_class
avg_changes_by_output_class = percent_change.groupby('output_class')[['precision', 'recall', 'f1-score']].median().round(2)

#Convert avg_changes_by_output_class to a DataFrame
avg_changes_by_output_class = avg_changes_by_output_class.reset_index()

#Export the DataFrame to a .csv file
avg_changes_by_output_class.to_csv('data\\04_fct\\fct_avg_changes_by_output_class.csv', index=False)

avg_changes_by_output_class

Unnamed: 0,output_class,precision,recall,f1-score
0,0,-0.02,0.01,-0.0
1,1,2.48,-8.33,-5.92
2,2,60.37,12.5,32.77
3,macro avg,0.74,-0.83,-1.28
4,weighted avg,0.0,-0.01,-0.04


The data shows the new ML model significantly boosts precision in class 2, but at the cost of reduced recall in class 1. This trade-off decreases the F1-score for class 1, indicating a shift towards specialization in the model's performance across classes. 