# Import Libraries

In [57]:
#Import libraries
import pandas as pd
import numpy as np
import string
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import pandas as pd

import json

# Load the Data

In [47]:
FEATURE_COLUMNS =['message']
TARGET_COLUMNS = [
    'related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 
    'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 
    'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 
    'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 
    'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
    'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 
    'other_weather', 'direct_report']

STOPWORDS_SET = set(stopwords.words('english'))
URL_REGEX = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
URL_PLACE_HOLDER = "urlplaceholder"

def load_data(db_filepath, table_name, feature_columns, target_columns):
    engine = create_engine(db_filepath)
    # Create a dataframe from the engine
    df = pd.read_sql_table(table_name, engine)
    X = df.message.values
    y = df[target_columns].values
    return X, y

#Load the data
X, y = load_data(
    'sqlite:///data/02_stg//stg_disaster_response.db',
    'stg_disaster_response',
    FEATURE_COLUMNS, 
    TARGET_COLUMNS
)

# Write a Function to Tokenize the Text

In [48]:
def tokenize(text):
    """
    This function is designed to tokenize the message data
    """
    # Detect and replace URLs
    text = re.sub(URL_REGEX, URL_PLACE_HOLDER, text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token not in STOPWORDS_SET]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = [lemmatizer.lemmatize(token.lower().strip()) for token in tokens]
    return cleaned_tokens


# Build a Machine Learning Pipeline

In [36]:
# Instantiate and configure the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)), # Tokenize and vectorize text
    ('tfidf', TfidfTransformer(smooth_idf=False)), # Apply TF-IDF transformation
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=6))) # Use MultiOutputClassifier with RandomForest, n_jobs specifies cores
])

# Train the Pipeline

In [37]:
#Note this code takes about a minute and 30 seconds to run with 6 cores
# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)



# Test the Model and Export the Test Results

In [38]:
# Assuming y_test and y_pred are your test labels and predicted labels respectively
y_pred = pipeline.predict(X_test)

# Create an empty list to store the results
results = []

for i, col in enumerate(TARGET_COLUMNS):
    report = classification_report(y_test[:, i], y_pred[:, i], output_dict=True, zero_division=0)
    for output_class, metrics in report.items():
        if isinstance(metrics, dict):  # Ensure metrics is a dictionary
            temp = metrics.copy()  # Create a copy of metrics to avoid modifying the original dictionary
            temp['output_class'] = output_class
            temp['category'] = col
            results.append(temp)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

#Re-arrange the columns so that category and output_class are the first two columns
results_df = results_df[['category', 'output_class', 'precision', 'recall', 'f1-score', 'support']]
results_df.to_csv('data\\04_fct\\fct_prediction_results.csv', index=False)
results_df.head()

Unnamed: 0,category,output_class,precision,recall,f1-score,support
0,related,0,0.682334,0.409932,0.512165,2054.0
1,related,1,0.833152,0.936945,0.882005,6534.0
2,related,2,0.371429,0.40625,0.38806,64.0
3,related,macro avg,0.628971,0.584376,0.594077,8652.0
4,related,weighted avg,0.793932,0.807906,0.790551,8652.0


### Results
- High performance on class 1 with precision, recall, and F1-score all above 0.8.
- Moderate performance on class 0 with metrics around 0.5 to 0.7.
- Lower performance on class 2 with all metrics around 0.4, indicating difficulty in accurately predicting this class.

Overall, the model has a weighted average precision of 0.799, recall of 0.812, and F1-score of 0.794, reflecting good performance, particularly skewed towards the well-represented classes. Class 2, being the least represented, shows areas where the model could improve.

# Improve the Model

In [50]:
from models.train_classifier import tokenize

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer(smooth_idf=False)),
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=6)))
])

#Set up the grid search parameters
parameters = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'clf__estimator__n_estimators': [50, 100, 200],
    'clf__estimator__min_samples_split': [2, 3, 4]
}

cv = GridSearchCV(pipeline, param_grid=parameters, n_jobs=9)

#Note: this takes 164 minutes to run with 6 cores
# cv.fit(X_train, y_train)



In [56]:
best_params = cv.best_params_  # Retrieve the best parameters found by the grid search
print(best_params)



# Store best parameters in a JSON file
with open('data\\04_fct\\fct_best_params.json', 'w') as f:
    json.dump(best_params, f)

{'clf__estimator__min_samples_split': 2, 'clf__estimator__n_estimators': 200, 'vect__ngram_range': (1, 2)}


In [None]:
y_pred_opt = cv.predict(X_test)  # Use the best model found by the grid search to make predictions on the test data

In [None]:
report_orig = classification_report(y_test, y_pred, output_dict=True)

report_opt = classification_report(y_test, y_pred_opt, output_dict=True)

In [None]:
# Convert the original report to a DataFrame
df_orig = pd.DataFrame(report_orig).transpose()

# Convert the optimized report to a DataFrame
df_opt = pd.DataFrame(report_opt).transpose()

# Add a column to each DataFrame to indicate which report it is
df_orig['report'] = 'original'
df_opt['report'] = 'optimized'

# Concatenate the two DataFrames
comparison = pd.concat([df_orig, df_opt])

# Print the comparison
print(comparison)