# Import Libraries

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import string
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# from data.process_data import *

# Define the Feature and Target Columns

In [2]:
# Feature columns: the column(s) selected as model input.
feature_columns = ["message"]

# Target columns: the label columns selected as model output, in this order.
target_columns = [
    "related",
    "request",
    "offer",
    "aid_related",
    "medical_help",
    "medical_products",
    "search_and_rescue",
    "security",
    "military",
    "child_alone",
    "water",
    "food",
    "shelter",
    "clothing",
    "money",
    "missing_people",
    "refugees",
    "death",
    "other_aid",
    "infrastructure_related",
    "transport",
    "buildings",
    "electricity",
    "tools",
    "hospitals",
    "shops",
    "aid_centers",
    "other_infrastructure",
    "weather_related",
    "floods",
    "storm",
    "fire",
    "earthquake",
    "cold",
    "other_weather",
    "direct_report",
]

In [4]:
def load_data_from_db(db_filepath, table_name, feature_columns, target_columns):
    """
    Loads a table from a database and splits it into feature and target arrays.

    :param db_filepath: Database URL passed to sqlalchemy's create_engine.
    :param table_name: Name of the table to read.
    :param feature_columns: List of column names to return as features.
    :param target_columns: List of column names to return as targets.
    :return: Tuple (x, y) of numpy arrays, or (None, None) when the table cannot
             be read, is empty, or does not contain the requested columns.
    """
    engine = None
    try:
        # Load data from database
        engine = create_engine(db_filepath)

        # Create a dataframe from the engine
        df = pd.read_sql_table(table_name, engine)

    except (SQLAlchemyError, ValueError) as e:
        # pandas reports a missing table as ValueError, which is not a
        # SQLAlchemyError, so both must be caught to honour the
        # (None, None) failure contract.
        print(f"Error loading data from database: {e}")
        return None, None

    finally:
        # Release pooled connections whether or not the read succeeded
        if engine is not None:
            engine.dispose()

    if df.empty:
        print("The table is empty or does not exist.")
        return None, None

    try:
        # Define the features and target variables X and y.
        # Selecting with a list of columns already yields a new DataFrame.
        x = df[feature_columns].values
        y = df[target_columns].values

    except (KeyError, AttributeError) as e:
        print(f"Error extracting features/targets: {e}")
        return None, None

    return x, y

# Regular expression for URL detection
URL_REGEX = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
URL_PLACE_HOLDER = "urlplaceholder"

def detect_and_replace_urls(text_array):
    """
    Detects URLs in each element of the text array and replaces them with a placeholder.

    Empty arrays are supported and returned as an empty string array of the same shape.

    :param text_array: numpy array of strings containing the original text.
    :return: numpy string array, same shape as the input, with URLs replaced by placeholder.
    :raises ValueError: if text_array is not a numpy array.
    """
    if not isinstance(text_array, np.ndarray):
        raise ValueError("Input must be a numpy array")

    # Compile once per call instead of looking the pattern up for every element
    url_pattern = re.compile(URL_REGEX)

    # A plain comprehension is used instead of np.vectorize, which raises on
    # size-0 inputs when no output type is declared.
    replaced = [url_pattern.sub(URL_PLACE_HOLDER, text) for text in text_array.ravel()]

    return np.array(replaced, dtype=str).reshape(text_array.shape)
    
def remove_punctuation(text_array):
    """
    Removes punctuation from the input numpy array of strings.

    Every character in string.punctuation is deleted from each element.
    Empty arrays are supported and returned as an empty string array.

    :param text_array: Numpy array containing the original texts.
    :return: Numpy string array, same shape as the input, with punctuation
             removed from each text.
    """
    # Build the translation table once rather than once per element
    punctuation_table = str.maketrans("", "", string.punctuation)

    # Object dtype keeps each element as-is (no implicit string conversion)
    texts = np.asarray(text_array, dtype=object)

    # A plain comprehension is used instead of np.vectorize, which raises on
    # size-0 inputs when no output type is declared.
    stripped = [text.translate(punctuation_table) for text in texts.ravel()]

    return np.array(stripped, dtype=str).reshape(texts.shape)

def tokenize_text(text):
    """
    Splits text into word tokens using word_tokenize.

    :param text: A single str/bytes value, or a numpy array of such values.
    :return: For a numpy array, a list holding one token list per flattened
             element; otherwise the token list for the single value.
    """
    def _tokens_of(item):
        # bytes are decoded as UTF-8 before tokenizing
        if isinstance(item, bytes):
            item = item.decode('utf-8')
        return word_tokenize(item)

    if not isinstance(text, np.ndarray):
        return _tokens_of(text)

    return [_tokens_of(item) for item in text.flatten()]

def remove_stop_words(words_array):
    """
    Removes English stopwords from each list of words.

    Matching is case-insensitive, so capitalized stopwords such as "The" are
    removed too. The casing of the words that are kept is left unchanged.

    :param words_array: Iterable of word lists (one list of tokens per text).
    :return: List of word lists with stopwords removed.
    """
    stopwords_set = set(stopwords.words("english"))

    # The NLTK English stopword list is lowercase, so compare the lowercased
    # word; tokens reach this step before they have been lowercased.
    return [[word for word in words if word.lower() not in stopwords_set] for words in words_array]

def clean_tokens_generator(tokens):
    """
    Normalizes tokens lazily: each token is lowercased, stripped of surrounding
    whitespace and lemmatized with WordNetLemmatizer.

    :param tokens: Iterable of tokens to clean.
    :yield: Cleaned token one at a time.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    yield from (lemmatize(raw_token.lower().strip()) for raw_token in tokens)
        
def tokenize(text_array):
    """
    Runs the full text-cleaning pipeline on an array of texts.

    Steps, in order: replace URLs with a placeholder, remove punctuation,
    tokenize, remove stopwords, then clean and normalize each token.

    :param text_array: numpy array of strings containing the original text.
    :return: List with one list of cleaned tokens per text.
    """
    # URL replacement first, then punctuation removal
    without_punctuation = remove_punctuation(detect_and_replace_urls(text_array))

    # Tokenize, then drop stopwords
    filtered_tokens = remove_stop_words(tokenize_text(without_punctuation))

    # Materialize the cleaned tokens for each text
    return [list(clean_tokens_generator(token_list)) for token_list in filtered_tokens]

# Load the data
x, y = load_data_from_db(
    'sqlite:///data/02_stg//stg_disaster_response.db',
    'stg_disaster_response',
    feature_columns,
    target_columns,
)

# load_data_from_db returns (None, None) when loading fails, and tokenize
# rejects anything that is not a numpy array, so only tokenize real data.
if x is None:
    print("No data loaded; skipping tokenization.")
else:
    # Call the functions
    results = tokenize(x)

    print(results)



# To Do
- Make everything lowercase
- Remove punctuation
- Remove stop words