### Import the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scripts.eda_functions import *

### Load the Data

In [9]:
# Columns that hold a True/False flag per message (one per message category)
boolean_columns = ['request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
                   'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
                   'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport',
                   'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
                   'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']

# Columns parsed as integers
int_columns = ['id', 'related']

# Columns parsed as text
string_columns = ['message', 'original', 'genre']

# Map every expected column name to the dtype pandas should parse it as.
# Key order (boolean, int, string) matches the order the lists are declared in.
dtype_dict = {
    **dict.fromkeys(boolean_columns, bool),
    **dict.fromkeys(int_columns, int),
    **dict.fromkeys(string_columns, str),
}

# Create a pandas dataframe from the csv file, enforcing the dtypes above.
# The path uses forward slashes so it resolves on Windows as well as on Linux/macOS;
# a backslash-separated literal only works on Windows.
df = pd.read_csv('data/02_stg/stg_disaster_messages.csv', dtype=dtype_dict)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,False,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,True,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Summarize the Consolidated and Deduplicated Disaster Message Data

In [14]:
# Summarize the consolidated and deduplicated disaster messages data
# (summarize_data comes from scripts.eda_functions)
summary_df = summarize_data(df)

# Import definitions for the disaster messages data.
# Forward slashes keep the path valid on Windows, Linux and macOS alike;
# a backslash-separated literal only resolves on Windows.
definitions = pd.read_csv('data/02_stg/stg_disaster_messages_definitions.csv')

# Left join on 'columns' so every row of the summary is kept,
# even when the definitions file has no entry for that column
summary_df = summary_df.merge(definitions, how='left', on='columns')

Unnamed: 0,columns,unique_values,unique_percentage,missing_values,missing_percentage,type,definition
0,id,26180,99.86,0,0.0,int32,Unique identifier for each message


#### Taking a Look at Columns where Type = Object

In [13]:
# Show only the summary rows describing columns whose type is object
is_object_column = summary_df['type'].eq('object')
summary_df.loc[is_object_column]

Unnamed: 0,columns,unique_values,unique_percentage,missing_values,missing_percentage,type,definition
1,message,26177,99.85,0,0.0,object,Content of the message
2,original,9630,94.69,16046,61.21,object,Original message text (if available). Given th...
3,genre,3,0.01,0,0.0,object,"Genre of the message. Values should be news, d..."


For me, the above table is interesting because 61.21% of the messages are missing an 'original' version, so only 38.79% of the messages have one. I want to understand why this percentage is so low. To do this, I want to look at the original messages grouped by genre, since all of the messages have a genre.

In [5]:
# Calculate the total number of rows in the dataframe
total_rows = df.shape[0]
print(f'There are {total_rows} rows in the dataframe')

# Group by genre and count the non-null 'original' messages, largest count first
genre_message_count = df.groupby('genre')['original'].count().sort_values(ascending=False)

# Convert the series to a dataframe with columns ['genre', 'message_count']
genre_message_count_df = genre_message_count.rename('message_count').reset_index()

# Total number of messages that have an 'original' version
original_genre_count = genre_message_count_df['message_count'].sum()

# Only state that every original message is 'direct' when the data confirms it,
# instead of printing the conclusion unconditionally.
has_original = genre_message_count_df['message_count'] > 0
genres_with_original = genre_message_count_df.loc[has_original, 'genre'].tolist()
if genres_with_original == ['direct']:
    print(f"All {original_genre_count} of the original messages are classified as 'direct'")
else:
    print(f"The {original_genre_count} original messages are spread across these genres: {genres_with_original}")

genre_message_count_df

There are 26216 rows in the dataframe
All 10170 of the original messages are classified as 'direct'


Unnamed: 0,genre,message_count
0,direct,10170
1,news,0
2,social,0


In [6]:
# Size of the dataset
total_rows = len(df)
print(f'Total rows: {total_rows}')

# Number of rows whose 'original' text is present (every row has a genre)
original_genre_count = df['original'].count()
print(f'Original messages with genre: {original_genre_count}')

# Share of rows with an 'original' text, as a percentage rounded to two decimals
original_genre_percentage = round(original_genre_count / total_rows * 100, 2)
print(f'Percentage of original messages with genre: {original_genre_percentage}%')

# Non-null 'original' count per genre, largest first
genre_message_count = (
    df.groupby('genre')['original']
    .count()
    .sort_values(ascending=False)
)

# Same counts as a two-column dataframe: 'genre', 'message_count'
genre_message_count_df = genre_message_count.rename('message_count').reset_index()
genre_message_count_df


Total rows: 26216
Original messages with genre: 10170
Percentage of original messages with genre: 38.79%


Unnamed: 0,genre,message_count
0,direct,10170
1,news,0
2,social,0


In [7]:
# Per genre, count the messages whose 'original' is NaN (largest count first),
# returned as a dataframe with columns 'genre' and 'original_nan_count'
genre_message_nan_count = (
    df.loc[df['original'].isna()]
    .groupby('genre')['message']
    .count()
    .sort_values(ascending=False)
    .rename('original_nan_count')
    .reset_index()
)

# Overall number of messages without an 'original' version
total_original_nan_count = genre_message_nan_count['original_nan_count'].sum()
print("There are", total_original_nan_count, "messages with NaN 'original' values.")

# Cross-check: messages with an original plus messages without one
# (original_genre_count comes from the previous cell)
print(original_genre_count + total_original_nan_count)
genre_message_nan_count

There are 16046 messages with NaN 'original' values.
26216


Unnamed: 0,genre,original_nan_count
0,news,13054
1,social,2396
2,direct,596


It looks like the messages in the dataset can be divided up into two categories:
- Messages that are related to Hurricane Sandy damage in Haiti
- Messages that are related to Hurricane Sandy damage outside of Haiti

It looks like all of the original messages are direct messages, and can be in either Haitian Creole or English.

### Taking a Look at the Columns where Type = Boolean

In [8]:
# Build analysis_df from df without mutating df or any intermediate frame:
#  - has_original_message is True when 'original' is present (non-null)
#  - 'id', 'message' and 'original' are dropped
#  - boolean_sum counts the True values across the boolean category columns
analysis_df = (
    df.assign(has_original_message=df['original'].notna())
    .drop(columns=['id', 'message', 'original'])
    .assign(boolean_sum=lambda frame: frame[boolean_columns].sum(axis=1))
)

# Group by 'has_original_message', 'genre' and 'related'
group_keys = ['has_original_message', 'genre', 'related']
grouped = analysis_df.groupby(group_keys)

# Calculate the metrics: per group, the number of messages flagged with more than
# one, exactly one, and zero categories. Summing a boolean mask per group counts
# its True values without calling a Python lambda for every group.
key_series = [analysis_df[key] for key in group_keys]
category_total = analysis_df['boolean_sum']
more_than_one_category = category_total.gt(1).groupby(key_series).sum()
one_category = category_total.eq(1).groupby(key_series).sum()
zero_categories = category_total.eq(0).groupby(key_series).sum()

# Combine the Series into a DataFrame
result = pd.concat([more_than_one_category, one_category, zero_categories], axis=1)
result.columns = ['more_than_one_category', 'one_category', 'zero_categories']

# Turn the group keys from the index back into regular columns
result_df = result.reset_index()

# Export the dataframe to a csv file (forward slashes keep the path portable across platforms).
# NOTE(review): dtype_dict describes the columns of the staging CSV; most of its keys are
# not columns of result_df - confirm this is what export_if_changed expects for `dtypes`.
export_if_changed(result_df, 'data/04_fct/fct_message_categorization_summary.csv', dtypes=dtype_dict)

result_df

No changes in the data. The file data\04_fct\fct_message_categorization_summary.csv has not been updated.


Unnamed: 0,has_original_message,genre,related,more_than_one_category,one_category,zero_categories
0,False,direct,0,0,0,57
1,False,direct,1,421,7,111
2,False,news,0,0,0,2365
3,False,news,1,7674,218,2779
4,False,news,2,0,0,18
5,False,social,0,0,0,305
6,False,social,1,1601,53,399
7,False,social,2,0,0,38
8,True,direct,0,0,0,3395
9,True,direct,1,4646,165,1832


##### Analysis
1. **Presence of the Original Message**:
   - Messages **without** the original message tend to be categorized into multiple categories more frequently than those with the original message.
   - A substantial number of messages, regardless of the presence of the original message, do not fall into any category, indicating a potential area for improving categorization effectiveness.

2. **Genre**:
   - The **news** genre has the highest tendency for messages to be categorized into multiple categories, followed by direct and then social genres.
   - The **social** genre has the lowest count of messages that do not fall into any category, suggesting that social messages are somewhat more likely to be categorized, although often into multiple categories.

3. **Related Status**:
   - Messages that are **related** (`1`) have the highest counts across both single-category and multiple-category classifications, indicating a clear categorization pattern for messages deemed related to a certain topic or event.
   - Messages classified as not related (`0`) are never assigned to any category; all of them fall under `zero_categories`.
   - Messages classified as (`2`) are likewise never assigned to any category; all of them fall under `zero_categories`.
4. **Other**:
   - The **news** genre and messages deemed **related** are more frequently associated with multiple categories.