### Import the libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scripts.eda_functions import *

### Load the Data

In [3]:
# Category flag columns, parsed as booleans
boolean_columns = [
    'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
    'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
    'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport',
    'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
    'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report',
]

# Integer columns ('related' is kept as an integer because it takes the values 0, 1 and 2)
int_columns = ['id', 'related']

# Free-text columns
string_columns = ['message', 'original', 'genre']

# Map every column name to the dtype pandas should use while parsing the file
dtype_dict = {
    **{col: bool for col in boolean_columns},
    **{col: int for col in int_columns},
    **{col: str for col in string_columns},
}

# Read the staged disaster messages with explicit column dtypes.
# The path uses forward slashes so it resolves on Windows, Linux and macOS alike
# (a backslash-separated path only works on Windows).
df = pd.read_csv('data/02_stg/stg_disaster_messages.csv', dtype=dtype_dict)
df.head(n=2)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,False,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False


### Summarize the Consolidated and Deduplicated Disaster Message Data

In [4]:
# Build the per-column summary of the consolidated and deduplicated disaster messages.
# summarize_data is not defined in this notebook; presumably it comes from the
# scripts.eda_functions star import.
summary_df = summarize_data(df)

# Load the column definitions for the disaster messages data.
# The path uses forward slashes so it resolves on Windows, Linux and macOS alike
# (a backslash-separated path only works on Windows).
definitions = pd.read_csv('data/02_stg/stg_disaster_messages_definitions.csv')

# Left join on 'columns' so every summarized column is kept even when it has no definition
summary_df = summary_df.merge(definitions, how='left', on='columns')

#### Taking a Look at Columns where Type = Object

In [5]:
# Show only the summary rows whose 'type' is object.
# Note: Percentages are calculated from the row totals
summary_df.loc[summary_df['type'].eq('object')]

Unnamed: 0,columns,total_values,unique_values,unique_percentage,values_missing,data_completeness,type,definition
1,message,26216,26177,99.85,0,100.0,object,Content of the message
2,original,10170,9630,94.69,16046,38.79,object,Original message text (if available). Given th...
3,genre,26216,3,0.01,0,100.0,object,"Genre of the message. Values should be news, d..."


In the above table, we can see that only 38.79 percent of the rows have a value in the 'original' column. This seems low, so let's take a further look at this data.

In [6]:
# Number of rows (messages) in df
total_rows = df.shape[0]

# Per genre, count the distinct original messages and sort in descending order
genre_message_count = df.groupby('genre')['original'].nunique().sort_values(ascending=False)

# Convert the series to a dataframe with readable column names
genre_message_count_df = genre_message_count.reset_index()
genre_message_count_df.columns = ['genre', 'messages']

# Count distinct originals across the whole frame. Summing the per-genre counts
# would double count any original text that occurs under more than one genre.
unique_original_messages = df['original'].nunique()

print(f"There are {total_rows:,} rows in the dataframe")
print(f"There are {unique_original_messages:,} unique original messages in the dataframe")
print(f"This represent {unique_original_messages/total_rows:.2%} of the total messages in the dataframe")

genre_message_count_df

There are 26,216 rows in the dataframe
There are 9,630 unique original messages in the dataframe
This represent 36.73% of the total messages in the dataframe


Unnamed: 0,genre,messages
0,direct,9630
1,news,0
2,social,0


In the above table, we can see that all of the original messages have genre = 'direct'. This seems to imply that they are direct messages to the organization that's coordinating the disaster response. Let's take a look at the 'related' column and see how it applies to the original messages.

In [7]:
# Keep the rows that have an original text, one row per distinct original
orignal_messages = df.dropna(subset=['original']).drop_duplicates(subset='original')

# Number of original messages per 'related' value, largest group first
related_message_count = (
    orignal_messages
    .groupby('related')['original']
    .count()
    .sort_values(ascending=False)
)

# Turn the series into a two-column dataframe: 'related' and 'message'
related_message_count_df = related_message_count.rename('message').reset_index()

# Overall number of unique original messages
total_original_messages = related_message_count_df['message'].sum()
print("Total number of unique original_messages:",total_original_messages)

# Share of the unique original messages that falls in each 'related' value, in percent
related_message_count_df['percentage'] = (
    related_message_count_df['message'] / total_original_messages * 100
).round(2)

# Human-readable label for each 'related' code.
# NOTE(review): the saved output shows 'Not Classified' for code 2, while this mapping
# yields 'Ambiguous' - re-run the cell or confirm which label is intended.
related_rational = {0: 'Unrelated', 1: 'Related', 2: 'Ambiguous'}
related_message_count_df['related_rational'] = related_message_count_df['related'].map(related_rational)

# Place the label right after the code it describes
related_message_count_df = related_message_count_df[['related', 'related_rational', 'message', 'percentage']]

related_message_count_df

Total number of unique original_messages: 9630


Unnamed: 0,related,related_rational,message,percentage
0,1,Related,6276,65.17
1,0,Unrelated,3225,33.49
2,2,Not Classified,129,1.34


So we know that there are 9,630 unique original messages in the dataframe, representing 36.73% of the total messages (26,216). Of the unique original messages, only 6,276 (65.17%) are classified as related, which represents 23.94% of all messages in the data.

Let's see the genre breakdown of messages that do not have an original text

In [8]:
# For the rows where 'original' is NaN, count the messages per genre (largest first)
# and turn the result into a dataframe
genre_message_nan_count = (
    df.loc[df['original'].isna()]
    .groupby('genre')['message']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
genre_message_nan_count.columns = ['genre', 'messages']

# Total number of messages without an original text
total_messages = genre_message_nan_count['messages'].sum()

# Each genre's share of those messages, in percent
genre_message_nan_count['percent_of_messages'] = (
    genre_message_nan_count['messages'] / total_messages * 100
).round(2)

# Summarize the results
print(f"There are {total_messages:,} messages that are not considered 'original'")
print("Here's how they are distributed by genre:")
genre_message_nan_count

There are 16,046 messages that are not considered 'original'
Here's how they are distributed by genre:


Unnamed: 0,genre,messages,percent_of_messages
0,news,13054,81.35
1,social,2396,14.93
2,direct,596,3.71


### Taking a Look at the Columns where Type = Boolean

In [9]:
# Derive analysis_df from df without mutating anything in place:
#   - has_original_message: True when the row has an 'original' text
#   - drop the identifier and free-text columns
#   - boolean_sum: number of True values across the boolean category columns
analysis_df = (
    df.assign(has_original_message=df['original'].notna())
    .drop(columns=['id', 'message', 'original'])
    .assign(boolean_sum=lambda frame: frame[boolean_columns].sum(axis=1))
)

# Group by 'has_original_message', 'genre' and 'related'
grouped = analysis_df.groupby(['has_original_message', 'genre', 'related'])

# Per group, count the rows flagged with more than one, exactly one, or zero categories
category_totals = grouped['boolean_sum']
more_than_one_category = category_totals.agg(lambda counts: (counts > 1).sum())
one_category = category_totals.agg(lambda counts: (counts == 1).sum())
zero_categories = category_totals.agg(lambda counts: (counts == 0).sum())

# Combine the three series into one dataframe, one column per metric
result = pd.concat([more_than_one_category, one_category, zero_categories], axis=1)
result.columns = ['more_than_one_category', 'one_category', 'zero_categories']

# Move the grouping keys from the index back into regular columns
result_df = result.reset_index()

# Export the dataframe to a csv file.
# The path uses forward slashes so it resolves on Windows, Linux and macOS alike
# (a backslash-separated path only works on Windows).
# NOTE(review): dtype_dict describes the columns of df, most of which do not exist in
# result_df - confirm this is what export_if_changed expects for `dtypes`.
export_if_changed(result_df, 'data/04_fct/fct_message_categorization_summary.csv', dtypes=dtype_dict)

result_df

No changes in the data. The file data\04_fct\fct_message_categorization_summary.csv has not been updated.


Unnamed: 0,has_original_message,genre,related,more_than_one_category,one_category,zero_categories
0,False,direct,0,0,0,57
1,False,direct,1,421,7,111
2,False,news,0,0,0,2365
3,False,news,1,7674,218,2779
4,False,news,2,0,0,18
5,False,social,0,0,0,305
6,False,social,1,1601,53,399
7,False,social,2,0,0,38
8,True,direct,0,0,0,3395
9,True,direct,1,4646,165,1832


In [10]:
# Keep only the summary rows for direct messages that have an original text
is_original_direct = result_df['has_original_message'].eq(True) & result_df['genre'].eq('direct')
original_messages = result_df.loc[is_original_direct].copy().reset_index(drop=True)

# Grouping keys (dimensions) and count columns (measures)
dimensions = ['has_original_message', 'genre', 'related']
measures = ['more_than_one_category', 'one_category', 'zero_categories']

# Messages per row, and across all of the selected rows
original_messages['row_total'] = original_messages[measures].sum(axis=1)
total_original_messsages = original_messages['row_total'].sum()
print('Total messages:', total_original_messsages)

# Include row_total so it is converted to a percentage together with the other measures
measures = measures + ['row_total']

# Express every measure as a percentage of all selected messages, rounded to two decimals
original_messages[measures] = (
    original_messages[measures]
    .div(total_original_messsages)
    .mul(100)
    .round(2)
)
original_messages

Total messages: 10170


Unnamed: 0,has_original_message,genre,related,more_than_one_category,one_category,zero_categories,row_total
0,True,direct,0,0.0,0.0,33.38,33.38
1,True,direct,1,45.68,1.62,18.01,65.32
2,True,direct,2,0.0,0.0,1.3,1.3


- All original messages are _direct_ messages. Messages that are related (1) are likely to be classified into multiple categories.

### Taking a Look at Original Messages With More than One Associated Message

In [11]:
# For every original text, count the distinct 'message' values recorded against it,
# return the result as a dataframe and list the originals with the most messages first
original_counts = (
    df.groupby('original')['message']
    .nunique()
    .reset_index()
    .sort_values('message', ascending=False)
)
original_counts.head()

Unnamed: 0,original,message
5634,Nap fe ou konnen ke apati de jodi a sevis SMS ...,20
7437,Un front froid se retrouve sur Cuba ce matin. ...,19
196,4636 : Nasyonzini di ou men m retire kounye a ...,7
8193,hmre vilage kachipul men flood ne bht nuksan k...,6
1633,Enfomasyon sou tranbleman ta a,6


In [16]:
# Number of original messages that are linked to more than one distinct message
more_than_one_message = len(original_counts.loc[original_counts['message'].gt(1)])
print(f"There are {more_than_one_message:,} messages than have more then one translation")

387
There are 387 messages than have more then one translation


In [17]:
# Show the first few messages recorded against one specific original text
# (it appears to be the top entry of the table above; that display is truncated, so confirm)
df.loc[
    df['original'] == 'Nap fe ou konnen ke apati de jodi a sevis SMS 4636 pou enfomasyon ijan',
    ['original', 'message'],
].head()

Unnamed: 0,original,message
8989,Nap fe ou konnen ke apati de jodi a sevis SMS ...,"NOTES: This message is not complete, there's m..."
9078,Nap fe ou konnen ke apati de jodi a sevis SMS ...,"WE want to let you know from now on,the servic..."
9107,Nap fe ou konnen ke apati de jodi a sevis SMS ...,We say you now sms service 4636 for urgent inf...
9124,Nap fe ou konnen ke apati de jodi a sevis SMS ...,we want you know that now a day the service of...
9175,Nap fe ou konnen ke apati de jodi a sevis SMS ...,We will tell you that from today the 4636 SMS ...


It looks like the top offender is an automated message from the cellphone company that has been mistranslated