### Import the libraries

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scripts.eda_functions import *

### Load the Data

In [52]:
# Category flag columns; each one is parsed as a boolean
boolean_columns = ['request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
                   'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 
                   'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 
                   'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
                   'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']

# Columns parsed as integers
int_columns = ['id', 'related']

# Columns parsed as text
string_columns = ['message', 'original', 'genre']

# Map every column name to the dtype pandas should use while parsing the CSV
dtype_dict = {
    **{col: bool for col in boolean_columns},
    **{col: int for col in int_columns},
    **{col: str for col in string_columns},
}

# Read the CSV into a dataframe with the explicit dtypes above.
# The path uses forward slashes so it resolves on Windows as well as on Linux/macOS;
# a backslash-separated path is only understood on Windows.
df = pd.read_csv('data/02_stg/stg_disaster_messages.csv', dtype=dtype_dict)

### Summarize the Consolidated and Deduplicated Disaster Message Data

In [53]:
# Summarize the consolidated and deduplicated disaster messages data.
# summarize_data is not defined in this notebook; it presumably comes from the
# star import of scripts.eda_functions -- TODO confirm.
# The result is filtered on a 'type' column in the next cell, whose rendered output
# shows unique/missing counts and percentages per dataframe column.
summary_df = summarize_data(df)

#### Taking a Look at Columns where Type = Object

In [54]:
# Show only the summary rows whose 'type' is 'object'
is_object_column = summary_df['type'].eq('object')
summary_df.loc[is_object_column]

Unnamed: 0,columns,unique_values,unique_percentage,missing_values,missing_percentage,type
1,message,26177,99.85,0,0.0,object
2,original,9630,94.69,16046,61.21,object
3,genre,3,0.01,0,0.0,object


For me, the above table is interesting because 61.21% of the messages are missing an 'original' version, so only 38.79% of them have one. I want to understand why this percentage is so low. To do this, I want to look at the original messages grouped by genre, since all of the messages have a genre.

In [55]:
# Total number of rows in the dataframe
total_rows = len(df)
print(f'There are {total_rows} rows in the dataframe')

# Per genre, count the rows with a non-null 'original' (count() skips NaN); largest first
genre_message_count = df.groupby('genre')['original'].count().sort_values(ascending=False)

# Same counts as a two-column dataframe for display
genre_message_count_df = genre_message_count.reset_index()
genre_message_count_df.columns = ['genre', 'message_count']

# Total number of non-null original messages across every genre
original_genre_count = genre_message_count_df['message_count'].sum()

# Derive the genre named in the summary from the data instead of hard-coding it,
# so the printed statement cannot silently become false if the data changes
genres_with_originals = genre_message_count_df.loc[
    genre_message_count_df['message_count'] > 0, 'genre'
].tolist()
if len(genres_with_originals) == 1:
    print(f"All {original_genre_count} of the original messages are classified as '{genres_with_originals[0]}'")
else:
    print(f"The {original_genre_count} original messages are spread across these genres: {genres_with_originals}")
genre_message_count_df

There are 26216 rows in the dataframe
All 10170 of the original messages are classified as 'direct'


Unnamed: 0,genre,message_count
0,direct,10170
1,news,0
2,social,0


In [56]:
# Size of the full dataframe
total_rows = len(df)
print(f'Total rows: {total_rows}')

# Number of rows whose 'original' entry is non-null
original_genre_count = df['original'].count()
print(f'Original messages with genre: {original_genre_count}')

# That count as a percentage of all rows, rounded to two decimals
original_genre_percentage = round(original_genre_count / total_rows * 100, 2)
print(f'Percentage of original messages with genre: {original_genre_percentage}%')

# Non-null 'original' count for each genre, largest first
genre_message_count = (
    df.groupby('genre')['original']
    .count()
    .sort_values(ascending=False)
)

# The same counts as a dataframe with explicit column names
genre_message_count_df = genre_message_count.reset_index(name='message_count')
genre_message_count_df


Total rows: 26216
Original messages with genre: 10170
Percentage of original messages with genre: 38.79%


Unnamed: 0,genre,message_count
0,direct,10170
1,news,0
2,social,0


In [57]:
# Rows of df whose 'original' value is NaN
missing_original_rows = df.loc[df['original'].isna()]

# Per genre, count the messages among those rows (largest first) and
# return the counts as a dataframe with explicit column names
genre_message_nan_count = (
    missing_original_rows
    .groupby('genre')['message']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='original_nan_count')
)

# Total number of messages without an 'original' value
total_original_nan_count = genre_message_nan_count['original_nan_count'].sum()
print("There are", total_original_nan_count, "messages with NaN 'original' values.")
# Rows with an 'original' plus rows without one, for comparison with the total row count
print(original_genre_count + total_original_nan_count)
genre_message_nan_count

There are 16046 messages with NaN 'original' values.
26216


Unnamed: 0,genre,original_nan_count
0,news,13054
1,social,2396
2,direct,596


It looks like the messages in the dataset can be divided into two categories:
- Messages that are relevant to the Hurricane Sandy damage in Haiti
- Messages that are not relevant to the Hurricane Sandy damage in Haiti

It also looks like all of the original messages are direct messages, and they can be in either Haitian Creole or English.

### Taking a Look at the Columns where Type = Boolean

In [75]:
# Derive analysis_df from df without mutating df or modifying any frame in place
analysis_df = (
    df
    # True when the row HAS an 'original' message (i.e. 'original' is not null)
    .assign(has_original_message=df['original'].notna())
    # Drop the following columns: 'id', 'message', 'original'
    .drop(columns=['id', 'message', 'original'])
    # Number of True values across the boolean category columns of each row
    .assign(boolean_sum=lambda frame: frame[boolean_columns].sum(axis=1))
)

# Group by 'has_original_message', 'genre' and 'related'
grouped = analysis_df.groupby(['has_original_message', 'genre', 'related'])

# Per group, count the rows flagged with more than one, exactly one, and zero categories
more_than_one_category = grouped['boolean_sum'].apply(lambda sums: (sums > 1).sum())
one_category = grouped['boolean_sum'].apply(lambda sums: (sums == 1).sum())
zero_categories = grouped['boolean_sum'].apply(lambda sums: (sums == 0).sum())

# Combine the three Series side by side; keys become the column names
result = pd.concat(
    [more_than_one_category, one_category, zero_categories],
    axis=1,
    keys=['more_than_one_category', 'one_category', 'zero_categories'],
)

# Move the group keys out of the index into ordinary columns
result = result.reset_index()
result

Unnamed: 0,has_original_message,genre,related,more_than_one_category,one_category,zero_categories
0,False,direct,0,0,0,57
1,False,direct,1,421,7,111
2,False,news,0,0,0,2365
3,False,news,1,7674,218,2779
4,False,news,2,0,0,18
5,False,social,0,0,0,305
6,False,social,1,1601,53,399
7,False,social,2,0,0,38
8,True,direct,0,0,0,3395
9,True,direct,1,4646,165,1832
