In [2]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import re

import plotly
from plotly.graph_objs import Bar
from plotly import offline

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
engine = create_engine('sqlite:///static/db/DisasterRes.db')
df = pd.read_sql_table('DisasterResponse', engine)
print(f"df.shape: {df.shape}")

df.shape: (26215, 40)


#### Genre Bar Plot

In [4]:
df['genre'].value_counts()

news      13054
direct    10766
social     2395
Name: genre, dtype: int64

In [5]:
x = df['genre'].value_counts().index.tolist()
x = [x.title() for x in x]

In [6]:
y = df['genre'].value_counts().values.tolist()

In [7]:
data = [{
    
    'type': 'bar',
    'x': x,
    'y': y
    
}]

In [8]:
my_layout = {
    'title': '',
    'xaxis': {'title': 'Genre'},
    'yaxis': {'title': 'Numbers'},
    
}

fig = {'data': data, 'layout': my_layout}

In [9]:
offline.plot(fig, filename='genre.html')

'genre.html'

#### Distribution of categories

In [10]:
df.iloc[:, -36:].columns.to_list()

['related',
 'request',
 'offer',
 'aid_related',
 'medical_help',
 'medical_products',
 'search_and_rescue',
 'security',
 'military',
 'child_alone',
 'water',
 'food',
 'shelter',
 'clothing',
 'money',
 'missing_people',
 'refugees',
 'death',
 'other_aid',
 'infrastructure_related',
 'transport',
 'buildings',
 'electricity',
 'tools',
 'hospitals',
 'shops',
 'aid_centers',
 'other_infrastructure',
 'weather_related',
 'floods',
 'storm',
 'fire',
 'earthquake',
 'cold',
 'other_weather',
 'direct_report']

In [11]:
df.iloc[:, -36:].columns.to_list()

cat_names = [x.replace('_', ' ').title() for x in df.iloc[:, -36:].columns.to_list()]

In [12]:
cat_names

['Related',
 'Request',
 'Offer',
 'Aid Related',
 'Medical Help',
 'Medical Products',
 'Search And Rescue',
 'Security',
 'Military',
 'Child Alone',
 'Water',
 'Food',
 'Shelter',
 'Clothing',
 'Money',
 'Missing People',
 'Refugees',
 'Death',
 'Other Aid',
 'Infrastructure Related',
 'Transport',
 'Buildings',
 'Electricity',
 'Tools',
 'Hospitals',
 'Shops',
 'Aid Centers',
 'Other Infrastructure',
 'Weather Related',
 'Floods',
 'Storm',
 'Fire',
 'Earthquake',
 'Cold',
 'Other Weather',
 'Direct Report']

In [13]:
cats_df = df.iloc[:, -36:]

In [14]:
cats_df

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26212,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26213,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
count_cats = cats_df.sum().sort_values(ascending=False)

count_cats

related                   20093
aid_related               10860
weather_related            7297
direct_report              5075
request                    4474
other_aid                  3446
food                       2923
earthquake                 2455
storm                      2443
shelter                    2314
floods                     2155
medical_help               2084
infrastructure_related     1705
water                      1672
other_weather              1376
buildings                  1333
medical_products           1313
transport                  1201
death                      1194
other_infrastructure       1151
refugees                    875
military                    860
search_and_rescue           724
money                       604
electricity                 532
cold                        530
security                    471
clothing                    405
aid_centers                 309
missing_people              298
hospitals                   283
fire    

In [16]:
count_cats_k = [x.replace('_', ' ').title() for x in count_cats.index.to_list()]

count_cats_k

['Related',
 'Aid Related',
 'Weather Related',
 'Direct Report',
 'Request',
 'Other Aid',
 'Food',
 'Earthquake',
 'Storm',
 'Shelter',
 'Floods',
 'Medical Help',
 'Infrastructure Related',
 'Water',
 'Other Weather',
 'Buildings',
 'Medical Products',
 'Transport',
 'Death',
 'Other Infrastructure',
 'Refugees',
 'Military',
 'Search And Rescue',
 'Money',
 'Electricity',
 'Cold',
 'Security',
 'Clothing',
 'Aid Centers',
 'Missing People',
 'Hospitals',
 'Fire',
 'Tools',
 'Shops',
 'Offer',
 'Child Alone']

In [17]:
count_cats_v = list(count_cats.values)

count_cats_v

[20093,
 10860,
 7297,
 5075,
 4474,
 3446,
 2923,
 2455,
 2443,
 2314,
 2155,
 2084,
 1705,
 1672,
 1376,
 1333,
 1313,
 1201,
 1194,
 1151,
 875,
 860,
 724,
 604,
 532,
 530,
 471,
 405,
 309,
 298,
 283,
 282,
 159,
 120,
 118,
 0]

In [18]:
cat_data = [{
    
    'type': 'bar',
    'x': count_cats_k,
    'y': count_cats_v
    
}]

cat_layout = {
    'title': '',
    'xaxis': {'title': 'Categories'},
    'yaxis': {'title': 'Numbers'},
    
}

fig = {'data': cat_data, 'layout': cat_layout}

In [19]:
offline.plot(fig, filename='categories.html')

'categories.html'

### Most Common Words by Categories

In [20]:
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

In [21]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
lemmatizer

<WordNetLemmatizer>

In [23]:
def tokenize(text):
    # replace url
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize and remove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return tokens

In [24]:
corpus = list(df['message'].values)

corpus

['Weather update - a cold front from Cuba that could pass over Haiti',
 'Is the Hurricane over or is it not over',
 'Looking for someone but no name',
 'UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.',
 'says: west side of Haiti, rest of the country today and tonight',
 'Information about the National Palace-',
 'Storm at sacred heart of jesus',
 'Please, we need tents and water. We are in Silo, Thank you!',
 'I would like to receive the messages, thank you',
 'I am in Croix-des-Bouquets. We have health issues. They ( workers ) are in Santo 15. ( an area in Croix-des-Bouquets )',
 "There's nothing to eat and water, we starving and thirsty.",
 'I am in Petionville. I need more information regarding 4636',
 'I am in Thomassin number 32, in the area named Pyron. I would like to have some water. Thank God we are fine, but we desperately need water. Thanks',
 "Let's do it together, need food in Delma 75, in didine area",
 'More informati

In [25]:
len(corpus)

26215

In [26]:
# initialize count vectorizer object
cv = CountVectorizer(tokenizer=tokenize)

In [27]:
# get counts of each token (word) in text data
cv_fit = cv.fit_transform(corpus)

In [28]:
# convert sparse matrix to numpy array to view
cv_fit.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [29]:
# get vocabularies and indices (the number is not count)

voc = cv.vocabulary_ 

voc

{'weather': 30238,
 'update': 29379,
 'cold': 6340,
 'front': 11387,
 'cuba': 7416,
 'could': 7131,
 'pas': 20753,
 'haiti': 12561,
 'hurricane': 13480,
 'looking': 16810,
 'someone': 26030,
 'name': 18944,
 'un': 28973,
 'report': 23504,
 'leogane': 16388,
 '80': 1209,
 '90': 1318,
 'destroyed': 8201,
 'hospital': 13291,
 'st': 26378,
 'croix': 7342,
 'functioning': 11476,
 'need': 19122,
 'supply': 27017,
 'desperately': 8168,
 'say': 24686,
 'west': 30321,
 'side': 25545,
 'rest': 23660,
 'country': 7157,
 'today': 28113,
 'tonight': 28190,
 'information': 14169,
 'national': 19014,
 'palace': 20553,
 'storm': 26590,
 'sacred': 24260,
 'heart': 12879,
 'jesus': 14931,
 'please': 21473,
 'tent': 27627,
 'water': 30163,
 'silo': 25595,
 'thank': 27744,
 'would': 30654,
 'like': 16543,
 'receive': 23004,
 'message': 17897,
 'de': 7725,
 'bouquet': 4608,
 'health': 12865,
 'issue': 14693,
 'worker': 30620,
 'santo': 24584,
 '15': 198,
 'area': 2867,
 'nothing': 19558,
 'eat': 9297,
 'st

In [30]:
word_list = cv.get_feature_names_out()

In [None]:
count_list = np.asarray(cv_fit.sum(axis=0))[0]

In [None]:
wc_dict = dict(zip(word_list,count_list))

wc_dict

In [None]:
# np.asarray(cv_fit.sum(axis=0))

In [None]:
#word count data frame

wc_df = pd.DataFrame.from_dict(wc_dict, orient='index', columns=['number'])

wc_df

In [None]:
wc_df.sort_values(by=['number'], ascending=[False]).head(60)

In [None]:
# todo 

# subset a givin category message subset, and compute most common words 

# declarative function

df.iloc[:, -36:].columns.to_list()

caf_food_df = df[df['food'] == 1]['message']

caf_food_df

In [None]:


caf_df = df[df['weather_related'] == 1]['message']

caf_df

corpus = list(caf_df.values)

# initialize count vectorizer object
cv = CountVectorizer(tokenizer=tokenize)

# get counts of each token (word) in text data
cv_fit = cv.fit_transform(corpus)

word_list = cv.get_feature_names_out()

count_list = np.asarray(cv_fit.sum(axis=0))[0]

wc_dict = dict(zip(word_list,count_list))

wc_df = pd.DataFrame.from_dict(wc_dict, orient='index', columns=['number'])

wc_df.sort_values(by=['number'], ascending=[False]).head(60)

In [None]:
wc_df.sort_values(by=['number'], ascending=[False]).head(20)['number'].to_dict()

In [None]:
voc_df[
    (voc_df['number'] > 0) & 
    (voc_df['number'] < 500)
].sort_values(by=['number'], ascending=True).head(20).to_dict()

In [46]:
def get_category_top_words(df, category='all', size=30):
    # df: original df
    # category: column name: e.g 'weather_related', 'food'
    # size: top how many? e.g 20 = top 20 (most common 20 words)
    #
    # return a top word count dictionary
    
    print(f'Analyzing Most Common Terms Found in "{category}" ...')
    
    # double check category name valid. if not in column name, 
    # make it to all
    names = df.iloc[:, -36:].columns.to_list()
    
    if category not in names:
        print(f'Category {category} Not Found. Using All Messages.')
        category = 'all'
        messages = df['message']
    else:
        # subset the category message
        messages = df[df[category] == 1]['message']

    
    # assemble a corpus
    corpus = list(messages.values)

    # initialize count vectorizer object
    cv = CountVectorizer(tokenizer=tokenize)
    
    
    # exception handle
    try:
        # fit the transformer to get each token (word)
        cv_fit = cv.fit_transform(corpus)
        
        # all words (feature names)
        word_list = cv.get_feature_names_out()
    
        # count of each word (feature names) in array shape
        count_list = np.asarray(cv_fit.sum(axis=0))[0]

        # (word count dictionary) Concat feature names and count value together. 
        wc_dict = dict(zip(word_list,count_list))

        # assemble to a DataFrame for computation. 
        wc_df = pd.DataFrame.from_dict(wc_dict, orient='index', columns=['number'])
        
        # convert to dictionary 
        res = wc_df.sort_values(by=['number'], ascending=[False]).head(size)['number'].to_dict()
    except ValueError as e:
        print(f"[ValueError]: \n{e}")
        res = {'NoRecord': 0}
        

    return res
    

In [39]:
get_category_top_words(df, 'storm')

Analyzing Most Common Terms Found in "storm" ...


{'rain': 754,
 'sandy': 596,
 'storm': 459,
 'people': 439,
 'hurricane': 410,
 'urlplaceholder': 371,
 'heavy': 315,
 'water': 304,
 'area': 286,
 'flood': 273,
 'said': 261,
 'food': 226,
 'cyclone': 222,
 'wind': 201,
 'day': 196,
 'region': 187,
 'province': 185,
 'hit': 183,
 '000': 181,
 'power': 173,
 'rainfall': 173,
 'year': 163,
 'affected': 153,
 'also': 147,
 'tsunami': 146,
 'country': 145,
 'flooding': 139,
 'two': 136,
 'caused': 135,
 'need': 135}

### Fixing empty vocabulary error handling

In [47]:
get_category_top_words(df, 'child_alone')

Analyzing Most Common Terms Found in "child_alone" ...
[ValueError]: 
empty vocabulary; perhaps the documents only contain stop words


{'NoRecord': 0}

#### select field options code

In [None]:
list(df.iloc[:, -36:].columns)

In [None]:
[x.replace("_", ' ').title() for x in cat_names]

In [32]:
list(zip(list(df.iloc[:, -36:].columns), [x.replace("_", ' ').title() for x in cat_names]))

[('related', 'Related'),
 ('request', 'Request'),
 ('offer', 'Offer'),
 ('aid_related', 'Aid Related'),
 ('medical_help', 'Medical Help'),
 ('medical_products', 'Medical Products'),
 ('search_and_rescue', 'Search And Rescue'),
 ('security', 'Security'),
 ('military', 'Military'),
 ('child_alone', 'Child Alone'),
 ('water', 'Water'),
 ('food', 'Food'),
 ('shelter', 'Shelter'),
 ('clothing', 'Clothing'),
 ('money', 'Money'),
 ('missing_people', 'Missing People'),
 ('refugees', 'Refugees'),
 ('death', 'Death'),
 ('other_aid', 'Other Aid'),
 ('infrastructure_related', 'Infrastructure Related'),
 ('transport', 'Transport'),
 ('buildings', 'Buildings'),
 ('electricity', 'Electricity'),
 ('tools', 'Tools'),
 ('hospitals', 'Hospitals'),
 ('shops', 'Shops'),
 ('aid_centers', 'Aid Centers'),
 ('other_infrastructure', 'Other Infrastructure'),
 ('weather_related', 'Weather Related'),
 ('floods', 'Floods'),
 ('storm', 'Storm'),
 ('fire', 'Fire'),
 ('earthquake', 'Earthquake'),
 ('cold', 'Cold'),
 (

### Index Page Data


In [None]:
# average length of character 144 letters

In [85]:
round(df['message'].apply(lambda x: len(x)).describe()['mean'], 2)

144.76

In [59]:
df['message'].apply(lambda x: len(x)).median()

124.0

In [61]:
# number of records
df.shape

(26215, 40)

In [65]:
# top five category
df.iloc[:, -36:].sum().sort_values(ascending=False).head(10)

related            20093
aid_related        10860
weather_related     7297
direct_report       5075
request             4474
other_aid           3446
food                2923
earthquake          2455
storm               2443
shelter             2314
dtype: int64

In [71]:
df['message'].isnull().sum()

0

In [82]:
# random messages
list(df['message'].sample(n=100).values)

['WHAT PROGRAM YOU HAVE FOR TODAY (DAVE) ',
 'Is there cyclone or earthquake again? ',
 'When asked why she did not go to the hospital, she points out that in this village there is no transportation.',
 "How come Digicel have yet sent me my calling card that's what I am waiting for. ",
 'The Brahmaputra had crossed the danger mark in 11 main channels, including the state capital Guwahati, inundating low-lying areas.',
 "We are in l'Acul. We haven't gotten anything so far. ",
 'Water still inundates thousands of houses in Bekasi and one of the universities in the city.The Vilanculos primary school was one building destroyed by the violent winds.',
 "good afternoon, can we get into our house if it's not cracked? ",
 'Many other buildings have sustained significant structural damage, including the UN office at the Battagram View Hotel.',
 'will the earthquake comes stronger than the first time? ',
 'MoES reports that there is a need for tents, food, medication, and necessary household ite

### top 10 cats grid

In [146]:
names = list(df.iloc[:, -36:].sum().sort_values(ascending=False).head(10).index)

names

top_cats_stats = []

for n in names:
    title = n.replace('_', ' ').title()
    percent = f"{round(df[df[n] == 1].shape[0] / df.shape[0] * 100, 2)}%"
    mean_length = round(df[df[n] == 1]['message'].apply(lambda x: len(x)).describe()['mean'], 2)

    top_cats_stats.append((n, title, percent, mean_length))
    

top_cats_stats

[('related', 'Related', '76.65%', 154.52),
 ('aid_related', 'Aid Related', '41.43%', 173.26),
 ('weather_related', 'Weather Related', '27.84%', 184.42),
 ('direct_report', 'Direct Report', '19.36%', 126.71),
 ('request', 'Request', '17.07%', 128.41),
 ('other_aid', 'Other Aid', '13.15%', 175.38),
 ('food', 'Food', '11.15%', 193.79),
 ('earthquake', 'Earthquake', '9.36%', 178.91),
 ('storm', 'Storm', '9.32%', 226.95),
 ('shelter', 'Shelter', '8.83%', 220.39)]