In [91]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import re

import plotly
from plotly.graph_objs import Bar
from plotly import offline

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [36]:
engine = create_engine('sqlite:///static/db/DisasterRes.db')
df = pd.read_sql_table('DisasterResponse', engine)
print(f"df.shape: {df.shape}")

df.shape: (26215, 40)


#### Genre Bar Plot

In [37]:
df['genre'].value_counts()

news      13054
direct    10766
social     2395
Name: genre, dtype: int64

In [38]:
x = df['genre'].value_counts().index.tolist()
x = [x.title() for x in x]

In [39]:
y = df['genre'].value_counts().values.tolist()

In [40]:
data = [{
    
    'type': 'bar',
    'x': x,
    'y': y
    
}]

In [41]:
my_layout = {
    'title': '',
    'xaxis': {'title': 'Genre'},
    'yaxis': {'title': 'Numbers'},
    
}

fig = {'data': data, 'layout': my_layout}

In [42]:
offline.plot(fig, filename='genre.html')

'genre.html'

#### Distribution of categories

In [226]:
df.iloc[:, -36:].columns.to_list()

['related',
 'request',
 'offer',
 'aid_related',
 'medical_help',
 'medical_products',
 'search_and_rescue',
 'security',
 'military',
 'child_alone',
 'water',
 'food',
 'shelter',
 'clothing',
 'money',
 'missing_people',
 'refugees',
 'death',
 'other_aid',
 'infrastructure_related',
 'transport',
 'buildings',
 'electricity',
 'tools',
 'hospitals',
 'shops',
 'aid_centers',
 'other_infrastructure',
 'weather_related',
 'floods',
 'storm',
 'fire',
 'earthquake',
 'cold',
 'other_weather',
 'direct_report']

In [50]:
df.iloc[:, -36:].columns.to_list()

cat_names = [x.replace('_', ' ').title() for x in df.iloc[:, -36:].columns.to_list()]

In [51]:
cat_names

['Related',
 'Request',
 'Offer',
 'Aid Related',
 'Medical Help',
 'Medical Products',
 'Search And Rescue',
 'Security',
 'Military',
 'Child Alone',
 'Water',
 'Food',
 'Shelter',
 'Clothing',
 'Money',
 'Missing People',
 'Refugees',
 'Death',
 'Other Aid',
 'Infrastructure Related',
 'Transport',
 'Buildings',
 'Electricity',
 'Tools',
 'Hospitals',
 'Shops',
 'Aid Centers',
 'Other Infrastructure',
 'Weather Related',
 'Floods',
 'Storm',
 'Fire',
 'Earthquake',
 'Cold',
 'Other Weather',
 'Direct Report']

In [57]:
cats_df = df.iloc[:, -36:]

In [58]:
cats_df

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26212,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26213,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
count_cats = cats_df.sum().sort_values(ascending=False)

count_cats

related                   20093
aid_related               10860
weather_related            7297
direct_report              5075
request                    4474
other_aid                  3446
food                       2923
earthquake                 2455
storm                      2443
shelter                    2314
floods                     2155
medical_help               2084
infrastructure_related     1705
water                      1672
other_weather              1376
buildings                  1333
medical_products           1313
transport                  1201
death                      1194
other_infrastructure       1151
refugees                    875
military                    860
search_and_rescue           724
money                       604
electricity                 532
cold                        530
security                    471
clothing                    405
aid_centers                 309
missing_people              298
hospitals                   283
fire    

In [62]:
count_cats_k = [x.replace('_', ' ').title() for x in count_cats.index.to_list()]

count_cats_k

['Related',
 'Aid Related',
 'Weather Related',
 'Direct Report',
 'Request',
 'Other Aid',
 'Food',
 'Earthquake',
 'Storm',
 'Shelter',
 'Floods',
 'Medical Help',
 'Infrastructure Related',
 'Water',
 'Other Weather',
 'Buildings',
 'Medical Products',
 'Transport',
 'Death',
 'Other Infrastructure',
 'Refugees',
 'Military',
 'Search And Rescue',
 'Money',
 'Electricity',
 'Cold',
 'Security',
 'Clothing',
 'Aid Centers',
 'Missing People',
 'Hospitals',
 'Fire',
 'Tools',
 'Shops',
 'Offer',
 'Child Alone']

In [69]:
count_cats_v = list(count_cats.values)

count_cats_v

[20093,
 10860,
 7297,
 5075,
 4474,
 3446,
 2923,
 2455,
 2443,
 2314,
 2155,
 2084,
 1705,
 1672,
 1376,
 1333,
 1313,
 1201,
 1194,
 1151,
 875,
 860,
 724,
 604,
 532,
 530,
 471,
 405,
 309,
 298,
 283,
 282,
 159,
 120,
 118,
 0]

In [74]:
cat_data = [{
    
    'type': 'bar',
    'x': count_cats_k,
    'y': count_cats_v
    
}]

cat_layout = {
    'title': '',
    'xaxis': {'title': 'Categories'},
    'yaxis': {'title': 'Numbers'},
    
}

fig = {'data': cat_data, 'layout': cat_layout}

In [75]:
offline.plot(fig, filename='categories.html')

'categories.html'

### Most Common Words by Categories

In [77]:
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

In [78]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [79]:
lemmatizer

<WordNetLemmatizer>

In [92]:
def tokenize(text):
    # replace url
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize and remove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return tokens

In [87]:
corpus = list(df['message'].values)

corpus

['Weather update - a cold front from Cuba that could pass over Haiti',
 'Is the Hurricane over or is it not over',
 'Looking for someone but no name',
 'UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.',
 'says: west side of Haiti, rest of the country today and tonight',
 'Information about the National Palace-',
 'Storm at sacred heart of jesus',
 'Please, we need tents and water. We are in Silo, Thank you!',
 'I would like to receive the messages, thank you',
 'I am in Croix-des-Bouquets. We have health issues. They ( workers ) are in Santo 15. ( an area in Croix-des-Bouquets )',
 "There's nothing to eat and water, we starving and thirsty.",
 'I am in Petionville. I need more information regarding 4636',
 'I am in Thomassin number 32, in the area named Pyron. I would like to have some water. Thank God we are fine, but we desperately need water. Thanks',
 "Let's do it together, need food in Delma 75, in didine area",
 'More informati

In [88]:
len(corpus)

26215

In [159]:
# initialize count vectorizer object
cv = CountVectorizer(tokenizer=tokenize)

In [160]:
# get counts of each token (word) in text data
cv_fit = cv.fit_transform(corpus)

In [161]:
# convert sparse matrix to numpy array to view
cv_fit.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [164]:
# get vocabularies and indices (the number is not count)

voc = cv.vocabulary_ 

voc

{'weather': 30960,
 'update': 30072,
 'cold': 6586,
 'front': 11745,
 'cuba': 7679,
 'could': 7380,
 'pas': 21312,
 'haiti': 12939,
 'hurricane': 13875,
 'looking': 17285,
 'someone': 26665,
 'name': 19453,
 'un': 29662,
 'report': 24105,
 'leogane': 16859,
 '80': 1317,
 '90': 1435,
 'destroyed': 8485,
 'hospital': 13682,
 'st': 27019,
 'croix': 7598,
 'functioning': 11835,
 'need': 19634,
 'supply': 27664,
 'desperately': 8452,
 'say': 25301,
 'west': 31044,
 'side': 26168,
 'rest': 24262,
 'country': 7406,
 'today': 28775,
 'tonight': 28853,
 'information': 14581,
 'national': 19523,
 'palace': 21111,
 'storm': 27234,
 'sacred': 24873,
 'heart': 13264,
 'jesus': 15358,
 'please': 22038,
 'tent': 28280,
 'water': 30879,
 'silo': 26218,
 'thank': 28398,
 'would': 31393,
 'like': 17016,
 'receive': 23605,
 'message': 18385,
 'de': 8009,
 'bouquet': 4819,
 'health': 13250,
 'issue': 15108,
 'worker': 31358,
 'santo': 25199,
 '15': 205,
 'area': 3036,
 'nothing': 20080,
 'eat': 9614,
 'st

In [184]:
word_list = cv.get_feature_names_out()

In [157]:
voc_df[
    (voc_df['number'] > 0) & 
    (voc_df['number'] < 500)
].sort_values(by=['number'], ascending=True).tail(60)

Unnamed: 0,number
200gd,440
200km,441
200m3,442
200yrs,443
201,444
2010,445
2011,446
2012,447
2013,448
2014,449


In [185]:
count_list = np.asarray(cv_fit.sum(axis=0))[0]

In [188]:
wc_dict = dict(zip(word_list,count_list))

wc_dict

{'0': 162,
 '00': 39,
 '000': 1254,
 '0000': 4,
 '000ha': 1,
 '000l': 3,
 '000lt': 1,
 '000ltrs': 1,
 '000m2': 1,
 '000rmb': 1,
 '004': 3,
 '00h58': 1,
 '01': 20,
 '010': 1,
 '017': 1,
 '018': 1,
 '019': 2,
 '01st': 1,
 '02': 14,
 '020': 1,
 '024': 1,
 '027': 2,
 '027kiisfm': 1,
 '03': 36,
 '030': 1,
 '0300': 1,
 '0310': 1,
 '034': 1,
 '035': 1,
 '0350': 2,
 '037': 1,
 '039': 1,
 '04': 17,
 '040': 2,
 '0400': 1,
 '043': 2,
 '048': 1,
 '04m': 1,
 '05': 22,
 '050': 3,
 '0500': 1,
 '0511sitrep': 1,
 '052': 3,
 '053': 1,
 '056': 1,
 '05am': 1,
 '06': 21,
 '062': 1,
 '063': 3,
 '065': 1,
 '066': 1,
 '06f98jo5jersey': 1,
 '07': 15,
 '071': 3,
 '0730': 1,
 '077': 1,
 '07th': 1,
 '08': 19,
 '0800': 1,
 '081fmhg8': 1,
 '082': 1,
 '087': 1,
 '09': 19,
 '090': 1,
 '093': 1,
 '0h': 1,
 '0h4z': 1,
 '0m': 1,
 '0nwa0rb5': 1,
 '0tranbleman': 1,
 '0veupw3p': 1,
 '0wbiqkj3': 1,
 '1': 675,
 '10': 390,
 '100': 248,
 '1000': 28,
 '1000gdes': 1,
 '1003': 1,
 '100g': 5,
 '100gd': 1,
 '100gourdes': 1,
 '100ht

In [169]:
# np.asarray(cv_fit.sum(axis=0))

array([[ 162,   39, 1254, ...,    3,    1,    1]])

In [190]:
#word count data frame

wc_df = pd.DataFrame.from_dict(wc_dict, orient='index', columns=['number'])

wc_df

Unnamed: 0,number
0,162
00,39
000,1254
0000,4
000ha,1
...,...
zuoji,1
zurich,1
zwaar,3
zwkmjkkz,1


In [193]:
wc_df.sort_values(by=['number'], ascending=[False]).head(60)

Unnamed: 0,number
water,3040
people,3014
food,2904
help,2653
need,2496
please,2049
earthquake,1921
u,1757
area,1667
like,1532


In [203]:
# todo 

# subset a givin category message subset, and compute most common words 

# declarative function

df.iloc[:, -36:].columns.to_list()

caf_food_df = df[df['food'] == 1]['message']

caf_food_df

10       There's nothing to eat and water, we starving ...
13       Let's do it together, need food in Delma 75, i...
15       A Comitee in Delmas 19, Rue ( street ) Janvier...
16       We need food and water in Klecin 12. We are dy...
23       Those people who live at Sibert need food they...
                               ...                        
26052    Some 5 or 6 mt of sorghum seed were taken when...
26080    The gross relief food requirements for June-De...
26106    It is estimated that as many as 10 million peo...
26128    World Vision has swung into action in Mumbai, ...
26151    The death toll hit 650 at the close of 2014, b...
Name: message, Length: 2923, dtype: object

In [209]:


caf_df = df[df['weather_related'] == 1]['message']

caf_df

corpus = list(caf_df.values)

# initialize count vectorizer object
cv = CountVectorizer(tokenizer=tokenize)

# get counts of each token (word) in text data
cv_fit = cv.fit_transform(corpus)

word_list = cv.get_feature_names_out()

count_list = np.asarray(cv_fit.sum(axis=0))[0]

wc_dict = dict(zip(word_list,count_list))

wc_df = pd.DataFrame.from_dict(wc_dict, orient='index', columns=['number'])

wc_df.sort_values(by=['number'], ascending=[False]).head(60)

Unnamed: 0,number
earthquake,1754
people,1189
water,932
flood,920
http,872
rain,855
area,746
haiti,682
sandy,650
said,590


In [217]:
wc_df.sort_values(by=['number'], ascending=[False]).head(20)['number'].to_dict()

{'earthquake': 1754,
 'people': 1189,
 'water': 932,
 'flood': 920,
 'http': 872,
 'rain': 855,
 'area': 746,
 'haiti': 682,
 'sandy': 650,
 'said': 590,
 'food': 586,
 'help': 560,
 '000': 541,
 'storm': 487,
 'affected': 455,
 'hit': 449,
 'need': 447,
 'co': 444,
 'hurricane': 431,
 'country': 410}

In [213]:
voc_df[
    (voc_df['number'] > 0) & 
    (voc_df['number'] < 500)
].sort_values(by=['number'], ascending=True).head(20).to_dict()

{'number': {'00': 1,
  '000': 2,
  '0000': 3,
  '000ha': 4,
  '000l': 5,
  '000lt': 6,
  '000ltrs': 7,
  '000m2': 8,
  '000rmb': 9,
  '004': 10,
  '00h58': 11,
  '01': 12,
  '010': 13,
  '017': 14,
  '018': 15,
  '019': 16,
  '01st': 17,
  '02': 18,
  '020': 19,
  '024': 20}}

In [235]:
def get_category_top_words(df, category='all', size=30):
    # df: original df
    # category: column name: e.g 'weather_related', 'food'
    # size: top how many? e.g 20 = top 20 (most common 20 words)
    #
    # return a top word count dictionary
    
    print(f'Analyzing Most Common Terms Found in "{category}" ...')
    
    # double check category name valid. if not in column name, 
    # make it to all
    names = df.iloc[:, -36:].columns.to_list()
    
    if category not in names:
        print(f'Category {category} Not Found. Using All Messages.')
        category = 'all'
        messages = df['message']
    else:
        # subset the category message
        messages = df[df[category] == 1]['message']

    
    # assemble a corpus
    corpus = list(messages.values)

    # initialize count vectorizer object
    cv = CountVectorizer(tokenizer=tokenize)

    # fit the transformer to get each token (word)
    cv_fit = cv.fit_transform(corpus)
    
    # all words (feature names)
    word_list = cv.get_feature_names_out()
    
    # count of each word (feature names) in array shape
    count_list = np.asarray(cv_fit.sum(axis=0))[0]

    # (word count dictionary) Concat feature names and count value together. 
    wc_dict = dict(zip(word_list,count_list))
    
    # assemble to a DataFrame for computation. 
    wc_df = pd.DataFrame.from_dict(wc_dict, orient='index', columns=['number'])

    return wc_df.sort_values(by=['number'], ascending=[False]).head(size)['number'].to_dict()
    

In [237]:
get_category_top_words(df, 'storm')

Analyzing Most Common Terms Found in "storm" ...


{'rain': 754,
 'sandy': 596,
 'storm': 459,
 'people': 439,
 'http': 414,
 'hurricane': 410,
 'co': 404,
 'heavy': 315,
 'water': 304,
 'area': 286,
 'flood': 273,
 'said': 261,
 'food': 226,
 'cyclone': 223,
 'wind': 201,
 'day': 196,
 'region': 187,
 'province': 185,
 'hit': 183,
 '000': 181,
 'power': 173,
 'rainfall': 173,
 'year': 163,
 'affected': 153,
 'also': 147,
 'tsunami': 146,
 'country': 145,
 'flooding': 139,
 'two': 136,
 'caused': 135}