In [145]:
import pandas as pd
from sqlalchemy import create_engine
import plotly.graph_objects as go
import pycountry as pc
import nltk
nltk.download('crubadan')

import langdetect
from tqdm import tqdm
tqdm.pandas()

[nltk_data] Downloading package crubadan to
[nltk_data]     C:\Users\guischmitd\AppData\Roaming\nltk_data...
[nltk_data]   Package crubadan is already up-to-date!


In [3]:
engine = create_engine('sqlite:///../haystack/data/DisasterResponse.db')
df = pd.read_sql_table('categorized_messages', engine)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
X, y = df['message'], df[[c for c in df.columns if c not in ['original', 'genre', 'message', 'id']]]

In [41]:
y.sum().sort_values(ascending=False).index.str.replace('_', ' ').str.capitalize()

Index(['Related', 'Aid related', 'Weather related', 'Direct report', 'Request',
       'Other aid', 'Food', 'Earthquake', 'Storm', 'Shelter', 'Floods',
       'Medical help', 'Infrastructure related', 'Water', 'Other weather',
       'Buildings', 'Medical products', 'Transport', 'Death',
       'Other infrastructure', 'Refugees', 'Military', 'Search and rescue',
       'Money', 'Electricity', 'Cold', 'Security', 'Clothing', 'Aid centers',
       'Missing people', 'Hospitals', 'Fire', 'Tools', 'Shops', 'Offer',
       'Child alone'],
      dtype='object')

In [42]:
values = y.sum().sort_values(ascending=False)
labels = values.index.str.replace('_', ' ').str.capitalize()

go.Figure([
    go.Bar(x=labels, y=values)
], layout={'title': 'Samples per category'})

In [61]:
cats_per_sample = y.sum(axis=1)
cats_per_sample[cats_per_sample <= 10]

go.Figure([
    go.Histogram(x=cats_per_sample[cats_per_sample <= 10])
], layout={
    'title': 'Multilabeled messages', 
    'yaxis': {
        'title': 'Messages'
        },
    'xaxis': {
        'title': 'Number of categories'
        }
    })

In [155]:
def get_language(text):
    try:
        guess = langdetect.detect(text)
        lang = pc.languages.get(alpha_2=guess)
        if lang is not None:
            lang = lang.name

    except Exception as e:
        print(f'TEXT: {text}\n    Raised Exception: {e}')
        lang = None

    return lang

langs = []
for text in tqdm(df.original.values):
    langs.append(get_language(text))

 Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected string or bytes-like object
TEXT: None
    Raised Exception: expected

In [80]:
message_lens = df.groupby('related').apply(lambda x: x['message'].str.len())

go.Figure([
    go.Histogram(x=message_lens[1], opacity=0.75, name='Related'),
    go.Histogram(x=message_lens[0], opacity=0.75, name='Unrelated')
], layout={
    'title': 'Related/Unrelated message length (character count)', 
    'barmode': 'overlay',
    'xaxis': {
        'range': [30, 800]
    }
    })