In [25]:
import sys
import pandas as pd
from sqlalchemy import create_engine

In [26]:
def load_data(messages_filepath, categories_filepath):
    '''
       INPUT:
           messages_filepath (str): path to the messages csv file
           categories_filepath (str): path to the categories csv file
       OUTPUT:
           df: dataframe with the message and category details joined
       DESCRIPTION:
               both csv files are read with latin-1 encoding and combined
               with an inner join on the shared 'id' column, so only ids
               present in both files are kept
    '''

    # read both csv files with the same decoding settings
    csv_options = {'encoding': 'latin-1'}
    messages = pd.read_csv(messages_filepath, **csv_options)
    categories = pd.read_csv(categories_filepath, **csv_options)

    # inner join keeps only the ids found in both frames
    return messages.merge(categories, how='inner', on='id')

In [27]:
# load the two raw csv files and merge them into a single frame
df = load_data(
    messages_filepath='./data/disaster_messages.csv',
    categories_filepath='./data/disaster_categories.csv',
)

In [28]:
# preview the first rows only instead of rendering the whole merged frame
df.head(10)

Unnamed: 0,id,message,original,genre,categories
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,related-1;request-0;offer-0;aid_related-0;medi...
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,related-1;request-0;offer-0;aid_related-1;medi...
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,related-1;request-0;offer-0;aid_related-0;medi...
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,related-1;request-0;offer-0;aid_related-0;medi...
5,14,Information about the National Palace-,Informtion au nivaux palais nationl,direct,related-0;request-0;offer-0;aid_related-0;medi...
6,15,Storm at sacred heart of jesus,Cyclone Coeur sacr de jesus,direct,related-1;request-0;offer-0;aid_related-0;medi...
7,16,"Please, we need tents and water. We are in Sil...",Tanpri nou bezwen tant avek dlo nou zon silo m...,direct,related-1;request-1;offer-0;aid_related-1;medi...
8,17,"I would like to receive the messages, thank you",Mwen ta renmen jouin messag yo. Merci,direct,related-0;request-0;offer-0;aid_related-0;medi...
9,18,I am in Croix-des-Bouquets. We have health iss...,"Nou kwadebouke, nou gen pwoblem sant m yo nan ...",direct,related-1;request-1;offer-0;aid_related-1;medi...


In [None]:
# expand each ';'-separated `categories` string into one column per token
categories = df['categories'].str.split(';', expand=True)

In [None]:
# preview the first rows only instead of rendering the whole frame
categories.head()

In [None]:
# take the first row of `categories`; its tokens carry the category names
row = categories.iloc[0]

In [None]:
# build the new column names from the first row: each token looks like
# 'related-1', so dropping its last two characters leaves the category name
category_colnames = row.map(lambda token: token[:-2])

In [None]:
# rename the columns of `categories` in place, using the names held in
# `category_colnames`
categories.columns = category_colnames

In [None]:
for column in categories.columns:
    # each cell holds a token such as 'related-1'; the flag is its last character
    flag_chars = categories[column].map(lambda token: token[-1])

    # store the flag as a 32-bit integer instead of a string
    categories[column] = flag_chars.astype('int32')

In [None]:
# preview the converted flags without rendering the whole frame
categories.head()

In [None]:
 # drop the original categories column from `df`
# replace the raw `categories` column with the expanded `categories` frame
# (concatenated column-wise), then remove rows that are exact duplicates
df = (
    pd.concat([df.drop(columns='categories'), categories], axis=1)
    .drop_duplicates()
)

In [None]:
# preview the cleaned frame without rendering every row
df.head()

In [3]:
def clean_data(df):
    '''
       INPUT:
           df: dataframe as returned by 'load_data'; it must have a 'categories'
               column whose values are ';'-separated '<name>-<value>' tokens
       OUTPUT:
           df: new dataframe in which 'categories' is replaced by one numeric
               column per category and fully duplicated rows are removed
       DESCRIPTION:
           The category names are taken from the first row. Names and values are
           separated at the last '-' of each token, so values with more than one
           digit are parsed in full. A row with fewer tokens than the first row
           gets NaN in the missing columns. The input dataframe is not modified.
    '''

    # one column per ';'-separated token, e.g. 'related-1'
    categories = df['categories'].str.split(';', expand=True)

    # the first row supplies the column names: the text before the last '-'
    first_row = categories.iloc[0, :]
    categories.columns = first_row.str.rsplit('-', n=1).str[0]

    # keep only the text after the last '-' and convert it to a number
    for column in categories:
        values = categories[column].str.rsplit('-', n=1).str[-1]
        categories[column] = pd.to_numeric(values)

    # swap the raw 'categories' column for the expanded numeric columns
    df = df.drop('categories', axis=1)
    df = pd.concat([df, categories], axis=1)

    # drop rows that are exact duplicates across all columns
    return df.drop_duplicates()

In [22]:
def save_data(df, database_filename):
    '''
    INPUT:
        df: cleansed dataframe having messages and their belonging categories details
        database_filename (str): path of the SQLite database file to write to
    OUTPUT:
        None; the dataframe is stored in the 'DisasterMessages' table of the database
    DESCRIPTION:
        Insert dataframe into sql table<DisasterMessages> in database file to be used as input.
        An existing table with that name is replaced and the dataframe index is not written.
    '''
    table = 'DisasterMessages'

    engine = create_engine('sqlite:///{}'.format(database_filename))
    try:
        # rows are written in batches of `chunksize`
        df.to_sql(name=table, con=engine, if_exists='replace', chunksize=10, index=False)
    finally:
        # release the engine's pooled connections so the database file is not
        # left open after the write (also when to_sql raises)
        engine.dispose()
    


In [23]:
def main(messages_filepath='./data/disaster_messages.csv',
         categories_filepath='./data/disaster_categories.csv',
         database_filepath='DisasterResponse.db'):
    '''
    INPUT:
        messages_filepath (str): messages csv file path
        categories_filepath (str): categories csv file path
        database_filepath (str): database file the cleaned data is saved to
    OUTPUT:
        None
    DESCRIPTION:
        Run the pipeline: load and merge both csv files, clean the merged
        dataframe and save it to the database. Calling main() without
        arguments uses the default paths given in the signature.
    '''
    df = load_data(messages_filepath, categories_filepath)

    print('Cleaning data...')
    df = clean_data(df)

    print('Saving data...')
    save_data(df, database_filepath)

    print('Cleaned data saved to database!')
    


In [24]:
# run the load -> clean -> save pipeline defined in main()
main()

Cleaning data...
Saving data...
Cleaned data saved to database!


In [None]:
# Command-line usage (a shell command, not Python; as bare cell content it raises a SyntaxError):
#   python process_data.py disaster_messages.csv disaster_categories.csv DisasterResponse.db
# NOTE(review): presumably process_data.py is the script version of this notebook - confirm it exists.