# ETL Pipeline Preparation
Follow the instructions below to help you create your ETL pipeline.
### 1. Import libraries and load datasets.
- Import Python libraries
- Load `messages.csv` into a dataframe and inspect the first few lines.
- Load `categories.csv` into a dataframe and inspect the first few lines.

In [49]:
# import libraries
import numpy as np
import pandas as pd

In [50]:
# load messages dataset
messages = pd.read_csv('../data/disaster_messages.csv')
messages.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [51]:
messages.shape

(26248, 4)

In [52]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26248 entries, 0 to 26247
Data columns (total 4 columns):
id          26248 non-null int64
message     26248 non-null object
original    10184 non-null object
genre       26248 non-null object
dtypes: int64(1), object(3)
memory usage: 820.3+ KB


In [53]:
# number of distinct values in each column of messages
# (a missing value counts as one distinct value)
tuple(messages[col].nunique(dropna=False) for col in ('id', 'message', 'original', 'genre'))

(26180, 26177, 9631, 3)

In [54]:
# unique feature values
messages.genre.unique()

array(['direct', 'social', 'news'], dtype=object)

In [55]:
# number of duplicated ids in messages
len(messages[messages.duplicated('id', keep=False)])

135

In [56]:
# show every row of messages whose id occurs more than once
is_repeated_id = messages.duplicated(subset='id', keep=False)
messages[is_repeated_id]

Unnamed: 0,id,message,original,genre
162,202,?? port au prince ?? and food. they need gover...,p bay pap la syen ak manje. Yo bezwen ed gouve...,direct
163,202,?? port au prince ?? and food. they need gover...,p bay pap la syen ak manje. Yo bezwen ed gouve...,direct
655,804,elle est vraiment malade et a besoin d'aide. u...,she is really sick she need your help. please ...,direct
656,804,elle est vraiment malade et a besoin d'aide. u...,she is really sick she need your help. please ...,direct
709,862,What is the address of the radio station? I as...,Ki adres radyo a? Paske m bezwen al depoze dos...,direct
710,862,What is the address of the radio station? I as...,Ki adres radyo a? Paske m bezwen al depoze dos...,direct
1407,1652,"please we need water, food and tents, we have ...","p jwen dlo, manje, tant pou nou demi nou gen 1...",direct
1408,1652,"please we need water, food and tents, we have ...","p jwen dlo, manje, tant pou nou demi nou gen 1...",direct
2130,2446,How much money did TV Latino American collect ...,Konbyen kob tv latino america ranmase miami,direct
2131,2446,How much money did TV Latino American collect ...,Konbyen kob tv latino america ranmase miami,direct


In [57]:
messages.drop_duplicates().shape

(26180, 4)

In [58]:
# load categories dataset
categories = pd.read_csv('../data/disaster_categories.csv')
categories.head(6)

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medi...
1,7,related-1;request-0;offer-0;aid_related-1;medi...
2,8,related-1;request-0;offer-0;aid_related-0;medi...
3,9,related-1;request-1;offer-0;aid_related-1;medi...
4,12,related-1;request-0;offer-0;aid_related-0;medi...
5,14,related-0;request-0;offer-0;aid_related-0;medi...


In [59]:
categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26248 entries, 0 to 26247
Data columns (total 2 columns):
id            26248 non-null int64
categories    26248 non-null object
dtypes: int64(1), object(1)
memory usage: 410.2+ KB


In [60]:
# unique number of features ['id', 'categories']
len(categories.id.unique()),len(categories.categories.unique())

(26180, 4003)

In [61]:
# number of rows in categories whose id occurs more than once
# (the same count as in messages)
int(categories.duplicated(subset='id', keep=False).sum())

135

In [62]:
# rows of categories that share their id with at least one other row
# notice that id==862 appears with two different label strings
repeated_id_mask = categories.duplicated(subset='id', keep=False)
duplicated_labels = categories[repeated_id_mask]
duplicated_labels

Unnamed: 0,id,categories
162,202,related-1;request-1;offer-0;aid_related-1;medi...
163,202,related-1;request-1;offer-0;aid_related-1;medi...
655,804,related-2;request-0;offer-0;aid_related-0;medi...
656,804,related-2;request-0;offer-0;aid_related-0;medi...
709,862,related-0;request-0;offer-0;aid_related-0;medi...
710,862,related-1;request-0;offer-0;aid_related-0;medi...
1407,1652,related-1;request-1;offer-0;aid_related-1;medi...
1408,1652,related-1;request-1;offer-0;aid_related-1;medi...
2130,2446,related-1;request-0;offer-0;aid_related-0;medi...
2131,2446,related-1;request-0;offer-0;aid_related-0;medi...


In [63]:
# show messages for id==862
messages[messages.id==862].message

709    What is the address of the radio station? I as...
710    What is the address of the radio station? I as...
Name: message, dtype: object

In [64]:
# the two rows with id == 862 are identical in every messages column,
# even though categories holds two different label strings for that id
rows_862 = messages[messages['id'] == 862]
rows_862.iloc[0] == rows_862.iloc[1]

id          True
message     True
original    True
genre       True
dtype: bool

In [65]:
# only id 24779 occurs more than twice
# map every row to the number of rows that share its id, then keep the rows above 2
rows_per_id = categories['id'].map(categories['id'].value_counts())
categories[rows_per_id > 2]

Unnamed: 0,id,categories
21358,24779,related-1;request-0;offer-0;aid_related-1;medi...
21359,24779,related-1;request-0;offer-0;aid_related-1;medi...
21360,24779,related-1;request-0;offer-0;aid_related-1;medi...


In [66]:
# for id==24779 all of its rows carry the same categories string
rows_per_id = categories['id'].map(categories['id'].value_counts())
categories.loc[rows_per_id > 2, 'categories'].value_counts().unique()

array([3])

In [67]:
# the remaining 132 id-duplicated rows come in pairs (each id occurs exactly twice)
# some pairs carry the same categories string, others do not
rows_per_id = categories['id'].map(categories['id'].value_counts())
categories[rows_per_id == 2]

Unnamed: 0,id,categories
162,202,related-1;request-1;offer-0;aid_related-1;medi...
163,202,related-1;request-1;offer-0;aid_related-1;medi...
655,804,related-2;request-0;offer-0;aid_related-0;medi...
656,804,related-2;request-0;offer-0;aid_related-0;medi...
709,862,related-0;request-0;offer-0;aid_related-0;medi...
710,862,related-1;request-0;offer-0;aid_related-0;medi...
1407,1652,related-1;request-1;offer-0;aid_related-1;medi...
1408,1652,related-1;request-1;offer-0;aid_related-1;medi...
2130,2446,related-1;request-0;offer-0;aid_related-0;medi...
2131,2446,related-1;request-0;offer-0;aid_related-0;medi...


In [68]:
# total rows, fully distinct rows and distinct ids in categories
# notice the difference between the number of distinct rows and distinct ids == 36
n_rows = categories.shape[0]
n_distinct_rows = categories.drop_duplicates().shape[0]
n_distinct_ids = len(categories.id.unique())
n_rows, n_distinct_rows, n_distinct_ids, n_distinct_rows - n_distinct_ids

(26248, 26216, 26180, 36)

### 2. Merge datasets.
- Merge the messages and categories datasets using the common id
- Assign this combined dataset to `df`, which will be cleaned in the following steps

In [69]:
# merge datasets
# ids that are repeated in both frames multiply on the join, so the result has
# extra rows (some with conflicting labels) that are cleaned up further below
df = pd.merge(messages, categories, on='id')
df.head()

Unnamed: 0,id,message,original,genre,categories
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,related-1;request-0;offer-0;aid_related-0;medi...
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,related-1;request-0;offer-0;aid_related-1;medi...
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,related-1;request-0;offer-0;aid_related-0;medi...
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,related-1;request-0;offer-0;aid_related-0;medi...


In [70]:
df.shape, len(df.id.unique())

((26386, 5), 26180)

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26386 entries, 0 to 26385
Data columns (total 5 columns):
id            26386 non-null int64
message       26386 non-null object
original      10246 non-null object
genre         26386 non-null object
categories    26386 non-null object
dtypes: int64(1), object(4)
memory usage: 1.2+ MB


### 3. Split `categories` into separate category columns.
- Split the values in the `categories` column on the `;` character so that each value becomes a separate column. You'll find [this method](https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.Series.str.split.html) very helpful! Make sure to set `expand=True`.
- Use the first row of categories dataframe to create column names for the categories data.
- Rename columns of `categories` with new column names.

In [72]:
# expand the ';'-separated categories string into one column per category
split_categories = df['categories'].str.split(';', expand=True)
categories_fixed = split_categories
categories_fixed.shape

(26386, 36)

In [73]:
categories_fixed.head(6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
1,related-1,request-0,offer-0,aid_related-1,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-1,floods-0,storm-1,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
2,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
3,related-1,request-1,offer-0,aid_related-1,medical_help-0,medical_products-1,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
4,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
5,related-0,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0


In [74]:
# select the first row of the categories dataframe
row = categories_fixed.iloc[0]

# use this row to extract a list of new column names for categories.
# every entry has the form '<name>-<value>': split once on the last '-' and keep
# the name part, so the result does not depend on the value being one character long
category_colnames = row.str.rsplit('-', n=1).str[0].values
print(category_colnames)

['related' 'request' 'offer' 'aid_related' 'medical_help'
 'medical_products' 'search_and_rescue' 'security' 'military'
 'child_alone' 'water' 'food' 'shelter' 'clothing' 'money'
 'missing_people' 'refugees' 'death' 'other_aid' 'infrastructure_related'
 'transport' 'buildings' 'electricity' 'tools' 'hospitals' 'shops'
 'aid_centers' 'other_infrastructure' 'weather_related' 'floods' 'storm'
 'fire' 'earthquake' 'cold' 'other_weather' 'direct_report']


In [75]:
# rename the columns of `categories`
categories_fixed.columns = category_colnames
categories_fixed.head(6)

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
1,related-1,request-0,offer-0,aid_related-1,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-1,floods-0,storm-1,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
2,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
3,related-1,request-1,offer-0,aid_related-1,medical_help-0,medical_products-1,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
4,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
5,related-0,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0


### 4. Convert category values to just numbers 0 or 1.
- Iterate through the category columns in df to keep only the last character of each string (the 1 or 0). For example, `related-0` becomes `0`, `related-1` becomes `1`. Convert the string to a numeric value.
- You can perform [normal string actions on Pandas Series](https://pandas.pydata.org/pandas-docs/stable/text.html#indexing-with-str), like indexing, by including `.str` after the Series. You may need to first convert the Series to be of type string, which you can do with `astype(str)`.

In [76]:
# some categories are not binary; for instance ['related'] also takes the value 2
has_related_2 = categories_fixed['related'] == 'related-2'
int(has_related_2.sum())

204

In [77]:
# NOTE: a few raw values are not 0/1 (e.g. 'related-2'); they are converted as-is here
for column in categories_fixed:
    # keep the text after the last '-' (the numeric value) instead of a fixed
    # one-character slice, so a multi-digit value would not be truncated
    value_text = categories_fixed[column].str.rsplit('-', n=1).str[-1]

    # convert column from string to numeric with a vectorised cast
    categories_fixed[column] = value_text.astype('int64')

categories_fixed.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 5. Replace `categories` column in `df` with new category columns.
- Drop the categories column from the df dataframe since it is no longer needed.
- Concatenate df and categories data frames.

In [78]:
# shapes look fine
df.shape, categories_fixed.shape

((26386, 5), (26386, 36))

In [79]:
# put the numeric category columns next to the original columns (row-aligned on the index)
frames_to_join = [df, categories_fixed]
df = pd.concat(frames_to_join, axis='columns')
df.head(6)

Unnamed: 0,id,message,original,genre,categories,related,request,offer,aid_related,medical_help,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,related-1;request-0;offer-0;aid_related-0;medi...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,related-1;request-0;offer-0;aid_related-1;medi...,1,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,related-1;request-0;offer-0;aid_related-0;medi...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,related-1;request-0;offer-0;aid_related-0;medi...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,14,Information about the National Palace-,Informtion au nivaux palais nationl,direct,related-0;request-0;offer-0;aid_related-0;medi...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# shape looks fine after concatenation
df.shape

(26386, 41)

In [81]:
# check labels BEFORE removing duplicates
# not every label is binary: child_alone only takes 0 and related takes 0, 1 and 2
label_frame = df.loc[:, 'related':]
for label_name, label_values in label_frame.items():
    print(label_name, label_values.unique())


related [1 0 2]
request [0 1]
offer [0 1]
aid_related [0 1]
medical_help [0 1]
medical_products [0 1]
search_and_rescue [0 1]
security [0 1]
military [0 1]
child_alone [0]
water [0 1]
food [0 1]
shelter [0 1]
clothing [0 1]
money [0 1]
missing_people [0 1]
refugees [0 1]
death [0 1]
other_aid [0 1]
infrastructure_related [0 1]
transport [0 1]
buildings [0 1]
electricity [0 1]
tools [0 1]
hospitals [0 1]
shops [0 1]
aid_centers [0 1]
other_infrastructure [0 1]
weather_related [0 1]
floods [0 1]
storm [0 1]
fire [0 1]
earthquake [0 1]
cold [0 1]
other_weather [0 1]
direct_report [0 1]


### 6. Remove duplicates.
- Check how many duplicates are in this dataset.
- Drop the duplicates.
- Confirm duplicates were removed.

In [33]:
# check number of duplicates
len(df[df.duplicated('id', False)])

273

In [34]:
# find miss_matched_ids
# of the 273 rows that share an id with another row, those belonging to 36 ids
# carry conflicting labels and have to be dropped entirely
# the remaining repeated rows are exact copies and only need to be de-duplicated
# an id is mismatched when it maps to more than one distinct categories string
labels_per_id = df.groupby('id')['categories'].nunique()
miss_matched_ids = labels_per_id[labels_per_id > 1].index.values
miss_matched_ids

array([  202,   862,  1652,  3250,  3882,  4956,  5153,  5776,  6393,
        6492,  6515,  6687,  7747,  7945,  8190, 10286, 11503, 12416,
       12420, 13914, 14135, 15760, 16245, 17385, 17553, 18925, 19003,
       19142, 19687, 22858, 24247, 24347, 25512, 27768, 28462, 29022])

In [35]:
len(miss_matched_ids)

36

In [36]:
# drop rows that are exact copies of an earlier row
# (reassign instead of mutating in place, so the step is explicit about what it produces)
df = df.drop_duplicates()

In [37]:
# check number of duplicates
len(df[df.duplicated('id', False)])

72

In [38]:
# drop mismatched ids/instances
df = df[~df.id.isin(miss_matched_ids)]

In [39]:
# check number of duplicates
len(df[df.duplicated('id', False)])

0

In [40]:
# sanity check: exactly one row should remain for every distinct message id,
# except for the ids that were dropped because of conflicting labels
expected_rows = messages['id'].nunique() - len(miss_matched_ids)
df.shape, df.shape[0] == expected_rows

((26144, 41), True)

In [41]:
# check null values
df.isnull().any()

id                        False
message                   False
original                   True
genre                     False
categories                False
related                   False
request                   False
offer                     False
aid_related               False
medical_help              False
medical_products          False
search_and_rescue         False
security                  False
military                  False
child_alone               False
water                     False
food                      False
shelter                   False
clothing                  False
money                     False
missing_people            False
refugees                  False
death                     False
other_aid                 False
infrastructure_related    False
transport                 False
buildings                 False
electricity               False
tools                     False
hospitals                 False
shops                     False
aid_cent

In [48]:
# check labels AFTER removing duplicates
# not every label is binary: child_alone only takes 0 and related takes 0, 1 and 2
label_frame = df.loc[:, 'related':]
for label_name, label_values in label_frame.items():
    print(label_name, label_values.unique())


related [1 0 2]
request [0 1]
offer [0 1]
aid_related [0 1]
medical_help [0 1]
medical_products [0 1]
search_and_rescue [0 1]
security [0 1]
military [0 1]
child_alone [0]
water [0 1]
food [0 1]
shelter [0 1]
clothing [0 1]
money [0 1]
missing_people [0 1]
refugees [0 1]
death [0 1]
other_aid [0 1]
infrastructure_related [0 1]
transport [0 1]
buildings [0 1]
electricity [0 1]
tools [0 1]
hospitals [0 1]
shops [0 1]
aid_centers [0 1]
other_infrastructure [0 1]
weather_related [0 1]
floods [0 1]
storm [0 1]
fire [0 1]
earthquake [0 1]
cold [0 1]
other_weather [0 1]
direct_report [0 1]


### 7. Save the clean dataset into an sqlite database.
You can do this with pandas [`to_sql` method](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html) combined with the SQLAlchemy library. Remember to import SQLAlchemy's `create_engine` in the first cell of this notebook to use it below.

In [42]:
from sqlalchemy import create_engine

In [43]:
# # save df to sqlite
# NOTE(review): the save step below is commented out, so running this notebook
# does not write the database.
# If it is re-enabled: `to_sql` defaults to if_exists='fail' and raises when the
# table already exists (e.g. on a second run); if_exists='replace' would make the
# cell re-runnable.
# engine = create_engine('sqlite:///../data/Disaster_Response.db')
# df.to_sql('Disaster_Response', engine, index=False)

### 8. Use this notebook to complete `etl_pipeline.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database based on new datasets specified by the user. Alternatively, you can complete `etl_pipeline.py` in the classroom on the `Project Workspace IDE` coming later.

### 9. Visualization components for run.py

In [44]:
# distribution of message lengths (in characters), as percentages over 10 equal-width bins
# lengths above the 99th percentile are left out before binning
message_lengths = df.message.str.len()
length_cutoff = message_lengths.quantile(.99)
kept_lengths = message_lengths[message_lengths <= length_cutoff]
bin_shares = kept_lengths.value_counts(normalize=True, bins=10)
length_percents = (bin_shares.round(decimals=4).sort_index()) * 100
length_percents.index, length_percents.values

(IntervalIndex([(3.579, 46.0], (46.0, 88.0], (88.0, 130.0], (130.0, 172.0], (172.0, 214.0], (214.0, 256.0], (256.0, 298.0], (298.0, 340.0], (340.0, 382.0], (382.0, 424.0]],
               closed='right',
               dtype='interval[float64]'),
 array([ 8.77, 23.88, 21.43, 19.61, 12.3 ,  7.58,  3.33,  1.74,  0.84,
         0.51]))

In [45]:
# generate x-tick labels for plotly based on the Series.value_counts index:
# one 'left-right' label per interval
xticks = ['-'.join(map(str, (interval.left, interval.right))) for interval in length_percents.index]

xticks

['3.579-46.0',
 '46.0-88.0',
 '88.0-130.0',
 '130.0-172.0',
 '172.0-214.0',
 '214.0-256.0',
 '256.0-298.0',
 '298.0-340.0',
 '340.0-382.0',
 '382.0-424.0']

In [46]:
# correlation between labels
labels = df.loc[:,'related':]
cor = labels.corr()
cor.values

array([[ 1.        ,  0.23586469,  0.03505207, ...,  0.07466508,
         0.12255749,  0.25486427],
       [ 0.23586469,  1.        , -0.03050871, ..., -0.0012755 ,
        -0.01544376,  0.6479713 ],
       [ 0.03505207, -0.03050871,  1.        , ..., -0.00965781,
        -0.00562015,  0.01905024],
       ...,
       [ 0.07466508, -0.0012755 , -0.00965781, ...,  1.        ,
         0.07482098,  0.00420615],
       [ 0.12255749, -0.01544376, -0.00562015, ...,  0.07482098,
         1.        ,  0.00326831],
       [ 0.25486427,  0.6479713 ,  0.01905024, ...,  0.00420615,
         0.00326831,  1.        ]])

In [47]:
cor.index

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')