In [1]:
import configuration

from src import crisis_categories
import pandas as pd
import numpy as np

import glob
import os
import re

import warnings

# Display settings: show every column and never truncate cell contents.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# NOTE: this silences *all* warnings for the session, including pandas'
# SettingWithCopyWarning and deprecation notices.
warnings.filterwarnings('ignore')

# Dataset loading: Label merging and Crisis categorization

In [2]:
# Root folder that holds one sub-folder per source dataset.
folder_from = '../data/datasets/'

# The two values every dataset's original labels are mapped onto.
positive_label, negative_label = 'Related', 'Not_Related'

# Column layout each dataset frame is reduced to at the end of its section.
# (hazard_cat / hazard_subcat / development / spread are presumably added by
# crisis_categories.assign_categ -- confirm against that module.)
columns_order = [
    'tweet_id', 'tweet_text', 'dataset', 'crisis', 'country', 'year',
    'original_label', 'mapped_label',
    'hazard_type', 'hazard_cat', 'hazard_subcat', 'development', 'spread',
]

## SoSItalyT4

In [3]:
# Load the raw SoSItalyT4 CSV and preview its first two rows.
italy_data = pd.read_csv(folder_from + 'SoSItalyT4/SoSItalyT4.csv', encoding='utf_8')
italy_data.head(2)

Unnamed: 0,id,text,source,user_screen_name,user_id,latitude,longitude,created_at,disaster,class
0,204044199048380416,"tornare in camera e trovare l'armadio aperto, #creepy #terremoto",Twitter for iPhone,Miaotze,53883,,,2012-05-20T03:01:42.000Z,2,no damage
1,204044968166297601,altra scossa forte. #terremoto,Twitter for iPhone,Miaotze,53883,,,2012-05-20T03:04:46.000Z,2,no damage


In [4]:
# (rows, columns) of the raw SoSItalyT4 frame.
italy_data.shape

(5642, 10)

In [5]:
# Distinct raw labels found in the `class` column.
italy_data['class'].unique()

array(['no damage', 'not relevant', 'damage'], dtype=object)

In [6]:
# Keep only the fields used downstream, switch to the shared column names and
# tag every row with its source dataset and country.
italy_data = (
    italy_data
    .loc[:, ['id', 'created_at', 'text', 'disaster', 'class']]
    .rename(columns={
        'id': 'tweet_id',
        'text': 'tweet_text',
        'disaster': 'crisis',
        'class': 'original_label',
    })
    .assign(dataset='SoSItalyT4', country='Italy')
)

In [7]:
# Distinct integer codes in `crisis` (the renamed `disaster` column).
italy_data.crisis.unique()

array([2, 1, 0, 3], dtype=int64)

In [8]:
# Resolve the integer crisis codes into year, hazard type and crisis name.
# `year` and `hazard_type` are derived first because they read the numeric
# codes that the final `crisis` assignment overwrites.
italy_data['year'] = italy_data.crisis.replace(
    {0: 2013, 1: 2009, 2: 2012, 3: 2014}
)
italy_data['hazard_type'] = italy_data.crisis.replace(
    {0: 'flood', 1: 'earthquake', 2: 'earthquake', 3: 'flood'}
)
italy_data['crisis'] = italy_data.crisis.replace({
    0: 'flood_Sardinia',
    1: 'earthquake_Laquila',
    # 'earthquake_Emilia', also in CrisisLexT26 (presumably named
    # 'earthquake_Italy' so both datasets share one crisis id -- confirm).
    2: 'earthquake_Italy',
    3: 'flood_Genova',
})

# Both damage classes count as crisis-related; only 'not relevant' is negative.
italy_data['mapped_label'] = italy_data.original_label.replace({
    'damage': positive_label,
    'no damage': positive_label,
    'not relevant': negative_label,
})

# Add the hazard category columns, then reduce to the shared column layout.
italy_data = crisis_categories.assign_categ(italy_data)[columns_order]

## ChileEarthquakeT1

In [9]:
# Load the raw ChileEarthquakeT1 CSV and preview its first two rows.
chile_data = pd.read_csv(folder_from + 'Chile earthquake/ChileEarthquakeT1.csv', encoding='utf_8')
chile_data.head(2)

Unnamed: 0.1,Unnamed: 0,n,ID_unit,day,date,time_zone,time,tweet_id,user_id,name,screen_name,friends_count,followers_count,texto,value
0,1,1,7303,sat feb 27,2/27/2010,Madrid,11:40:55,9725643670,96025653,joju Aliaga,piziadas,32,37,fuentes sobre el terremoto en chile http://bit.ly/auedsj,True
1,2,2,10973,sat feb 27,2/27/2010,Madrid,11:46:03,9725759250,93857167,Ecodez,ecodez,0,279,sube el parte de víctimas: las autoridades chilenas informan de que al menos 64 personas fallecieron en las... http://bit.ly/awx73m,True


In [10]:
# (rows, columns) of the raw ChileEarthquakeT1 frame.
chile_data.shape

(2187, 15)

In [11]:
# Distinct raw labels found in the `value` column.
chile_data['value'].unique()

array([ True, False])

In [12]:
# Keep only the fields used downstream and switch to the shared column names.
chile_data = (
    chile_data
    .loc[:, ['tweet_id', 'date', 'texto', 'value']]
    .rename(columns={
        'date': 'created_at',
        'texto': 'tweet_text',
        'value': 'original_label',
    })
)

In [13]:
# The whole file covers a single event, so its metadata is constant; the
# boolean `original_label` becomes the shared binary label.
chile_data = chile_data.assign(
    dataset='ChileEarthquakeT1',
    country='Chile',
    crisis='earthquake_Chile',
    year=2010,
    hazard_type='earthquake',
    mapped_label=chile_data.original_label.replace(
        {True: positive_label, False: negative_label}
    ),
)

In [14]:
# Some tweets were recovered through the Twitter API because their text in the
# original file seems to be truncated at the first comma. For those tweets the
# downloaded `full_text` replaces `tweet_text`.
chile_down = pd.read_csv(folder_from + 'Chile earthquake/Chile_uniques_downloaded.csv', encoding='utf_8')
chile_data = chile_data.join(chile_down.set_index('id')[['full_text']], on='tweet_id')
chile_data['tweet_text'] = chile_data['full_text'].where(
    chile_data.tweet_id.isin(set(chile_down.id)),  # only tweets that were re-downloaded
    chile_data['tweet_text'],
)

# Add the hazard category columns, then reduce to the shared column layout
# (this also drops the helper `full_text` column).
chile_data = crisis_categories.assign_categ(chile_data)[columns_order]

## CrisisLexT6

In [15]:
# Load every CrisisLexT6 CSV and tag its rows with the event name taken from
# the file name (the part before the first '-' / '.').
path = folder_from + "CrisisLexT6/"
# Sorted so the row order does not depend on the OS directory listing order.
allFiles = sorted(glob.glob(path + "/*.csv"))
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_, index_col=None, header=0, encoding='utf_8')
    # os.path.basename follows the platform's separator rules; splitting on
    # '\\' only worked on Windows and left the whole path on POSIX, where the
    # following split('.')[0] then produced an empty event name.
    df['crisis'] = os.path.basename(file_).split('.')[0].split('-')[0]
    list_.append(df)
# ignore_index avoids repeating each file's 0..n-1 row labels in the result.
t6_data = pd.concat(list_, ignore_index=True)

t6_data.head(2)

Unnamed: 0,tweet id,tweet,label,crisis
0,'262596552399396864',I've got enough candles to supply a Mexican family,off-topic,2012_Sandy_Hurricane
1,'263044104500420609',Sandy be soooo mad that she be shattering our doors and shiet #HurricaneSandy,on-topic,2012_Sandy_Hurricane


In [16]:
# (rows, columns) of the concatenated CrisisLexT6 frame.
t6_data.shape

(60082, 4)

In [17]:
# Distinct raw labels; the column name carries a leading space.
t6_data[' label'].unique()

array(['off-topic', 'on-topic'], dtype=object)

In [18]:
# Switch to the shared column names. The leading spaces in ' tweet' and
# ' label' are part of the column names as loaded.
t6_data = t6_data.rename(
    columns={
        'tweet id': 'tweet_id',
        ' tweet': 'tweet_text',
        ' label': 'original_label',
    }
)

In [19]:
# Tag the source dataset and map on-/off-topic onto the shared binary label.
t6_data = t6_data.assign(
    dataset='CrisisLexT6',
    mapped_label=t6_data.original_label.replace(
        {'on-topic': positive_label, 'off-topic': negative_label}
    ),
)

In [20]:
# Per-event metadata keyed by the event name taken from each file name.
# `crisis` itself is rewritten last because the other lookups read its
# original values.
t6_data['country'] = t6_data.crisis.replace({
    '2012_Sandy_Hurricane': 'several',
    '2013_Alberta_Floods': 'Canada',
    '2013_Boston_Bombings': 'EEUU',
    '2013_Oklahoma_Tornado': 'EEUU',
    '2013_Queensland_Floods': 'Australia',
    '2013_West_Texas_Explosion': 'EEUU',
})

t6_data['year'] = t6_data.crisis.replace({
    '2012_Sandy_Hurricane': 2012,
    '2013_Alberta_Floods': 2013,
    '2013_Boston_Bombings': 2013,
    '2013_Oklahoma_Tornado': 2013,
    '2013_Queensland_Floods': 2013,
    '2013_West_Texas_Explosion': 2013,
})

# This section derives no per-tweet timestamp; the year is used instead.
t6_data['created_at'] = t6_data['year']

t6_data['hazard_type'] = t6_data.crisis.replace({
    '2012_Sandy_Hurricane': 'hurricane',
    '2013_Alberta_Floods': 'flood',
    '2013_Boston_Bombings': 'bombings',
    '2013_Oklahoma_Tornado': 'tornado',
    '2013_Queensland_Floods': 'flood',
    '2013_West_Texas_Explosion': 'explosion',
})

t6_data['crisis'] = t6_data.crisis.replace({
    '2012_Sandy_Hurricane': 'hurricane_Sandy',
    '2013_Alberta_Floods': 'flood_Alberta',
    '2013_Boston_Bombings': 'bombing_Boston',
    '2013_Oklahoma_Tornado': 'tornado_Oklahoma',
    '2013_Queensland_Floods': 'flood_Queensland',
    '2013_West_Texas_Explosion': 'explosion_WestTexas',
})

# Add the hazard category columns, then reduce to the shared column layout.
t6_data = crisis_categories.assign_categ(t6_data)[columns_order]

## CrisisLexT26

In [21]:
# Load every CrisisLexT26 CSV and tag its rows with the event name taken from
# the file name (the part before the first '-' / '.').
path = folder_from + "CrisisLexT26/"
# Sorted so the row order does not depend on the OS directory listing order.
allFiles = sorted(glob.glob(path + "/*.csv"))
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_, index_col=None, header=0, encoding='utf_8')
    # os.path.basename follows the platform's separator rules; splitting on
    # '\\' only worked on Windows and left the whole path on POSIX, where the
    # following split('.')[0] then produced an empty event name.
    df['crisis'] = os.path.basename(file_).split('.')[0].split('-')[0]
    list_.append(df)
# ignore_index avoids repeating each file's 0..n-1 row labels in the result.
t26_data = pd.concat(list_, ignore_index=True)
t26_data.head(2)

Unnamed: 0,Tweet ID,Tweet Text,Information Source,Information Type,Informativeness,crisis
0,211040709124440064,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #Florida #Colorado #Iowa #Nevada #Virginia #NV #mlb Travel Destinations---&gt;http://t.co/TIHBJKF2,Not labeled,Not labeled,Not related,2012_Colorado_wildfires
1,211111710294163457,RT @Jack4Ward: Get in on the fun every Thursday with the @csindependent #FunBrief http://ow.ly/br9Wi #CoSprings #Colorado,Not labeled,Not labeled,Not related,2012_Colorado_wildfires


In [22]:
# (rows, columns) of the concatenated CrisisLexT26 frame.
t26_data.shape

(27933, 6)

In [23]:
# Switch to the shared column names (the leading spaces belong to the column
# names as loaded), tag the dataset and map the informativeness labels onto
# the shared binary label.
t26_data = t26_data.rename(columns={
    'Tweet ID': 'tweet_id',
    ' Tweet Text': 'tweet_text',
    ' Informativeness': 'original_label',
})
t26_data = t26_data.assign(
    dataset='CrisisLexT26',
    mapped_label=t26_data.original_label.replace({
        'Related and informative': positive_label,
        'Related - but not informative': positive_label,
        'Not related': negative_label,
        'Not applicable': negative_label,
    }),
)

# Event names start with the year ("2012_Colorado_wildfires" -> 2012).
t26_data['year'] = [int(event.partition('_')[0]) for event in t26_data.crisis]
# This section derives no per-tweet timestamp; the year is used instead.
t26_data['created_at'] = t26_data['year']

In [24]:
# Rename each file-derived event name to the shared "<hazard>_<place>" form.
t26_data['crisis'] = t26_data.crisis.replace({
    '2012_Colorado_wildfires': 'wildfires_Colorado',
    '2012_CostaRica_earthquake': 'earthquake_CostaRica',
    '2012_Guatemala_earthquake': 'earthquake_Guatemala',
    '2012_Italy_earthquakes': 'earthquake_Italy',
    '2012_Philipinnes_floods': 'flood_Philippines',
    '2012_Typhoon_Pablo': 'typhoon_Pablo',
    '2012_Venezuela_refinery': 'explosion_Venezuela',
    '2013_Alberta_floods': 'flood_Alberta',
    '2013_Australia_bushfire': 'bushfire_Australia',
    '2013_Bohol_earthquake': 'earthquake_Bohol',
    '2013_Boston_bombings': 'bombing_Boston',
    '2013_Brazil_nightclubFire': 'nightclubFire_Brazil',
    '2013_Colorado_floods': 'flood_Colorado',
    '2013_Glasgow_helicopterCrash': 'helicopterCrash_Glasgow',
    '2013_LacMegantic_trainCrash': 'trainCrash_LacMegantic',
    '2013_LA_airportShootings': 'airportShootings_LA',
    '2013_Manila_floods': 'flood_Manila',
    '2013_NY_trainCrash': 'trainCrash_NY',
    '2013_Queensland_floods': 'flood_Queensland',
    '2013_Russia_meteor': 'meteor_Russia',
    '2013_Sardinia_floods': 'flood_Sardinia',
    '2013_Savar_buildingCollapse': 'buildingCollapse_Savar',
    '2013_Singapore_haze': 'haze_Singapore',
    '2013_Spain_trainCrash': 'trainCrash_Spain',
    '2013_Typhoon_Yolanda': 'typhoon_Yolanda',
    '2013_WestTexas_explosion': 'explosion_WestTexas',
})

In [25]:
# Country of each event, looked up from the already-renamed crisis id.
t26_data['country'] = t26_data.crisis.replace({
    'wildfires_Colorado': 'EEUU',
    'earthquake_CostaRica': 'Costa Rica',
    'earthquake_Guatemala': 'Guatemala',
    'earthquake_Italy': 'Italy',
    'flood_Philippines': 'Philippines',
    'typhoon_Pablo': 'several',
    'explosion_Venezuela': 'Venezuela',
    'flood_Alberta': 'Canada',
    'bushfire_Australia': 'Australia',
    'earthquake_Bohol': 'Philippines',
    'bombing_Boston': 'EEUU',
    'nightclubFire_Brazil': 'Brazil',
    'flood_Colorado': 'EEUU',
    'helicopterCrash_Glasgow': 'UK',
    'trainCrash_LacMegantic': 'Canada',
    'airportShootings_LA': 'EEUU',
    'flood_Manila': 'Philippines',
    'trainCrash_NY': 'EEUU',
    'flood_Queensland': 'Australia',
    'meteor_Russia': 'Russia',
    'flood_Sardinia': 'Italy',
    'buildingCollapse_Savar': 'Bangladesh',
    'haze_Singapore': 'Singapore',
    'trainCrash_Spain': 'Spain',
    'typhoon_Yolanda': 'several',
    'explosion_WestTexas': 'EEUU',
})

In [26]:
# Hazard type of each event, looked up from the already-renamed crisis id.
t26_data['hazard_type'] = t26_data.crisis.replace({
    'wildfires_Colorado': 'wildfires',
    'earthquake_CostaRica': 'earthquake',
    'earthquake_Guatemala': 'earthquake',
    'earthquake_Italy': 'earthquake',
    'flood_Philippines': 'flood',
    'typhoon_Pablo': 'typhoon',
    'explosion_Venezuela': 'explosion',
    'flood_Alberta': 'flood',
    'bushfire_Australia': 'wildfires',
    'earthquake_Bohol': 'earthquake',
    'bombing_Boston': 'bombings',
    'nightclubFire_Brazil': 'fire',
    'flood_Colorado': 'flood',
    'helicopterCrash_Glasgow': 'crash',
    'trainCrash_LacMegantic': 'derailment',
    'airportShootings_LA': 'shooting',
    'flood_Manila': 'flood',
    'trainCrash_NY': 'derailment',
    'flood_Queensland': 'flood',
    'meteor_Russia': 'meteorite',
    'flood_Sardinia': 'flood',
    'buildingCollapse_Savar': 'collapse',
    'haze_Singapore': 'haze',
    'trainCrash_Spain': 'derailment',
    'typhoon_Yolanda': 'typhoon',
    'explosion_WestTexas': 'explosion',
})

# Add the hazard category columns, then reduce to the shared column layout.
t26_data = crisis_categories.assign_categ(t26_data)[columns_order]

## Ecuador-Earthquake

In [27]:
# Load every Ecuador-earthquake CSV (all columns as strings) and tag its rows
# with a name derived from the file name.
path = folder_from + "Ecuador earthquake/"
# Sorted so the row order does not depend on the OS directory listing order.
allFiles = sorted(glob.glob(path + "/*.csv"))
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_, index_col=None, header=0, encoding='utf_8', dtype=object)
    # os.path.basename follows the platform's separator rules; splitting on
    # '\\' only worked on Windows and left the whole path on POSIX.
    df['crisis'] = os.path.basename(file_).split('_labeled_data')[0]
    list_.append(df)
# ignore_index avoids repeating each file's 0..n-1 row labels in the result.
ecuador_data = pd.concat(list_, ignore_index=True)
ecuador_data.head(2)

Unnamed: 0,id,screen_name,text,url,timestamp,choose_one_category,crisis_related,choose_one_category_a1,choose_one_category_a2,choose_one_category_a3,crisis
0,721627947832979456,ErikaGarza_Tv,The death toll is 77 in Ecuador after a 7.8 earthquake shook the South American country...… https://t.co/xh38pnjAKs,https://www.twitter.com/ErikaGarza_Tv/status/721627947832979456,2016-04-17 09:14:44,injured_or_dead_people,yes,injured_or_dead_people,injured_or_dead_people,injured_or_dead_people,2016_ecuador_eq_en.csv
1,721719744869478400,rr_rr_rr_11,Magnitude-7.8 earthquake hits Ecuador https://t.co/iiFoWwPVEU,https://www.twitter.com/rr_rr_rr_11/status/721719744869478400,2016-04-17 15:19:30,other_useful_information,yes,other_useful_information,other_useful_information,other_useful_information,2016_ecuador_eq_en.csv


In [28]:
# (rows, columns) of the concatenated Ecuador frame.
ecuador_data.shape

(8360, 11)

In [29]:
# Distinct raw labels found in the `crisis_related` column.
ecuador_data.crisis_related.unique()

array(['yes', 'no'], dtype=object)

In [30]:
# Keep only the fields used downstream and switch to the shared column names.
# The file-derived `crisis` tag becomes `lang_dataset`, which frees the
# `crisis` name for the crisis id.
ecuador_data = (
    ecuador_data
    .loc[:, ['id', 'timestamp', 'text', 'crisis_related', 'crisis']]
    .rename(columns={
        'id': 'tweet_id',
        'timestamp': 'created_at',
        'text': 'tweet_text',
        'crisis': 'lang_dataset',
        'crisis_related': 'original_label',
    })
)

In [31]:
# All files cover a single event, so its metadata is constant; the yes/no
# `original_label` becomes the shared binary label.
ecuador_data = ecuador_data.assign(
    dataset='ESPOL_Ecuador_earthquake',
    country='Ecuador',
    year=2016,
    hazard_type='earthquake',
    crisis='earthquake_Ecuador',
    mapped_label=ecuador_data.original_label.replace(
        {'yes': positive_label, 'no': negative_label}
    ),
)

In [32]:
# Drop the rows whose tweet text is missing (2 cases according to the original
# note). `.copy()` gives an independent frame so later column assignments do
# not operate on a slice of the unfiltered data.
ecuador_data = ecuador_data[ecuador_data.tweet_text.notna()].copy()

# Add the hazard category columns, then reduce to the shared column layout
# (this also drops the helper `lang_dataset` column).
ecuador_data = crisis_categories.assign_categ(ecuador_data)
ecuador_data = ecuador_data[columns_order]

## CrisisNlpR1 Crowdflower

In [33]:
# Load every crowdflower-labelled CrisisNLP TSV and tag its rows with the event
# name (the file name up to '_labeled_data').
path = folder_from + "CrisisNLP_labeled_data_crowdflower_v2/"
# Sorted so the row order does not depend on the OS directory listing order.
allFiles = sorted(glob.glob(path + "/*.tsv"))
list_ = []
for file_ in allFiles:
    # NOTE(review): quotechar=' ' looks like a way to keep '"' inside tweets
    # from being parsed as a quote character -- confirm before changing.
    df = pd.read_csv(file_, index_col=None, header=0, sep='\t', encoding='utf_8', quotechar=' ')
    # os.path.basename follows the platform's separator rules; splitting on
    # '\\' only worked on Windows and left the whole directory prefix in the
    # event name on POSIX, so none of the lookups below would match.
    df['crisis'] = os.path.basename(file_).split('_labeled_data')[0]
    list_.append(df)
# ignore_index avoids repeating each file's 0..n-1 row labels in the result.
CrisisNlpR1_data = pd.concat(list_, ignore_index=True)
CrisisNlpR1_data.head(2)

Unnamed: 0,tweet_id,tweet_text,label,crisis
0,'383600460340666369',RT @Faiz_Baluch: #BalochistanEarthQuake Pakistan army is terrorizing the people by aerial firing in #Awaran https://t.co/R3R5ph8zSS http://…,other_useful_information,2013_Pakistan_eq_CF
1,'383790723222364161',#Earthquake 2013-09-28 02:39:43 (M5.0) EAST OF THE SOUTH SANDWICH ISLANDS -59.5 -19.1 (70fa9) http://t.co/uBN98fFmNj notice,other_useful_information,2013_Pakistan_eq_CF


In [34]:
# (rows, columns) of the concatenated crowdflower frame.
CrisisNlpR1_data.shape

(22099, 4)

In [35]:
# Switch the label column to the shared name; tweet_id / tweet_text already match.
CrisisNlpR1_data = CrisisNlpR1_data.rename(columns={"label": "original_label"})

In [36]:
# Tag the source dataset and look up per-event country and year from the
# file-derived event name (still in `crisis` at this point).
CrisisNlpR1_data['dataset'] = 'CrisisNlpR1_CF'

CrisisNlpR1_data['country'] = CrisisNlpR1_data.crisis.replace({
    '2013_Pakistan_eq_CF': 'Pakistan',
    '2014_California_Earthquake_CF': 'EEUU',
    '2014_Chile_Earthquake_cl': 'Chile',
    '2014_Chile_Earthquake_en_CF': 'Chile',
    '2014_ebola_CF': 'several',
    '2014_Hurricane_Odile_Mexico_en_CF': 'Mexico',
    '2014_India_floods_CF': 'India',
    '2014_MERS_en_CF': 'several',
    '2014_Pakistan_floods_CF': 'Pakistan',
    '2014_Philippines_Typhoon_Hagupit_en_CF': 'Philippines',
    '2015_Cyclone_Pam_en_CF': 'Vanuatu',
    '2015_Nepal_Earthquake_en_CF': 'Nepal',
})

# Years are stored as ints, matching the int years the other dataset sections
# assign; the original used strings here, which would leave `year` with mixed
# types once the datasets are combined.
CrisisNlpR1_data['year'] = CrisisNlpR1_data.crisis.replace({
    '2013_Pakistan_eq_CF': 2013,
    '2014_California_Earthquake_CF': 2014,
    '2014_Chile_Earthquake_cl': 2014,
    '2014_Chile_Earthquake_en_CF': 2014,
    '2014_ebola_CF': 2014,
    '2014_Hurricane_Odile_Mexico_en_CF': 2014,
    '2014_India_floods_CF': 2014,
    '2014_MERS_en_CF': 2014,
    '2014_Pakistan_floods_CF': 2014,
    '2014_Philippines_Typhoon_Hagupit_en_CF': 2014,
    '2015_Cyclone_Pam_en_CF': 2015,
    '2015_Nepal_Earthquake_en_CF': 2015,
})
# This section derives no per-tweet timestamp; the year is used instead.
CrisisNlpR1_data['created_at'] = CrisisNlpR1_data['year']

In [37]:
# Hazard type of each event, looked up from the file-derived event name.
CrisisNlpR1_data['hazard_type'] = CrisisNlpR1_data.crisis.replace({
    '2013_Pakistan_eq_CF': 'earthquake',
    '2014_California_Earthquake_CF': 'earthquake',
    '2014_Chile_Earthquake_cl': 'earthquake',
    '2014_Chile_Earthquake_en_CF': 'earthquake',
    '2014_ebola_CF': 'viral_disease',
    '2014_Hurricane_Odile_Mexico_en_CF': 'hurricane',
    '2014_India_floods_CF': 'flood',
    '2014_MERS_en_CF': 'viral_disease',
    '2014_Pakistan_floods_CF': 'flood',
    '2014_Philippines_Typhoon_Hagupit_en_CF': 'typhoon',
    '2015_Cyclone_Pam_en_CF': 'cyclone',
    '2015_Nepal_Earthquake_en_CF': 'earthquake',
})

In [38]:
# Replace the file-derived event name with the shared crisis id. Both Chile
# files map to the same id.
CrisisNlpR1_data['crisis'] = CrisisNlpR1_data.crisis.replace({
    '2013_Pakistan_eq_CF': 'earthquake_Pakistan',
    '2014_California_Earthquake_CF': 'earthquake_California',
    '2014_Chile_Earthquake_cl': 'earthquake_Chile',
    '2014_Chile_Earthquake_en_CF': 'earthquake_Chile',
    '2014_ebola_CF': 'ebola_virus',
    '2014_Hurricane_Odile_Mexico_en_CF': 'hurricane_Mexico_Odile',
    '2014_India_floods_CF': 'flood_India',
    '2014_MERS_en_CF': 'Middle_East_Respiratory_Syndrome',
    '2014_Pakistan_floods_CF': 'flood_Pakistan',
    '2014_Philippines_Typhoon_Hagupit_en_CF': 'typhoon_Philippines_Hagupit',
    '2015_Cyclone_Pam_en_CF': 'cyclone_Vanuatu_Pam',
    '2015_Nepal_Earthquake_en_CF': 'earthquake_Nepal',
})

In [39]:
# Original labels that count as "not related"; every other label is positive.
nl1_cf_notrelevant = ['not_related_or_irrelevant']

CrisisNlpR1_data['mapped_label'] = np.where(
    ~CrisisNlpR1_data.original_label.isin(nl1_cf_notrelevant),
    positive_label,
    negative_label,
)

# Add the hazard category columns, then reduce to the shared column layout.
CrisisNlpR1_data = crisis_categories.assign_categ(CrisisNlpR1_data)[columns_order]

## CrisisNlpR1 Volunteer

In [40]:
# Load every volunteer-labelled CrisisNLP CSV and tag its rows with the event
# name (the file name without '.csv').
path = folder_from + "CrisisNLP_volunteers_labeled_data/"
# Sorted so the row order does not depend on the OS directory listing order.
allFiles = sorted(glob.glob(path + "/*.csv"))
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_, index_col=None, header=0, encoding='utf_8')
    # os.path.basename follows the platform's separator rules; splitting on
    # '\\' only worked on Windows and left the whole directory prefix in the
    # event name on POSIX, so none of the lookups below would match.
    df['crisis'] = os.path.basename(file_).split('.csv')[0]
    list_.append(df)
# ignore_index avoids repeating each file's 0..n-1 row labels in the result.
CrisisNlpR1_volunteer = pd.concat(list_, ignore_index=True)
CrisisNlpR1_volunteer.head(2)

Unnamed: 0,tweet_id,tweet_time,tweet_author,tweet_author_id,tweet_language,tweet_lon,tweet_lat,tweet_text,tweet_url,label,crisis
0,'503866345822244864',Mon Aug 25 11:27:52 +0000 2014,NewsYouN2Know,2420642647,en,,,"å_ USA 592 Earthquake Earthquake rattles, doesn t wreck Napa wineries news24lhot Georgia USA Augusta-Richmond... http://t.co/S8EMU5FrXH",https://twitter.com/NewsYouN2Know/status/503866345822244864,Infrastructure and utilities,2014_California_Earthquake
1,'503866373961809920',Mon Aug 25 11:27:59 +0000 2014,debbyxme,2300611952,en,,,‰Û¢ ‰Û¢ Earthquake #Earthquake California Earthquake San Francisco 428 ‰Û¢ Northern California Stru‰Û_... http://t.co/WG0ufzf2W1 #Earthquake,https://twitter.com/debbyxme/status/503866373961809920,Other relevant information,2014_California_Earthquake


In [41]:
# (rows, columns) of the concatenated volunteer frame.
CrisisNlpR1_volunteer.shape

(27497, 11)

In [42]:
# Switch to the shared column names; the raw text column is named
# ' tweet_text' with a leading space.
CrisisNlpR1_volunteer = CrisisNlpR1_volunteer.rename(
    columns={
        'label': 'original_label',
        ' tweet_text': 'tweet_text',
    }
)

In [43]:
# Drop the rows labelled 'Animal management' (a single tweet according to the
# original note).
CrisisNlpR1_volunteer = CrisisNlpR1_volunteer.loc[
    CrisisNlpR1_volunteer['original_label'].ne('Animal management')
]

# Original labels that count as "not related"; the spelling and casing
# variants are listed separately.
nl1_vol_notrelevant = [
    'Not related or irrelevant',
    'Not related to crisis',
    'Not informative',
    'No',
    'Not Informative',
    'Not relevant',
    'Not Relevant',
    'Not physical landslide',
]

In [44]:
# Every label not listed in nl1_vol_notrelevant counts as positive.
CrisisNlpR1_volunteer['mapped_label'] = np.where(
    CrisisNlpR1_volunteer.original_label.isin(nl1_vol_notrelevant),
    negative_label, positive_label)

CrisisNlpR1_volunteer['dataset'] = 'CrisisNlpR1_Vol'

# Per-event country, looked up from the file-derived event name.
CrisisNlpR1_volunteer['country'] = CrisisNlpR1_volunteer.crisis.replace({
    '2014_California_Earthquake': 'EEUU',
    '2014_chile_earthquake_cl': 'Chile',
    '2014_Chile_Earthquake_en': 'Chile',
    '2014_Hurricane_Odile_Mexico_en': 'Mexico',
    '2014_Iceland_Volcano_en': 'Iceland',
    '2014_Malaysia_Airline_MH370_en': 'Malaysia',
    '2014_Middle_East_Respiratory_Syndrome_en': 'several',
    '2014_Typhoon_Hagupit_en': 'Philippines',
    '2015_Cyclone_Pam_en': 'Vanuatu',
    '2015_Nepal_Earthquake_en': 'Nepal',
    'Landslides_Worldwide_en': 'several',
    'Landslides_Worldwide_esp': 'several',
    'LandSlides_Worldwide_fr': 'several',
})

# Years are stored as ints, matching the int years the other dataset sections
# assign; the original used strings here, which would leave `year` with mixed
# types once the datasets are combined.
# NOTE(review): the English landslide file is dated 2014 while the Spanish and
# French ones are dated 2015 -- values kept as in the original; confirm.
CrisisNlpR1_volunteer['year'] = CrisisNlpR1_volunteer.crisis.replace({
    '2014_California_Earthquake': 2014,
    '2014_chile_earthquake_cl': 2014,
    '2014_Chile_Earthquake_en': 2014,
    '2014_Hurricane_Odile_Mexico_en': 2014,
    '2014_Iceland_Volcano_en': 2014,
    '2014_Malaysia_Airline_MH370_en': 2014,
    '2014_Middle_East_Respiratory_Syndrome_en': 2014,
    '2014_Typhoon_Hagupit_en': 2014,
    '2015_Cyclone_Pam_en': 2015,
    '2015_Nepal_Earthquake_en': 2015,
    'Landslides_Worldwide_en': 2014,
    'Landslides_Worldwide_esp': 2015,
    'LandSlides_Worldwide_fr': 2015,
})

# This dataset has a real per-tweet timestamp (column name has a leading space).
CrisisNlpR1_volunteer['created_at'] = CrisisNlpR1_volunteer[' tweet_time']

In [45]:
# Hazard type of each event, looked up from the file-derived event name.
CrisisNlpR1_volunteer['hazard_type'] = CrisisNlpR1_volunteer.crisis.replace({
    '2014_California_Earthquake': 'earthquake',
    '2014_chile_earthquake_cl': 'earthquake',
    '2014_Chile_Earthquake_en': 'earthquake',
    '2014_Hurricane_Odile_Mexico_en': 'hurricane',
    '2014_Iceland_Volcano_en': 'volcano',
    '2014_Malaysia_Airline_MH370_en': 'crash',
    '2014_Middle_East_Respiratory_Syndrome_en': 'viral_disease',
    '2014_Typhoon_Hagupit_en': 'typhoon',
    '2015_Cyclone_Pam_en': 'cyclone',
    '2015_Nepal_Earthquake_en': 'earthquake',
    'Landslides_Worldwide_en': 'landslide',
    'Landslides_Worldwide_esp': 'landslide',
    'LandSlides_Worldwide_fr': 'landslide',
})

In [46]:
# Replace the file-derived event name with the shared crisis id. The two Chile
# files and the three landslide files each collapse to a single id.
CrisisNlpR1_volunteer['crisis'] = CrisisNlpR1_volunteer.crisis.replace({
    '2014_California_Earthquake': 'earthquake_California',
    '2014_chile_earthquake_cl': 'earthquake_Chile',
    '2014_Chile_Earthquake_en': 'earthquake_Chile',
    '2014_Hurricane_Odile_Mexico_en': 'hurricane_Mexico_Odile',
    '2014_Iceland_Volcano_en': 'Volcano_Iceland',
    '2014_Malaysia_Airline_MH370_en': 'Airline_MH370_Malaysia',
    '2014_Middle_East_Respiratory_Syndrome_en': 'Middle_East_Respiratory_Syndrome',
    '2014_Typhoon_Hagupit_en': 'typhoon_Philippines_Hagupit',
    '2015_Cyclone_Pam_en': 'cyclone_Vanuatu_Pam',
    '2015_Nepal_Earthquake_en': 'earthquake_Nepal',
    'Landslides_Worldwide_en': 'Landslides_Worldwide',
    'Landslides_Worldwide_esp': 'Landslides_Worldwide',
    'LandSlides_Worldwide_fr': 'Landslides_Worldwide',
})

# Add the hazard category columns, then reduce to the shared column layout.
CrisisNlpR1_volunteer = crisis_categories.assign_categ(CrisisNlpR1_volunteer)[columns_order]

## CrisisMMD

In [47]:
# Load every CrisisMMD "agreed label" TSV split into one frame.
path = folder_from + "CrisisMMD_v2.0/crisismmd_datasplit_agreed_label/"
# Sorted so the row order -- and therefore which duplicate the later
# drop_duplicates(keep='first') retains -- does not depend on the OS
# directory listing order.
allFiles = sorted(glob.glob(path + "/*.tsv"))
list_ = []
for file_ in allFiles:
    # NOTE(review): quotechar=' ' looks like a way to keep '"' inside tweets
    # from being parsed as a quote character -- confirm before changing.
    df = pd.read_csv(file_, index_col=None, header=0, sep='\t', encoding='utf_8', quotechar=' ')
    list_.append(df)
# ignore_index avoids repeating each file's 0..n-1 row labels in the result.
mmd = pd.concat(list_, ignore_index=True)
mmd.head(2)

Unnamed: 0,event_name,tweet_id,image_id,tweet_text,image,label,label_text,label_image,label_text_image
0,hurricane_harvey,905064623199719425,905064623199719425_0,We've lost track of how many houses/families @NEHBC teams have helped. So proud of our church. #Harvey https://t.co/wBkKvUt9vw,data_image/hurricane_harvey/5_9_2017/905064623199719425_0.jpg,informative,informative,informative,Positive
1,hurricane_maria,922857566220283904,922857566220283904_0,The gym above subway got destroyed. The cement wall got blown out. #HurricaneMaria https://t.co/GS0tKhvR30,data_image/hurricane_maria/24_10_2017/922857566220283904_0.jpg,informative,informative,informative,Positive


In [48]:
# Size of the concatenated frame before any de-duplication; rows that share a
# tweet_id are still counted separately here (they are dropped in a later cell).
mmd.shape

(12708, 9)

In [49]:
# Inspect the values of label_text: this is the column that is kept as
# original_label and mapped to Related / Not_Related below (label_image is
# not used by this notebook).
mmd.label_text.unique()

array(['informative', 'not_informative'], dtype=object)

In [50]:
# Keep the text-level columns, rename them to the unified schema and leave a
# single row per tweet_id (the first one encountered).
mmd = (
    mmd[['tweet_id', 'tweet_text', 'label_text', 'event_name']]
    .rename(columns={"label_text": "original_label", "event_name": "crisis"})
    .drop_duplicates(subset=['tweet_id'], keep='first')
    .reset_index(drop=True)
)
mmd.shape

(11400, 4)

In [51]:
# Constant metadata for this dataset plus the binary label mapping.
mmd_label_map = {'informative': positive_label, 'not_informative': negative_label}

mmd = mmd.assign(
    dataset='CrisisMMD',
    year=2017,
    # No timestamp is kept for this dataset; created_at just repeats the year.
    created_at=lambda frame: frame['year'],
    mapped_label=lambda frame: frame['original_label'].replace(mmd_label_map),
)

In [52]:
# Raw CrisisMMD event name -> country label.
mmd_event_country = {
    'california_wildfires': 'EEUU',
    'hurricane_harvey': 'EEUU',
    'hurricane_irma': 'several',
    'hurricane_maria': 'several',
    'iraq_iran_earthquake': 'several',
    'mexico_earthquake': 'Mexico',
    'srilanka_floods': 'Sri Lanka',
}

# Raw CrisisMMD event name -> normalised crisis name.
mmd_event_crisis = {
    'california_wildfires': 'wildfires_California',
    'hurricane_harvey': 'hurricane_Harvey',
    'hurricane_irma': 'hurricane_Irma',
    'hurricane_maria': 'hurricane_Maria',
    'iraq_iran_earthquake': 'earthquake_Iraq_Iran',
    'mexico_earthquake': 'earthquake_Mexico',
    'srilanka_floods': 'flood_SriLanka',
}

# The country must be derived first: it is keyed on the raw event names,
# which the second assignment overwrites.
mmd['country'] = mmd['crisis'].replace(mmd_event_country)
mmd['crisis'] = mmd['crisis'].replace(mmd_event_crisis)

In [53]:
# Normalised crisis name -> hazard type.
mmd_crisis_hazard = {
    'wildfires_California': 'wildfires',
    'hurricane_Harvey': 'hurricane',
    'hurricane_Irma': 'hurricane',
    'hurricane_Maria': 'hurricane',
    'earthquake_Iraq_Iran': 'earthquake',
    'earthquake_Mexico': 'earthquake',
    'flood_SriLanka': 'flood',
}
mmd['hazard_type'] = mmd['crisis'].replace(mmd_crisis_hazard)

# assign_categ is a project helper (presumably it derives the remaining
# hazard columns from hazard_type -- confirm in src/crisis_categories).
mmd_categorised = crisis_categories.assign_categ(mmd)
# Keep only the unified column layout shared by all datasets.
mmd = mmd_categorised[columns_order]

# Removing duplicates by ID

In [54]:
# Stack every per-dataset frame into one table with a fresh 0..n-1 index.
all_datasets = [
    italy_data,
    chile_data,
    t6_data,
    t26_data,
    ecuador_data,
    CrisisNlpR1_data,
    CrisisNlpR1_volunteer,
    mmd,
]
data = pd.concat(all_datasets, ignore_index=True)

# Overall size and number of distinct (crisis, country, year) combinations.
n_crisis_groups = len(data.groupby(['crisis', 'country', 'year']).count())
data.shape, n_crisis_groups

((165197, 13), 54)

In [55]:
def join_l(l):
    """Return the items of ``l`` as one string, separated by ' -- '.

    Every item is converted with ``str`` first, so non-string values are
    accepted. An empty iterable yields an empty string.
    """
    return " -- ".join(map(str, l))

# Overview of how original labels were mapped, per dataset.
# count()[[]] discards all value columns, so only the distinct
# (dataset, mapped_label, original_label) combinations remain in the index;
# reset_index then moves the two label levels back into columns and leaves
# `dataset` as the index.
g = data.groupby(['dataset', 'mapped_label', 'original_label']).count()[[]].reset_index(
    ['mapped_label', 'original_label'])

# One row per dataset, one column per mapped label; each cell lists the
# original labels that were mapped to it, concatenated by join_l.
g.pivot_table(values='original_label', index=g.index, 
              columns='mapped_label', aggfunc={'original_label':join_l})

mapped_label,Not_Related,Related
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
ChileEarthquakeT1,False,True
CrisisLexT26,Not applicable -- Not related,Related - but not informative -- Related and informative
CrisisLexT6,off-topic,on-topic
CrisisMMD,not_informative,informative
CrisisNlpR1_CF,not_related_or_irrelevant,affected_people -- caution_and_advice -- deaths_reports -- disease_signs_or_symptoms -- disease_transmission -- displaced_people_and_evacuations -- donation_needs_or_offers_or_volunteering_services -- infrastructure_and_utilities_damage -- injured_or_dead_people -- missing_trapped_or_found_people -- other_useful_information -- prevention -- sympathy_and_emotional_support -- treatment
CrisisNlpR1_Vol,No -- Not Informative -- Not Relevant -- Not informative -- Not physical landslide -- Not related or irrelevant -- Not related to crisis -- Not relevant,"Caution and advice -- Displaced people -- Donations of money -- Donations of supplies and/or volunteer work -- Humanitarian Aid Provided -- Informative -- Infrastructure -- Infrastructure Damage -- Infrastructure and utilities -- Infrastructure damage -- Injured and dead -- Injured or dead people -- Missing, trapped, or found people -- Money -- Needs of those affected -- Non-government -- Other Relevant Information -- Other relevant -- Other relevant information -- Other useful information -- People missing or found -- Personal -- Personal only -- Personal updates -- Personal updates, sympathy, support -- Physical landslide -- Praying -- Requests for Help/Needs -- Response Efforts -- Response efforts -- Shelter and supplies -- Sympathy and emotional support -- Traditional media -- Urgent Needs -- Volunteer or professional services -- Yes"
ESPOL_Ecuador_earthquake,no,yes
SoSItalyT4,not relevant,damage -- no damage


In [56]:
# Helper used to turn tweet_id values into plain digit strings
def str_tweet(tweet):
    """Return the string form of ``tweet`` with every non-digit character removed.

    The result is still a string (possibly empty when the input contains no
    digits); the conversion to an integer dtype is done by the caller.
    """
    as_text = str(tweet)
    digits_only = re.sub(r"\D", "", as_text)
    return digits_only

# Strip every non-digit character from the ids, then store them as 64-bit integers.
data['tweet_id'] = data['tweet_id'].apply(str_tweet).astype(np.int64)

In [57]:
# Remove repeated tweet_id values, keeping the first occurrence of each id
# in the current row order of `data`; the original index labels are kept.
data = data.drop_duplicates(subset=['tweet_id'], keep='first')
data.shape

(164625, 13)

# Language detection

In [None]:
# Dependencies for the language-detection section.
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm, tqdm_notebook

import langid
from langdetect import detect
from langdetect import DetectorFactory

# langdetect is non-deterministic by default: the same text can be assigned
# different languages on different runs. Fixing the seed before the first
# detection makes the language votes (and the exported file) reproducible.
DetectorFactory.seed = 0

import fasttext
# Pre-trained fastText language-identification model, loaded once and reused
# by detect_fasttext.
detector_fasttext = fasttext.load_model('../data/pretrained_models/lang_identification/fasttext lid.176.bin')

from src import clean_tweet
from ftfy import fix_text
import pandas as pd

# Registers Series/DataFrame.progress_apply, used in the cells below.
tqdm.pandas()

## Cleaning the text

In [59]:
# Repair text-encoding problems in the raw tweets with ftfy's fix_text
data['fixed_text'] = data['tweet_text'].progress_apply(fix_text)

100%|███████████████████████████████████████████████████████████████████████| 164625/164625 [00:11<00:00, 14915.54it/s]


In [60]:
# Two cleaned variants of every tweet are derived from fixed_text:
#  - fixed_clean_lan: input for language detection; maintains capital letters
#    (except when the whole tweet is in uppercase), numbers and some punctuation.
#  - fixed_clean_total: removes retweet symbols, urls, users, emojis, hashtags,
#    cashtags, numbers, symbols... and lowercases the text.
cleaning_steps = (
    ('fixed_clean_lan', clean_tweet.clean_tweet_lan, False),
    ('fixed_clean_total', clean_tweet.clean_tweet_totally, True),
)
for cleaned_col, cleaner, lowercase in cleaning_steps:
    data[cleaned_col] = [cleaner(tweet, lower=lowercase) for tweet in tqdm_notebook(data.fixed_text)]
    # Force a string dtype so later len()/split() calls work on every row.
    data[cleaned_col] = data[cleaned_col].astype(str)

HBox(children=(FloatProgress(value=0.0, max=164625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=164625.0), HTML(value='')))




In [61]:
# For both cleaned variants, store the number of characters and the number of
# whitespace-separated words. The outer loop runs over the measure so the new
# columns are created in the order chars_* first, words_* second.
text_measures = (
    ('chars', len),
    ('words', lambda tweet: len(tweet.split())),
)
for prefix, measure in text_measures:
    for text_col in ('fixed_clean_lan', 'fixed_clean_total'):
        data[prefix + '_' + text_col] = [measure(tweet) for tweet in tqdm_notebook(data[text_col])]

HBox(children=(FloatProgress(value=0.0, max=164625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=164625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=164625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=164625.0), HTML(value='')))




In [75]:
# Spot-check the cleaning steps on a few tweets. A fixed random_state keeps
# the displayed rows identical across re-runs of the notebook.
data[['tweet_text', 'fixed_text', 'fixed_clean_lan', 'fixed_clean_total']].sample(3, random_state=0)

Unnamed: 0,tweet_text,fixed_text,fixed_clean_lan,fixed_clean_total
39364,@cassafrass11 haha I just kept hitting refresh starting at 9:58,@cassafrass11 haha I just kept hitting refresh starting at 9:58,haha I just kept hitting refresh starting at 9 58,haha i just kept hitting refresh starting at
69631,Canal de Costa Rica transmite en directo primeras imágenes del terremoto. Mira aquí http://t.co/TyC2vbFC,Canal de Costa Rica transmite en directo primeras imágenes del terremoto. Mira aquí http://t.co/TyC2vbFC,Canal de Costa Rica transmite en directo primeras imágenes del terremoto. Mira aquí,canal de costa rica transmite en directo primeras imágenes del terremoto mira aquí
55100,Flood in Maryborough in Queensland Australia http://t.co/s5myMxp7,Flood in Maryborough in Queensland Australia http://t.co/s5myMxp7,Flood in Maryborough in Queensland Australia,flood in maryborough in queensland australia


## Detecting language

In [63]:
def detect_fasttext(text):
    """Return the first label predicted by the fastText model for ``text``.

    The '__label__' prefix of the predicted label is removed, so only the
    trailing code is returned. Relies on the module-level ``detector_fasttext``.
    """
    top_label = detector_fasttext.predict(text)[0][0]
    _, _, code = top_label.rpartition('__label__')
    return code

def detect_langid(text):
    """Return the first element of ``langid.classify(text)`` (the predicted language)."""
    prediction = langid.classify(text)
    return prediction[0]

def detect_langdetect(text):
    """Return the language reported by langdetect, or 'error' on any failure.

    Every exception raised by ``detect`` is deliberately mapped to the
    sentinel string 'error' so that a single undetectable tweet does not
    abort the run over the whole dataset.
    """
    try:
        language = detect(text)
    except Exception:
        language = 'error'
    return language

In [64]:
# Language vote 1/3: fastText, applied to the language-detection variant of the text
data['lan_fasttext'] = data['fixed_clean_lan'].progress_apply(detect_fasttext)

100%|███████████████████████████████████████████████████████████████████████| 164625/164625 [00:02<00:00, 56672.61it/s]


In [65]:
# Language vote 2/3: langid, applied to the language-detection variant of the text
data['lan_langid'] = data['fixed_clean_lan'].progress_apply(detect_langid)

100%|█████████████████████████████████████████████████████████████████████████| 164625/164625 [04:56<00:00, 555.80it/s]


In [66]:
# Language vote 3/3: langdetect, applied to the language-detection variant of the text
data['lan_langdetect'] = data['fixed_clean_lan'].progress_apply(detect_langdetect)

100%|█████████████████████████████████████████████████████████████████████████| 164625/164625 [11:02<00:00, 248.41it/s]


## Voting language

In [72]:
def top_langs(corpus, n=None):
    """Tally language votes and return them from most to least frequent.

    Each element of ``corpus`` is counted as exactly one vote. The previous
    implementation tokenised the votes with ``CountVectorizer``, whose default
    token pattern splits hyphenated codes (e.g. 'zh-cn' became the two tokens
    'zh' and 'cn'), so a single detector could cast two votes and inflate the
    total used by the majority rule. It also fitted a new vectorizer for
    every row, which is far more work than counting a handful of strings.

    Parameters
    ----------
    corpus : iterable
        Language codes, one per detector. Values are converted with ``str``
        and lower-cased (the vectorizer lower-cased its input as well).
    n : int or None, optional
        Keep only the first ``n`` entries; ``None`` keeps all of them.

    Returns
    -------
    list of (str, int)
        ``(code, votes)`` pairs sorted by decreasing number of votes; codes
        with the same number of votes keep their order of first appearance.
    """
    from collections import Counter  # local import keeps this cell self-contained

    votes = Counter(str(code).lower() for code in corpus)
    # most_common() sorts stably by count; slicing mirrors the original [:n].
    return votes.most_common()[:n]

In [76]:
lang_cols = ['lan_fasttext', 'lan_langid', 'lan_langdetect']

# One tally per tweet, built from the three detector outputs of that row.
data['vote_lan'] = list(map(top_langs, data[lang_cols].values))

# The most voted language wins only with a strict majority of all counted
# votes; otherwise the tweet is flagged as 'no_agree'.
final_languages = []
for vote in data.vote_lan.values:
    winner, winner_votes = vote[0]
    total_votes = sum(count for _, count in vote)
    if winner_votes > total_votes / 2:
        final_languages.append(winner)
    else:
        final_languages.append('no_agree')
data['lan_final'] = final_languages

In [77]:
# Tweets left with no text after full cleaning (only numbers, hashtags, users...)
has_no_text = data.chars_fixed_clean_total == 0
# Tweets reduced to a single word of fewer than four characters
is_one_short_word = (data.words_fixed_clean_total == 1) & data.chars_fixed_clean_total.isin([1, 2, 3])

# Too little text to trust any detector: override the voted language.
data.loc[has_no_text | is_one_short_word, 'lan_final'] = 'no_text'

In [3]:
# Columns written to the unified dataset file, in output order.
cols = [
    'tweet_id', 'tweet_text', 'fixed_text', 'dataset',
    'crisis', 'country', 'year', 'original_label', 'mapped_label',
    'hazard_type', 'hazard_cat', 'hazard_subcat', 'development',
    'spread', 'vote_lan', 'lan_final',
]

unified = data[cols]
unified.to_csv('../data/unified/Unified_MultiCrisis_Dataset.csv', index=False)
unified.shape

(164625, 16)

In [15]:
# Number of distinct (crisis, country, year) combinations left in the final data
len(data.groupby(['crisis', 'country', 'year']).count())

53