# 2. Build Disaster Corpus

Build a disaster corpus using related words in training set and word2vec.

## 2.1 Data Cleaning and NLP

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction import stop_words 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

In [2]:
# combine data 
campfire = pd.read_csv('./data/campfire_tweets.csv')
carrfire = pd.read_csv('./data/carrfire_tweets.csv')
hurricane = pd.read_csv('./data/hurricane_tweets.csv')

In [3]:
# remove iirlated column in hurricane
hurricane.drop(columns=['latitute','longitute'], inplace=True)

In [4]:
# combine all three files
disaster_tweets = pd.concat([campfire, carrfire, hurricane])
disaster_tweets.head()

Unnamed: 0,created_at,favorite_count,hashtags,retweet_count,text
0,Thu Nov 08 23:59:59 +0000 2018,0,['CampFire'],0,RT @RealJamesWoods: Any info? Call 435-238-073...
1,Thu Nov 08 23:59:59 +0000 2018,0,['CampFire'],0,RT @RealJamesWoods: IMPORTANT!!! 911 overwhelm...
2,Thu Nov 08 23:59:59 +0000 2018,10,[],0,Think it’s getting to the point where it may b...
3,Thu Nov 08 23:59:59 +0000 2018,0,"['BREAKING', 'CampFire']",0,RT @nbcbayarea: #BREAKING: Fast-moving #CampFi...
4,Thu Nov 08 23:59:58 +0000 2018,0,['CampFire'],0,RT @DaniD0909: My BIL was stuck in a safe zone...


In [5]:
disaster_tweets.shape

(6396, 5)

In [6]:
# drop duplicate rows based on text
disaster_tweets.drop_duplicates(subset ="text", keep = False, inplace = True)

In [7]:
# 1397 unique tweets
len(disaster_tweets)

1397

In [8]:
# store the data for further use
%store disaster_tweets

Stored 'disaster_tweets' (DataFrame)


### Pull words in tweets

In [9]:
# use regularexpression to do lemmatize
# use countvectorizer to tokenize, lemmatize, and exclude stopwords 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        tokenizer = RegexpTokenizer('(?u)\\b\\w\\w+\\b')
        return [self.wnl.lemmatize(t, 'v') for t in tokenizer.tokenize(doc)]

In [10]:
# pull words using countvectorizer
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                            preprocessor = None,
                            stop_words = 'english',
                            max_features = 1500,
                            ngram_range= (1,2),
                            analyzer = 'word', 
                            min_df=1) 

tweets_words= disaster_tweets['text']
tweets_words = vectorizer.fit_transform(tweets_words)
# convert into a dataframe
tweets_words= pd.DataFrame(tweets_words.toarray(), columns=vectorizer.get_feature_names())

  'stop_words.' % sorted(inconsistent))


In [11]:
# store the data for EDA
%store tweets_words

Stored 'tweets_words' (DataFrame)


### Count word frequency as a reference

In [12]:
# check words and word_count as a reference for us to create disaster related corpus
tweets_word_count = pd.DataFrame(tweets_words.sum(), index=vectorizer.get_feature_names(), columns=['word_count']).sort_values(by='word_count', ascending=False)
tweets_word_count = tweets_word_count.reset_index()


In [13]:
# export to csv for a clear view of related words
tweets_word_count.to_csv('./data/tweets_word_count.csv', index=False)

In [14]:
# remove word that doesn't make sense or completely useless
self_defined_stop_words = ['https', 'rt', 'amp','hurricaneharvey https','299','rd','hwy','hwy 299',
                           'powerhouse rd','000', '0','just','299 carr','sr','200 amp','299w','tx',
                           'sr 299w', 'htt', 'gt', 've', 'll', 'jjwatt', 'st','ht','ho','nra','th',
                           'lt','gt gt', '500 500']
                           

tweets_word_count = tweets_word_count[~tweets_word_count['index'].isin(self_defined_stop_words)]

In [15]:
%store self_defined_stop_words

Stored 'self_defined_stop_words' (list)


## 2.2 Build Disaster Corpus

In [16]:
# tweets_word_count after removing self_defined_stop_words
tweets_word_count.head()

Unnamed: 0,index,word_count
1,hurricaneharvey,851
3,carrfire,238
4,help,212
7,campfire,138
8,houston,128


In [17]:
# create a list of words related to disasters from tweets_word_count
tweets_word_count_list = [row.split(',') for row in tweets_word_count['index']]

### Check word frequency as a reference

Check Word Frequency as a reference to build a basic help_words list.

In [18]:
# picked up words based on tweets_count_word
# some interesting findings: 'please help' and 'pleasehelp' both exists
 
help_words = [# words in tweets_count_word
              'help', 'victims', 'flood', 'donations', 'donate', 'shelter','redcross',
              # self-added words
              'please help','pleasehelp','need help','needhelp','assistance',
              'support','need support','lose home','red across', 'lost',
              'dab','debris','disaster','devastate','damage','emergency','destroy','disaster assist team',
              'support harricane','disasterassistteam','severe','unsafe',
              'need shelter', 'help us', 'help me', 'flooding']

### Create disaster corpus  based on exsiting tweets

Use help_words as above to roughly find all related words in the existing tweets. This corpus is used to label our training data(existing tweets).

In [19]:
# Import Word2Vec
import numpy as np
import time
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

# Import Word2Vec
from gensim.models.word2vec import Word2Vec

# If you want to use gensim's data, import their downloader
# and load it.
import gensim.downloader as api
corpus = tweets_word_count_list

# If you have your own iterable corpus of cleaned data:

# Train a model! 
model = Word2Vec(corpus,      # Corpus of data.
                 size=100,    # How many dimensions do you want in your word vector?
                 window=5,    # How many "context words" do you want?
                 min_count=1, # Ignores words below this threshold.
                 sg=0,        # SG = 1 uses SkipGram, SG = 0 uses CBOW (default).
                 workers=4)   # Number of "worker threads" to use (parallelizes process).

In [20]:
similar_words = []
for words in help_words:
    try:
        similar_words.append(list(zip(*model.most_similar(f"{words}")))[0])
    except KeyError:
        pass

  after removing the cwd from sys.path.


In [21]:
# each element in similar_words is a tuple
# change each element as a word
similar_words = [i for sub in similar_words for i in sub]

In [22]:
# got 138 similar words
len(set(similar_words))

145

In [23]:
# here is our final tweets_disaster_corpus
# hand picked help_words + similar_words extracted from training set
tweets_disaster_corpus = help_words + similar_words

In [24]:
# tweets_disaster_corpus includes 169 related words
tweets_disaster_corpus = list(set(tweets_disaster_corpus))
len(tweets_disaster_corpus)

174

## Create disaster corpus based on Wikipedia

This corpus is an extension of the previous existing disaster corpus. This new disaster corpus is used to predict whether a new tweet is related to diasater. 

In [25]:
# Import Word2Vec
from gensim.models.word2vec import Word2Vec

# If you want to use gensim's data, import their downloader
# and load it.
import gensim.downloader as api
corpus = api.load('text8')

# If you have your own iterable corpus of cleaned data:

# Train a model! 
model = Word2Vec(corpus,      # Corpus of data.
                 size=100,    # How many dimensions do you want in your word vector?
                 window=5,    # How many "context words" do you want?
                 min_count=1, # Ignores words below this threshold.
                 sg=0,        # SG = 1 uses SkipGram, SG = 0 uses CBOW (default).
                 workers=4)   # Number of "worker threads" to use (parallelizes process).


In [26]:
wiki_similar_words = []
for words in tweets_disaster_corpus:
    try:
        wiki_similar_words.append(list(zip(*model.most_similar(f"{words}")))[0])
    except KeyError:
        pass

  after removing the cwd from sys.path.


In [27]:
# change each element as a word
wiki_similar_words = [i for sub in wiki_similar_words for i in sub]

In [28]:
# wiki_similar_words includes 839 related words
wiki_similar_words = list(set(wiki_similar_words))
len(wiki_similar_words)

859

In [29]:
# hand picked related words in wiki_similar_words
wiki_similar_words = [
 'disaster',
 'relief',
 'accidents',
 'deficit',
 'risks',
 'devour',
 'inflict',
 'rehabilitation',
 'cessation',
 'diagnose',
 'migrate',
 'agencies',
 'needs',
 'christ',
 'subdue',
 'disasters',
 'concerns',
 'defend',
 'uncertainty',
 'resettle',
 'clinical',
 'necessity',
 'confront',
 'inflation',
 'prolonged',
 'tunnel',
 'organize',
 'liberate',
 'assemble',
 'assistance',
 'needed',
 'flee',
 'fear',
 'crises',
 'pull',
 'accident',
 'navigate',
 'hide',
 'marshes',
 'secured',
 'storms',
 'danger',
 'responses',
 'hurricane',
 'leave',
 'sinking',
 'try',
 'destroy',
 'aid',
 'situation',
 'abandoned',
 'explosion',
 'rejoin',
 'canal',
 'borrowing',
 'breath',
 'alluvium',
 'crisis',
 'abandonment',
 'flooding',
 'vulnerability',
 'surgical',
 'hurricanes',
 'assist',
 'believer',
 'seek',
 'hot',
 'tornadoes',
 'strive',
 'communicate',
 'raise',
 'climb',
 'regained',
 'preventing',
 'safety',
 'god',
 'flood',
 'survivors',
 'weigh',
 'loss',
 'explodes',
 'lava',
 'droughts',
 'send',
 'tsunami',
 'escape',
 'shelters',
 'defeated',
 'rains',
 'precipitation',
 'withdrawal',
 'darkest',
 'killings',
 'injure',
 'dust',
 'shortage',
 'snowfall',
 'winds',
 'foodstocks',
 'sediment',
    'sediments',
 'prayers',
 'debris',
 'procure',
 'smoke',
 'respond',
 'floods',
 'injuries',
 'lose',
 'distribute',
 'devastating',
 'allocate',
 'helping',
 'droplets',
 'confine',
 'medicine',
 'downfall',
 'forests',
 'recovery',
 'fires',
 'explosions',
 'earthquakes',
 'alienate',
 'lightning',
 'silt',
 'needing',
 'rainforest',
 'katrina',
 'hypothermia',
 'responding',
 'toxin',
 'erosion',
 'swamps',
 'eruption',
 'relocate',
 'starvation',
 'losing',
 'abandon',
 'collect',
 'freshwater',
 'kill',
 'detention',
 'fund',
 'fatal',
 'hazards',
 'damaging',
 'queue',
 'regain',
 'health',
 'rain',
 'teams',
 'casualties',
 'protection',
 'reservoir',
 'ports',
 'supplies',
 'miracle',
 'accommodate',
 'freezing',
 'salvation',
 'deterioration',
 'locate',
 'catastrophic',
 'physicians',
 'injury',
 'harbour',
 'emergency',
 'humanitarian',
 'breach',
 'painful',
 'soils',
 'storm',
 'serious']

In [30]:
# combine two corpus
tweets_and_wiki_corpus = wiki_similar_words + tweets_disaster_corpus

In [31]:
# remove duplicate words
# get 331 words that really related to disaster
tweets_and_wiki_corpus = list(set(tweets_and_wiki_corpus))
len(tweets_and_wiki_corpus)

337

In [40]:
# hand picked final disaster words for labeling
final_disaster_words = ['needing',
 'danger',
 'disaster',
 'hypothermia',
 'injure',
 'victim',
 'sinking',
 'damage',
 'support harricane',
 'accidents',
 'destroy',
 'aid',
 'toxin',
 'erosion',
 'redcross',
 'abandoned',
 'evacuate',
 'disaster assist team',
 'explosion',
 'victims',
 'crisis',
 'abandonment',
 'need shelter',
 'help me',
 'starvation',
 'flooding',
 'losing',
 'disasterassistteam',
 'needhelp',
 'foodstocks',
 'assist',
 'abandon',
 'sediment',
 'destroy home',
 'donate',
 'sediments',
 'needs',
 'disasters',
 'help us',
 'property',
 'fatal',
 'debris',
 'medical',
 'hazards',
 'damaging',
 'smoke',
 'health',
 'pleasehelp',
 'flood',
 'casualties',
 'respond',
 'survivors',
 'donations',
 'floods',
 'loss',
 'injuries',
 'explodes',
 'need support',
 'assistance',
 'supplies',
 'needed',
 'lose',
 'lose home',
 'devastate',
 'fear',
 'crises',
 'need help',
 'droughts',
 'send help',
 'unsafe',
 'death',
 'freezing',
 'severe',
 'devastating',
 'catastrophic',
 'please help',
 'accident',
 'helping',
 'shelter',
 'escape',
 'shelters',
 'navigate',
 'tragedy',
 'injury',
 'medicine',
 'emergency',
 'recovery',
 'fires',
 'explosions',
 'breach',
 'help',
 'flame',
 'painful',
 'lightning',
 'withdrawal',
 'storm',
 'storms',
 'lost',
 'help needed',
 'serious']


In [41]:
len(set(final_disaster_words))

100

In [42]:
%store final_disaster_words

Stored 'final_disaster_words' (list)
