In [141]:
from gensim.models import Word2Vec,KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.tokenize import word_tokenize
import string
import os
import json

from functions import load_data, query_dataframe, train_test_split

### Loading data

In [2]:
# Load the labelled article dataset through the project helper, print the
# number of rows, and preview the first few rows as the cell output.
df = load_data()
print(len(df))
df.head()

457


Unnamed: 0,doc_id,filename,is_flood,is_bangladesh,flood_related,flood_climatechange,newspaper,flood_type,text
0,,a2OdfvBIbDPO1aDIwd9K0d6uSECa-1988_c592d7972eb0...,True,True,True,False,ny_times,monsoon,Misery Rises With Rivers In Bangladesh\n1988-0...
1,26718c21-62c0-422c-ac53-36766942fb7b,aoWObyIMi8CwBOMVuRclPk73DHvK-dhakaTribune_data...,False,True,False,False,dhaka_tribune,,Date Published:2020-04-04 00:00:00 \nRMCH...
2,d8f462ed-aec4-455e-8ba8-71c6bd4f292d,atY5uczPE6zPTNIFVBiEvbfv4sju-dhakaTribune_data...,False,True,False,False,dhaka_tribune,,Date Published:2019-06-03 00:00:00 \nThe ...
3,deacde3e-4e9e-404c-aa28-627c9c8e9ed4,ak5TnURTgl6eKymISBt_60JAQebG-dhakaTribune_data...,False,True,False,False,dhaka_tribune,,Date Published:2020-02-01 00:00:00 \nHe i...
4,9188bf4f-a3a9-464f-aec1-b3adb6313c0e,aInceN38qOVMeSAdJ4xXJPDJruf4-dhakaTribune_data...,False,True,False,False,dhaka_tribune,,Date Published:2020-04-10 00:00:00 \nNusr...


### Checking the distribution of flood (`is_flood`) vs. non-flood articles

In [3]:
# Split the articles by their is_flood label and report the size of each subset.
# NOTE(review): the printed counts (38 + 417 = 455) do not add up to the 457
# rows loaded above -- presumably some rows have a missing is_flood label; confirm.
df_is_flood = query_dataframe(df, {'is_flood':True})
df_is_not_flood = query_dataframe(df, {'is_flood':False})
print('Is flood:',len(df_is_flood),'\nIs Not Flood:',len(df_is_not_flood))

Is flood: 38 
Is Not Flood: 417


#### Is Flood to CSV

In [4]:
# Export only the text column of the flood-labelled articles for manual inspection.
df_is_flood['text'].to_csv('isFloodTrue.csv')

### Checking how many times "flood" occurs in is_flood texts vs. non-is_flood texts

Add flood count to df

In [5]:
# Work on a copy so the original frame is left without the extra column.
df_count = df.copy()
# Case-insensitive substring count: any token containing "flood"
# (e.g. "flooding", "Floods") contributes to the total.
flood_count = [t.lower().count('flood') for t in df_count['text']]
df_count['flood_count'] = flood_count

In [28]:
# Collect the per-article "flood" counts for each label and show the 15
# largest counts of each group side by side.
is_flood_occurance = list(query_dataframe(df_count, {'is_flood':True})['flood_count'])
is_not_flood_occurance = list(query_dataframe(df_count, {'is_flood':False})['flood_count'])
print('"flood" occurs in is Flood:',sorted(is_flood_occurance, reverse=True)[:15])
print('"flood" occurs in is Not Flood:',sorted(is_not_flood_occurance, reverse=True)[:15])

"flood" occurs in is Flood: [18, 13, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 6]
"flood" occurs in is Not Flood: [8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 5, 5, 5, 5, 4]


In [142]:
# Export the non-flood article(s) that mention "flood" 8 times -- 8 is the
# highest count printed for the non-flood group above -- for manual review.
df_not_flood_count_check = query_dataframe(df_count, {'is_flood':False, 'flood_count':8})
df_not_flood_count_check['text'].to_csv('notFlood_flood_count_8.csv')

In [31]:
# Export flood-labelled articles that never contain the substring "flood".
# Unlike the other exports in this notebook, this one writes every column,
# not just 'text'.
df_flood_count_check = query_dataframe(df_count, {'is_flood':True, 'flood_count':0})
df_flood_count_check.to_csv('isFlood_flood_count_0.csv')

### Checking word2vec, relation between flood and words in article

Convert the GloVe vectors to word2vec text format (one-time step; only needs to be run once)

In [9]:
# One-time conversion, kept commented out so that re-running the notebook does
# not redo it. Uncomment to regenerate the word2vec-format file read by the
# "Load model" cell below.
# glove_model_path = 'word2vec_model/glove_word2vec_model.txt'
# word2vec_model_path = 'word2vec_model/word2vec_model.model'
# glove2word2vec(glove_model_path, word2vec_model_path)

Load model

In [10]:
# Load the converted vectors. binary=False because the file is read in the
# plain-text word2vec format despite its ".model" extension.
word2vec_model_path = 'word2vec_model/word2vec_model.model'
model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=False)

In [26]:
# Sanity check: two inflections of the same word should score as highly similar.
model.similarity('flooded', 'flooding')

0.7978086

Calculate the similarity between 'flood' and every other word in each article (the word 'flood' itself is skipped because its similarity is trivially 1)

In [83]:
def calculate_similarity(df, debug=False, single=False, word_has_flood=True):
    """Score every word of every article against the word 'flood'.

    Each row's ``'text'`` is tokenised with ``word_tokenize``; every token is
    stripped of punctuation, trimmed and lower-cased, and then compared with
    'flood' using the module-level ``model``.

    Args:
        df: Frame whose rows expose a ``'text'`` entry (iterated via
            ``df.iterrows()``).
        debug: When true, print each cleaned token, or the exception raised
            while processing it.
        single: When true, process only the first row. The result then holds
            that single row's scores.
        word_has_flood: When true, words containing the substring 'flood'
            (e.g. 'flooding') are scored too; when false they are left out.
            The exact word 'flood' is always left out.

    Returns:
        A list with one entry per processed row. Each entry is a list of
        ``(similarity, word)`` tuples sorted by similarity, highest first.
        Repeated words are kept, one tuple per occurrence.
    """
    # Build the punctuation-stripping table once instead of once per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    sim = []
    for _, row in df.iterrows():
        row_sim = []
        for t in word_tokenize(row['text']):
            try:
                txt = t.translate(strip_punctuation).strip().lower()
                if txt and txt != 'flood' and (word_has_flood or 'flood' not in txt):
                    row_sim.append((model.similarity('flood', txt), txt))
                if debug: print(txt)
            except Exception as e:
                # Best effort: a token the model cannot score (presumably an
                # out-of-vocabulary word) is skipped instead of aborting the row.
                if debug: print(e)
                continue
        sim.append(sorted(row_sim, reverse=True, key=lambda x: x[0]))
        # Stop only after the first row has been recorded; breaking before the
        # append (as the previous version did) always returned an empty list.
        if single: break
    return sim

In [72]:
# Per-article (similarity, word) lists for each label, including words that
# contain "flood" (word_has_flood defaults to True).
is_flood_similarities = calculate_similarity(df_is_flood)
is_not_flood_similarities = calculate_similarity(df_is_not_flood)

Check the top similarities for both isFlood and not isFlood

In [74]:
# Keep the five highest-scoring words of every article, then display the
# first two articles of each group separated by a divider string.
check_top5_isflood_sim = [f[:5] for f in is_flood_similarities]
check_top5_not_isflood_sim = [f[:5] for f in is_not_flood_similarities]
check_top5_isflood_sim[:2], '---------------------------', check_top5_not_isflood_sim[:2]

([[(0.8515943, 'flooding'),
   (0.8515943, 'flooding'),
   (0.84088975, 'floods'),
   (0.84088975, 'floods'),
   (0.75193965, 'flooded')],
  [(0.8515943, 'flooding'),
   (0.8515943, 'flooding'),
   (0.51762384, 'rivers'),
   (0.50659436, 'water'),
   (0.50659436, 'water')]],
 '---------------------------',
 [[(0.3505731, 'deadly'),
   (0.34283066, 'authorities'),
   (0.33594906, 'people'),
   (0.33594906, 'people'),
   (0.33594906, 'people')],
  [(0.33594906, 'people'),
   (0.33146256, 'over'),
   (0.3306599, 'the'),
   (0.3306599, 'the'),
   (0.3306599, 'the')]])

Check which words have a similarity above 0.8, and count how many articles contain at least one such word

In [75]:
def calculate_similarity_above(l, threshold=0.8, debug=False, single=False):
    """Keep, for each article, only the words scoring strictly above ``threshold``.

    Args:
        l: List of per-article lists of ``(similarity, word)`` tuples.
        threshold: Exclusive lower bound on the similarity score.
        debug: Unused; kept so existing callers keep working.
        single: When true, process only the first article. The result then
            holds that single article's filtered list.

    Returns:
        A list with one entry per processed article, each entry being the
        sub-list of tuples whose similarity is greater than ``threshold``
        (possibly empty). Input order is preserved.
    """
    sim = []
    for f in l:
        sim.append([ff for ff in f if ff[0] > threshold])
        # Stop only after the first article has been recorded; breaking before
        # the append (as the previous version did) always returned [].
        if single: break
    return sim

In [82]:
# Words scoring above 0.8 against 'flood', per article and per label.
threshold = 0.8
isflood_simcheck = calculate_similarity_above(is_flood_similarities, threshold)
not_isflood_simcheck = calculate_similarity_above(is_not_flood_similarities, threshold)

# Articles with at least one qualifying word.
isflood_any = sum(1 for hits in isflood_simcheck if hits)
not_isflood_any = sum(1 for hits in not_isflood_simcheck if hits)
# Articles with two or more qualifying words.
isflood_multi = sum(1 for hits in isflood_simcheck if len(hits) > 1)
not_isflood_multi = sum(1 for hits in not_isflood_simcheck if len(hits) > 1)

print('Threshold satisfied by atleast 1 word:', end='\t')
print('Is Flood:', isflood_any, '\tIs Not Flood:', not_isflood_any)
print('Occurs more than once:', end='\t\t\t')
print('Is Flood:', isflood_multi, '\tIs Not Flood:', not_isflood_multi)

# [f for f in isflood_simcheck if f],'---------------------------', [f for f in not_isflood_simcheck if f]
# isflood_simcheck

Threshold satisfied by atleast 1 word:	Is Flood: 25 	Is Not Flood: 50
Occurs more than once:			Is Flood: 18 	Is Not Flood: 17


Repeat the check, this time considering only words that do not contain 'flood'

In [84]:
# Same scoring as before, but words containing the substring "flood" are
# excluded so that only related vocabulary (e.g. "rain", "rivers") remains.
is_flood_similarities_noflood = calculate_similarity(df_is_flood, word_has_flood=False)
is_not_flood_similarities_noflood = calculate_similarity(df_is_not_flood, word_has_flood=False)

In [85]:
# Five highest-scoring non-"flood" words per article; display the first two
# articles of each group separated by a divider string.
check_top5_isflood_sim_noflood = [f[:5] for f in is_flood_similarities_noflood]
check_top5_not_isflood_sim_noflood = [f[:5] for f in is_not_flood_similarities_noflood]
check_top5_isflood_sim_noflood[:2], '---------------------------', check_top5_not_isflood_sim_noflood[:2]

([[(0.57215005, 'disaster'),
   (0.5205556, 'rain'),
   (0.5205556, 'rain'),
   (0.51762384, 'rivers'),
   (0.51762384, 'rivers')],
  [(0.51762384, 'rivers'),
   (0.50659436, 'water'),
   (0.50659436, 'water'),
   (0.50659436, 'water'),
   (0.50659436, 'water')]],
 '---------------------------',
 [[(0.3505731, 'deadly'),
   (0.34283066, 'authorities'),
   (0.33594906, 'people'),
   (0.33594906, 'people'),
   (0.33594906, 'people')],
  [(0.33594906, 'people'),
   (0.33146256, 'over'),
   (0.3306599, 'the'),
   (0.3306599, 'the'),
   (0.3306599, 'the')]])

In [100]:
# A lower threshold (0.6) is used for the non-"flood" vocabulary than the 0.8
# applied when words containing "flood" were included. Note that this rebinds
# the global `threshold`.
threshold = 0.6
isflood_simcheck_noflood = calculate_similarity_above(is_flood_similarities_noflood, threshold)
not_isflood_simcheck_noflood = calculate_similarity_above(is_not_flood_similarities_noflood, threshold)
# Count the articles of each label that have at least one word above the threshold.
print('Threshold satisfied by atleast 1 word:', end='\t')
print('Is Flood:',len([f for f in isflood_simcheck_noflood if f]),
      '\tIs Not Flood:',len([f for f in not_isflood_simcheck_noflood if f]))
# print('Occurs more than once:', end='\t\t\t')
# print('Is Flood:', len([f for f in isflood_simcheck_noflood if len(f)>1]),
#       '\tIs Not Flood:',len([f for f in not_isflood_simcheck_noflood if len(f)>1]))
# not_isflood_simcheck_noflood

Threshold satisfied by atleast 1 word:	Is Flood: 23 	Is Not Flood: 31


Combined: articles that satisfy either of the two threshold checks

In [92]:
# An article counts when it has a hit in either check: a non-"flood" word above
# its threshold, or (same position i) any word above the first threshold.
# The two lists are paired by index, so they must come from the same frame.
# NOTE(review): the execution counts show this cell (In [92]) ran before the
# latest run of the threshold cell above (In [100]) -- re-run in order to
# confirm the printed numbers match the current thresholds.
print('COMBINED: Threshold satisfied by atleast 1 word:', end='\t')
print('Is Flood:',len([f for i,f in enumerate(isflood_simcheck_noflood) if f or isflood_simcheck[i]]),
      '\tIs Not Flood:',len([f for i,f in enumerate(not_isflood_simcheck_noflood) if f or not_isflood_simcheck[i]]))
# print('Occurs more than once:', end='\t\t\t')
# print('Is Flood:', len([f for f in isflood_simcheck_noflood if len(f)>1]),
#       '\tIs Not Flood:',len([f for f in not_isflood_simcheck_noflood if len(f)>1]))
# not_isflood_simcheck_noflood

COMBINED: Threshold satisfied by atleast 1 word:	Is Flood: 31 	Is Not Flood: 66


Combined method: apply both threshold checks to every article in `../all_articles/`

In [116]:
def single_sim_text(text, debug=False, word_has_flood=True):
    """Score each word of one article text against the word 'flood'.

    Tokens are stripped of punctuation, trimmed and lower-cased before being
    compared with 'flood' through the module-level ``model``. The exact word
    'flood' is never scored; words merely containing 'flood' are scored only
    when ``word_has_flood`` is true. Tokens that raise while being processed
    are skipped (and the exception printed when ``debug`` is true).

    Returns a list of ``(similarity, word)`` tuples, highest similarity first.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    scored = []
    for token in word_tokenize(text):
        try:
            word = token.translate(strip_punct).strip().lower()
            wanted = word and word != 'flood' and (word_has_flood or 'flood' not in word)
            if wanted:
                scored.append((model.similarity('flood', word), word))
            if debug:
                print(word)
        except Exception as err:
            if debug:
                print(err)
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored

def single_sim_above(l, threshold=0.8, debug=False):
    """Return the ``(similarity, word)`` tuples of ``l`` scoring strictly above ``threshold``.

    Input order is preserved; ``debug`` is accepted but not used.
    """
    return [pair for pair in l if pair[0] > threshold]

In [127]:
# Classify every article found in ../all_articles/ with the combined rule:
# an article is treated as flood-related when it contains a word scoring above
# threshold1 against 'flood', or a word without the substring "flood" scoring
# above threshold2.
# File names containing 'model' are skipped; the outputs written by the dump
# cell below carry 'model' in their names, so they are not re-read as input.
files = [f for f in os.listdir('../all_articles/') if '.json' in f and 'model' not in f]
threshold1 = 0.8
threshold2 = 0.6
arr = []
arr_not_flood = []
for f in files:
    filepath = '../all_articles/'+f
    # Context manager so each file handle is closed as soon as it is parsed
    # (previously the handles were left open for the garbage collector).
    with open(filepath) as fh:
        js = json.load(fh)
    for j in js:
        flood_words = single_sim_text(j['text'])
        flood_words_thresh = single_sim_above(flood_words, threshold1)
        # Same result as single_sim_text(j['text'], word_has_flood=False), but
        # derived from the scores already computed: drop the words containing
        # "flood" and keep the (stable) descending order. This avoids
        # tokenising and scoring every article a second time.
        no_flood_words = [pair for pair in flood_words if 'flood' not in pair[1]]
        no_flood_words_thresh = single_sim_above(no_flood_words, threshold2)
        if flood_words_thresh or no_flood_words_thresh:
            arr.append(j)
        else:
            arr_not_flood.append(j)

In [129]:
# Number of articles classified as flood-related vs. not flood-related.
len(arr), len(arr_not_flood)

(462, 1940)

In [133]:
# Persist both groups. The context managers guarantee each file is flushed and
# closed once written; a bare open(..., 'w') passed to json.dump leaves that
# to the garbage collector.
with open('../all_articles/isflood_true_word2vec_model1.json','w') as fh:
    json.dump(arr, fh, indent=2)
with open('../all_articles/isflood_false_word2vec_model1.json','w') as fh:
    json.dump(arr_not_flood, fh, indent=2)

### Train Test Split

In [9]:
# Split the labelled frame with the project helper, then report the overall
# sizes and how many flood-labelled articles landed in each part.
# NOTE(review): no seed is passed here -- whether the split is reproducible
# depends on how functions.train_test_split is implemented; confirm.
train, test = train_test_split(df)
print('Train:',len(train),'\t\tTest:',len(test))
print('Is flood Train:',len(query_dataframe(train,{'is_flood':True})),
      '\tIs Flood Test:',len(query_dataframe(test,{'is_flood':True})))
train.head()

Train: 365 		Test: 92
Is flood Train: 26 	Is Flood Test: 12


Unnamed: 0,doc_id,filename,is_flood,is_bangladesh,flood_related,flood_climatechange,newspaper,flood_type,text
17,70e67b2a-32e5-4888-a25d-9100ccb98018,aAfUVRlM3rES6aUFPOCroCDgaTLu-dhakaTribune_data...,False,True,False,False,dhaka_tribune,,Date Published:2020-01-12 00:00:00 \nDevo...
66,afdcc8b1-a036-45f2-bb35-938a02de7586,aLhMAGKiVPa2AC4yL6j2nH44IaT4-dhakaTribune_data...,False,True,False,False,dhaka_tribune,,Date Published:2020-01-14 00:00:00 \nTea ...
410,6454f41c-5c82-4085-aba8-b3cfec6c610c,alk1925T4ACEN1XcZI5oGpv989Hi-dhakaTribune_data...,False,False,False,False,dhaka_tribune,,Date Published:2020-03-22 00:00:00 \nChil...
320,,adKDGsYIi33Hbbfb0Q8_hM2iZFLS-1988_8f66e40f412f...,False,False,False,False,ny_times,,ANSWERS TO QUIZ\n1988-09-10T05:00:00.000Z\nQue...
31,15061609-dbed-4598-ad1f-7fcc7c78c1bb,a3VHG544C2mtoqB75_yAFqPSVVUO-dhakaTribune_data...,False,True,False,False,dhaka_tribune,,Date Published:2020-03-28 00:00:00 \nThe ...
