# Libraries, data

In [1]:
import pandas as pd
import spacy
import re
from spacy import displacy
from spacy.matcher import Matcher

In [2]:
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [3]:
# Load the 'en_core_web_lg' English spaCy pipeline; `nlp` is used below to parse text.
nlp = spacy.load('en_core_web_lg')

In [4]:
# Read the publication records and keep only the title and abstract columns.
WATER_COLUMNS = ["title", "abstract"]
dfWater = pd.read_json("dfwaterpub.json")[WATER_COLUMNS]

# Data preparation

In [5]:
# Drop publications without an abstract; everything below works on the abstract text.
dfWater=dfWater.dropna(subset=["abstract"])

In [6]:
# Parse every abstract into a spaCy Doc and cache it in the "doc" column.
# nlp.pipe processes the texts as a batched stream, which is faster than calling
# nlp once per row through Series.apply. A comprehension (rather than list(...))
# is used so the cell still works if the built-in `list` has been shadowed.
dfWater["doc"] = [parsed for parsed in nlp.pipe(dfWater["abstract"])]

In [7]:
# Short alias for the publications frame, plus one parsed abstract (row 2)
# kept as an example document.
df = dfWater
a = df["abstract"].iloc[2]
exemplar = nlp(a)

In [8]:
# Collect the sentence spans of the parsed abstracts into one flat list.
# - The cap of 2880 abstracts is kept, but applied by slicing: indexing with
#   range(2880) raised IndexError whenever fewer rows were available.
# - The already-parsed "doc" column is reused instead of running nlp over
#   every abstract a second time.
MAX_ABSTRACTS = 2880
allsentences = []
for parsed in df["doc"].iloc[:MAX_ABSTRACTS]:
    allsentences.extend(parsed.sents)

In [9]:
# Number of sentence spans collected from the abstracts.
len(allsentences)

23176

# Causal hypothesis

Water security and water insecurity are not used as concepts with opposite meanings: water security is associated with more technical terms, whereas water insecurity is associated with social terms.

## training data

- prepare matching function for sentences, output in format JSONL

# Method

Spacy and the noun_chunks function are used. 
"You can think of noun chunks as a noun plus the words describing the noun." https://spacy.io/usage/linguistic-features
Which words describe water security, and which words describe water insecurity?

In [10]:
# Score every distinct noun chunk by its vector similarity to "water security".
# Result: dict[chunk_text] = [similarity_to_water_security]
# NOTE: the mapping is named `dict` (shadowing the built-in) because the cells
# below read it under that name.
dict = {}
noun_list = []
key_term = "water security"
key_doc = nlp(key_term)  # parse the key term once, not once per noun chunk
for sent in allsentences:
    # Iterate the chunks directly; no list(...) call, so the cell also survives
    # a re-run after the built-in `list` has been shadowed further down.
    for noun in sent.noun_chunks:
        text = str(noun)
        if text in dict:
            # Already scored. The mapping lookup is O(1); scanning noun_list
            # made the whole loop quadratic.
            continue
        noun_list.append(text)
        dict[text] = [noun.similarity(key_doc)]

In [11]:
# Inspect the noun-chunk -> [similarity] mapping built above.
dict

{' The Ru River Basin': [0.5038631443725973],
 'severe water security challenges': [0.8465667486963725],
 'frequent occurrence': [0.33384458320124677],
 'extreme events': [0.3950543027853779],
 'serious nutrient enrichment': [0.46511021062950003],
 'its water bodies': [0.7096385528205574],
 'daily streamflow': [0.29063078583347685],
 'monthly total phosphorous': [0.37306002061533355],
 'TP) loads': [0.2736654258575061],
 'the basin': [0.5280127385963699],
 'the SWAT (Soil and Water Assessment Tool) model': [0.6941674918731985],
 'hourly rainfall inputs': [0.390329595994724],
 'the model': [0.4069670926890995],
 'daily rainfall inputs': [0.45086955795168754],
 'climate change impact assessment': [0.558991232307789],
 'the hourly weather generator': [0.5646808439927287],
 'AWE-GEN': [0.1907801608817119],
 'historical hourly rainfall records': [0.4266495600194066],
 'Evaluation': [0.3416875959741756],
 'its performance': [0.44711589857636186],
 'the AWE-GEN': [0.331494557699452],
 'the ma

In [None]:
# Extend every entry of `dict` with the similarity to "water insecurity" and
# the difference between the two similarities.
# Result: dict[chunk_text] = [sim_security, sim_insecurity, sim_security - sim_insecurity]
key_term = "water insecurity"
key_doc = nlp(key_term)  # parse the key term once, not once per noun chunk
noun_list = []
seen = set()  # O(1) duplicate check; noun_list is kept only as the ordered record
for sent in allsentences:
    for noun in sent.noun_chunks:
        text = str(noun)
        if text in seen:
            continue
        seen.add(text)
        noun_list.append(text)
        security_sim = dict[text][0]
        sim = noun.similarity(key_doc)
        # Rebuild the entry instead of appending, so re-running this cell does
        # not keep growing the per-term lists.
        dict[text] = [security_sim, sim, security_sim - sim]

# The 100 words which describe water security and water insecurity the most

The lists below contain the words with the highest similarity to each key term. Words that themselves include "security"/"insecurity" are filtered out: obviously "security" describes "water security" well, but that is not interesting for the research study.

In [None]:
# Build the 100 noun chunks most similar to each key term, skipping chunks that
# themselves contain "secur" / "insecur".
# NOTE: the row table is named `list` (shadowing the built-in) because the
# cells below read it under that name.
TOP_N = 100

# Rows are [term, similarity_to_water_security, similarity_to_water_insecurity].
list = [[key, scores[0], scores[1]] for key, scores in dict.items()]

# Filtering first and slicing afterwards cannot run past the end of the table;
# the earlier index-driven while loop raised IndexError when fewer than TOP_N
# candidates survived the filter.
list.sort(key=lambda row: row[1], reverse=True)
security_100 = [
    [str(row[0]), row[1]]
    for row in list
    if not re.search(r"-?[Ss]ecur", row[0])
][:TOP_N]

list.sort(key=lambda row: row[2], reverse=True)
insecurity_100 = [
    [str(row[0]), row[2]]
    for row in list
    if not re.search(r"-?[Ii]nsecur", str(row[0]))
][:TOP_N]

In [None]:
# Top terms for "water security" as [term, similarity] pairs.
security_100

In [None]:
# Top terms for "water insecurity" as [term, similarity] pairs.
insecurity_100

### Result

#### water security:
water safety,
water systems,
water (resources) protection,
water policy,
water infrastructure,
water access,
water management,

#### water insecurity:
**water poverty**,
water scarcity,
water vulnerability,
water crisis,
water conflicts,
water shortage,
water mismanagement,
**water inequality**



### The method is applied again, this time also filtering out the descriptive words found above.

In [None]:
# Second pass over the [term, sim_security, sim_insecurity] rows in `list`:
# rebuild both top-100 lists while also excluding the descriptive stems found
# in the first pass.
TOP_N = 100

# One alternation per key term. The policy stem is spelled "[Pp]olic" so that
# it actually matches "policy"/"policies" ("[Pp]policy" could never match).
security_exclude = re.compile(
    r"[Ss]ecur|[Ss]afe|[Pp]rotect|[Pp]olic|[Ii]nfrastr|[Aa]cce|[Mm]anage"
)
insecurity_exclude = re.compile(
    r"[Ii]nsecur|[Pp]overt|[Ss]carc|[Vv]ulnerab|[Cc]ris[ei]s|[Cc]onflict"
    r"|[Ss]hortage|[Mm]ismanagem|[Ii]nequal"
)

# Filter first, then slice: this cannot index past the end of the table when
# fewer than TOP_N candidates remain.
list.sort(key=lambda row: row[1], reverse=True)
new_security_100 = [
    # Store the similarity score next to the term (not the term twice).
    [str(row[0]), row[1]]
    for row in list
    if not security_exclude.search(row[0])
][:TOP_N]

list.sort(key=lambda row: row[2], reverse=True)
new_insecurity_100 = [
    [str(row[0]), row[2]]
    for row in list
    if not insecurity_exclude.search(row[0])
][:TOP_N]

In [None]:
# Only the last bare expression of a cell is rendered, so show both lists
# explicitly with display().
display(new_security_100, new_insecurity_100)

### The method does not yield useful results. A different method is applied: the similarity of a term to "water security" is compared with its similarity to "water insecurity". The output is two lists showing which words describe one of our key terms considerably well but not the other.

In [None]:
# Split the terms by which key phrase they are clearly closer to, using the
# stored difference dict[key][2] (security similarity minus insecurity similarity).
interval_start = 1.2
interval_end = 0.2

moresimilartosecurity = []
moresimilartoinsecurity = []
x = 0  # skipped terms containing "secur" (but not "insecur")
y = 0  # skipped terms containing "insecur"

for key in dict:
    term = str(key)
    if re.search(r"-?[Ii]nsecur", term):
        y += 1
        continue
    if re.search(r"-?[Ss]ecur", term):
        x += 1
        continue

    a = dict[key][2]
    if interval_end < a < interval_start:
        # Clearly closer to "water security": keep the security similarity.
        moresimilartosecurity.append([key, a, dict[key][0]])
    elif -interval_start < a < -interval_end:
        # Clearly closer to "water insecurity": keep the insecurity similarity.
        moresimilartoinsecurity.append([key, a, dict[key][1]])

In [None]:
# Largest positive difference first, one [term, difference, similarity] row per line.
moresimilartosecurity.sort(key=lambda row: row[1], reverse=True)
for row in moresimilartosecurity:
    print(row)

In [None]:
# Most negative difference first, one [term, difference, similarity] row per line.
moresimilartoinsecurity.sort(key=lambda row: row[1])
for row in moresimilartoinsecurity:
    print(row)