# Libraries, data

In [1]:
import pandas as pd
import spacy
import re
from spacy import displacy
from spacy.matcher import Matcher

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load spaCy's large English pipeline. Every abstract below is parsed with this
# `nlp` object, and the noun-chunk similarity scores are computed from its output.
nlp = spacy.load('en_core_web_lg')

In [4]:
# Read the publication export and keep only the two text columns used below.
text_columns = ["title", "abstract"]
dfWater = pd.read_json("dfwaterpub.json")[text_columns]

# Data preparation

In [5]:
# Discard publications whose abstract is missing; only abstracts are analysed.
dfWater=dfWater.dropna(subset=["abstract"])

In [6]:
# Run the spaCy pipeline on every abstract and keep the parsed result in a new
# "doc" column, one parsed document per row.
dfWater["doc"]=dfWater["abstract"].apply(nlp)

In [7]:
# Shorter alias for the prepared frame, plus the third abstract parsed as a
# sample document for manual inspection.
df = dfWater
a = df["abstract"].iloc[2]
exemplar = nlp(a)

In [8]:
# Collect every sentence span of every abstract into one flat list.
# The parsed documents are already stored in df["doc"], so they are reused
# rather than running the pipeline on each abstract a second time, and the loop
# covers however many rows the frame has instead of a hard-coded 2880.
allsentences = [sent for doc in df["doc"] for sent in doc.sents]

In [9]:
len(allsentences)

23176

# Causal hypothesis

Water security and water insecurity are not used as exact opposites: water security tends to be associated with technical terms, whereas water insecurity tends to be associated with social terms.

## training data

- prepare matching function for sentences, output in format JSONL

# Method

Spacy and the noun_chunks function are used. 
"You can think of noun chunks as a noun plus the words describing the noun." https://spacy.io/usage/linguistic-features
Which words describe water security, and which words describe water insecurity?

In [10]:
# Score every distinct noun chunk against the key term "water security".
# `dict` maps chunk text -> [similarity to the key term]; the scoring cell for
# "water insecurity" later appends further values to each list.
# NOTE: the name `dict` shadows the builtin but is kept because later cells read
# the scores under this name.
dict={}
noun_list=[]
key_term="water security"
# The key term never changes inside the loop, so parse it once up front.
key_doc=nlp(key_term)
for sent in allsentences:
    # Iterate the chunks directly: no intermediate list is needed, and the cell
    # stays re-runnable after a later cell rebinds the name `list`.
    for noun in sent.noun_chunks:
        text=noun.text
        # A dictionary lookup replaces the linear scan of noun_list; each
        # distinct chunk text is scored only once, on its first occurrence.
        if text in dict:
            continue
        noun_list.append(text)
        dict[text]=[noun.similarity(key_doc)]

In [11]:
dict

{' The Ru River Basin': [0.5038631443725973],
 'severe water security challenges': [0.8465667486963725],
 'frequent occurrence': [0.33384458320124677],
 'extreme events': [0.3950543027853779],
 'serious nutrient enrichment': [0.46511021062950003],
 'its water bodies': [0.7096385528205574],
 'daily streamflow': [0.29063078583347685],
 'monthly total phosphorous': [0.37306002061533355],
 'TP) loads': [0.2736654258575061],
 'the basin': [0.5280127385963699],
 'the SWAT (Soil and Water Assessment Tool) model': [0.6941674918731985],
 'hourly rainfall inputs': [0.390329595994724],
 'the model': [0.4069670926890995],
 'daily rainfall inputs': [0.45086955795168754],
 'climate change impact assessment': [0.558991232307789],
 'the hourly weather generator': [0.5646808439927287],
 'AWE-GEN': [0.1907801608817119],
 'historical hourly rainfall records': [0.4266495600194066],
 'Evaluation': [0.3416875959741756],
 'its performance': [0.44711589857636186],
 'the AWE-GEN': [0.331494557699452],
 'the ma

In [12]:
# Extend each entry of `dict` to
#   [similarity to "water security", similarity to "water insecurity", difference]
# where difference = security similarity - insecurity similarity.
key_term="water insecurity"
noun_list=[]
# The key term never changes inside the loop, so parse it once up front.
key_doc=nlp(key_term)
seen_chunks=set()
for sent in allsentences:
    # Iterate the chunks directly so the cell does not depend on the builtin
    # `list`, which a later cell rebinds.
    for noun in sent.noun_chunks:
        text=noun.text
        # Set membership replaces the linear scan of noun_list; each distinct
        # chunk text is scored only once.
        if text in seen_chunks:
            continue
        seen_chunks.add(text)
        noun_list.append(text)
        sim=noun.similarity(key_doc)
        difference=(dict[text][0] - sim)
        # Slice assignment overwrites anything after the first score, so
        # re-running this cell does not keep growing the per-chunk lists.
        dict[text][1:]=[sim, difference]

# The 100 words which describe water security and water insecurity the most

The noun chunks with the highest similarity to each key term are listed. Chunks that themselves contain "security"/"insecurity" are filtered out: obviously "security" describes "water security" well, but that is not interesting for the research study.

In [13]:
def _top_terms(rows, score_index, exclude_pattern, limit=100):
    """Return up to `limit` [term, score] pairs with the highest score.

    rows            -- list of [term, security_similarity, insecurity_similarity];
                       sorted in place by the chosen score, descending.
    score_index     -- 1 to rank by security similarity, 2 by insecurity similarity.
    exclude_pattern -- regex; terms in which it is found are skipped.
    limit           -- maximum number of pairs returned. Fewer are returned when
                       the table runs out of eligible terms (the previous
                       open-ended while loop raised IndexError in that case).
    """
    rows.sort(key=lambda row: row[score_index], reverse=True)
    selected=[]
    for row in rows:
        if len(selected) >= limit:
            break
        if re.search(exclude_pattern, str(row[0])):
            continue
        selected.append([str(row[0]), row[score_index]])
    return selected


# One row per distinct noun chunk: [term, security similarity, insecurity similarity].
# NOTE: the name `list` shadows the builtin but is kept because a later cell
# re-sorts this table under the same name.
list=[[key, scores[0], scores[1]] for key, scores in dict.items()]

# Terms containing "secur" (which also covers "insecur...") are left out of the
# security ranking; terms containing "insecur" are left out of the insecurity ranking.
security_100=_top_terms(list, 1, r"-?[Ss]ecur")
insecurity_100=_top_terms(list, 2, r"-?[Ii]nsecur")

In [14]:
security_100

[['water and water infrastructure', 0.8520420936133888],
 ['water protection', 0.8491171118719402],
 ['the water safety systems', 0.8455711083412298],
 ['Jiansanjiang&#x27;s water safety systems', 0.8443769853816167],
 ['water safety systems', 0.8443769818040214],
 ['water system threats', 0.8438543771179393],
 ['water environment protection', 0.8369795253982395],
 ['public water systems', 0.8328880626289655],
 ['virtual water and water governance', 0.8300406741961703],
 ['water safety', 0.8285041751237188],
 ['Water safety', 0.8285041751237188],
 ['water and waste water systems', 0.8275493231392137],
 ['the safe drinking water supply systems', 0.8263439608495078],
 ['source water protection planning', 0.8257580805405391],
 ['Source water protection planning', 0.8257580805405391],
 ['water resources protection', 0.8257107643495196],
 ['the water monitoring', 0.8249257763892198],
 ['water safety management', 0.824519667341322],
 ['the public water systems', 0.82386012635217],
 ['water p

In [15]:
insecurity_100

[['water poverty', 0.8440107651349945],
 [' Water scarcity', 0.8385802886279289],
 ['water scarcity', 0.8385802698132254],
 ['Water scarcity', 0.8385802698132254],
 ['water scarcity problems', 0.8373465205224819],
 ['water scarcity situation', 0.8366854713883183],
 ['growing water scarcity', 0.8331085802163378],
 ['Growing water scarcity', 0.8331085802163378],
 ['water scarcity issues', 0.829340691575898],
 ['water scarcity problem', 0.8292108142021501],
 ['increasing water scarcity', 0.8267421518615004],
 ['widespread water scarcity', 0.8254969855244664],
 ['both water scarcity', 0.8203244676007587],
 ['The water scarcity', 0.8201787048848048],
 ['the water scarcity', 0.8201787048848048],
 ['worsening water scarcity', 0.8192020224834285],
 ['water vulnerability', 0.8191634504939048],
 ['domestic water inadequacy', 0.8182896881411039],
 ['water scarcity risks', 0.8172575626890339],
 ['severe water scarcity', 0.8170938157179009],
 ['Water conflict', 0.815043232715168],
 ['water conflict

### Result

#### water security:
water safety,
water systems,
water (resources) protection,
water policy,
water infrastructure,
water access,
water management,

#### water insecurity:
**water poverty**
water scarcity,
water vulnerability,
water crisis,
water conflicts,
water shortage,
water mismanagement,
**water inequality**



### The method is applied again. This time the descriptive words found above are filtered out as well.

In [16]:
# Word stems already identified as descriptive in the first pass; terms that
# contain any of them are skipped in this second ranking.
SECURITY_STEMS=(
    r"-?[Ss]ecur", r"-?[Ss]afe", r"-?[Pp]rotect",
    r"-?[Pp]olicy",  # was "[Pp]policy", which could never match "policy"
    r"-?[Ii]nfrastr", r"-?[Aa]cce", r"-?[Mm]anage",
)
INSECURITY_STEMS=(
    r"-?[Ii]nsecur", r"-?[Pp]overt", r"-?[Ss]carc", r"-?[Vv]ulnerab",
    r"-?[Cc]ris[ei]s", r"-?[Cc]onflict", r"-?[Ss]hortage",
    r"-?[Mm]ismanagem", r"-?[Ii]nequal",
)


def _rank_excluding(rows, score_index, stems, limit=100):
    """Return up to `limit` [term, score] pairs with the highest score.

    rows        -- list of [term, security_similarity, insecurity_similarity];
                   sorted in place by the chosen score, descending.
    score_index -- 1 to rank by security similarity, 2 by insecurity similarity.
    stems       -- regex fragments; a term in which any of them is found is skipped.
    limit       -- maximum number of pairs returned. Fewer are returned when the
                   table runs out of eligible terms, instead of an IndexError.
    """
    excluded=re.compile("|".join(stems))
    rows.sort(key=lambda row: row[score_index], reverse=True)
    selected=[]
    for row in rows:
        if len(selected) >= limit:
            break
        if excluded.search(row[0]):
            continue
        selected.append([str(row[0]), row[score_index]])
    return selected


# `list` is the [term, security, insecurity] table built in the previous ranking cell.
# The security ranking now stores the similarity score next to each term; it
# previously stored the term text twice.
new_security_100=_rank_excluding(list, 1, SECURITY_STEMS)
new_insecurity_100=_rank_excluding(list, 2, INSECURITY_STEMS)

In [17]:
# NOTE(review): a cell renders only its last expression, so the first line below
# is evaluated but not shown; the recorded output is new_insecurity_100 only.
# Move new_security_100 to its own cell to inspect it.
new_security_100
new_insecurity_100

[['domestic water inadequacy', 0.8182896881411039],
 ['rising water stress', 0.803384140622645],
 ['water stresses', 0.7872113658004375],
 ['water stress intensification', 0.7855068929751552],
 ['growing water pressures', 0.7818248710030486],
 ['water problems', 0.7812526658700675],
 ['increasing water stress', 0.780988037832511],
 ['inadequate water', 0.7800768035659214],
 ['urban water stress', 0.7799626606902154],
 ['water stress', 0.7799329249501475],
 ['Water stress', 0.7799329249501475],
 ['China&#x27;s water stress', 0.7799328919560904],
 ['water pollution problems', 0.7774577329255964],
 ['Water Human Water', 0.7763655071995357],
 ['water pressures', 0.7754442408867726],
 ['Water Governance Human Water', 0.7753718878133916],
 ['the water injustice', 0.7753608434031747],
 ['water security and water ecosystem', 0.7748516352443559],
 ['water stress alleviation', 0.7743938120503078],
 ['water problem', 0.7740979896943594],
 ['Water problem', 0.7740979896943594],
 ['water and water 

### The method does not yield useful results. A different method is applied: the similarity of a term to "water security" is compared with its similarity to "water insecurity". The output is two lists showing which terms describe one of our key terms considerably better than the other.

In [18]:
# Bounds on the absolute similarity difference (security minus insecurity):
# a term is kept when the magnitude lies strictly between interval_end and
# interval_start.
interval_start = 1.2
interval_end = 0.2

moresimilartosecurity = []
moresimilartoinsecurity = []
x = 0  # terms skipped because they contain "secur" (but not "insecur")
y = 0  # terms skipped because they contain "insecur"

for key, scores in dict.items():
    term = str(key)
    if re.search(r"-?[Ii]nsecur", term):
        y += 1
        continue
    if re.search(r"-?[Ss]ecur", term):
        x += 1
        continue

    # scores = [security similarity, insecurity similarity, difference]
    delta = scores[2]
    if interval_end < delta < interval_start:
        # Clearly closer to "water security": keep the security similarity.
        moresimilartosecurity.append([key, delta, scores[0]])
    elif -interval_start < delta < -interval_end:
        # Clearly closer to "water insecurity": keep the insecurity similarity.
        moresimilartoinsecurity.append([key, delta, scores[1]])

In [19]:
# Largest positive difference first, i.e. the terms leaning most towards
# "water security"; one [term, difference, security similarity] entry per line.
moresimilartosecurity.sort(reverse=True, key=lambda entry: entry[1])
for e in moresimilartosecurity:
    print(e)

['software-assisted alarms', 0.28220993053442334, 0.5280494383115211]
['Services Solution software', 0.2821529959251879, 0.5472799547388345]
['automated surveillance systems', 0.2714625562050198, 0.5776178592216291]
['library service', 0.26710498088321716, 0.4359822803844821]
['‘Digital Service Systems Engineering’(DSSE', 0.26707287855124257, 0.5709480095569027]
['software', 0.26613838408571144, 0.36576333118898674]
['Software', 0.26613838408571144, 0.36576333118898674]
['VOSviewer software', 0.26613838408571144, 0.36576333118898674]
['MODFLOW software', 0.26613838408571144, 0.36576333118898674]
['key installations', 0.26607744534803074, 0.5997200320176238]
['Windows application', 0.26525635243857915, 0.5064321189074666]
['two software tools', 0.2642209431624064, 0.5001725347286695]
['reliable access', 0.26418488922406386, 0.5504468728654472]
['service upgrading', 0.25843661058496603, 0.48711132123973655]
['service', 0.25816043676536626, 0.4356555798504913]
['Service', 0.25816043676536

In [20]:
# Most negative difference first, i.e. the terms leaning most towards
# "water insecurity"; one [term, difference, insecurity similarity] entry per line.
moresimilartoinsecurity.sort(key=lambda entry: entry[1], reverse=False)
for e in moresimilartoinsecurity:
    print(e)

['inadequacy', -0.34915479717654657, 0.5007363474793485]
['underdevelopment', -0.34491052000786665, 0.3162259265397999]
['inequity', -0.3327345626885614, 0.3919782580213105]
['desperation', -0.3282534050171333, 0.5094793002983272]
['illiteracy', -0.3271781560517014, 0.4002863653757484]
['alienation', -0.3135126275952393, 0.3903630333741718]
['mistrust', -0.3109673898077504, 0.4473331685796872]
['marginalisation', -0.3098183977806135, 0.2657016799552846]
['aridity', -0.3065704227004398, 0.29922694627148705]
['Biodimatic aridity', -0.3065704227004398, 0.29922694627148705]
['precariousness', -0.306203306928749, 0.27779514186760024]
['women&#x27;s marginalization', -0.30604852127079346, 0.34183889996173267]
['marginalization', -0.30604852127079346, 0.34183889996173267]
['Malnutrition', -0.300089623573933, 0.4982890881059077]
['malnutrition', -0.300089623573933, 0.4982890881059077]
['hostility', -0.2919830810926086, 0.439943029300464]
['scarcity', -0.2902291766677326, 0.5607262135374755]
['