In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from math import radians, cos, sin, asin, sqrt, atan2
import matplotlib.pyplot as plt
%matplotlib inline
import networkx as nx

from collections import Counter
from datetime import datetime
from pyproj import Proj, transform

import rasterio
from rasterio.transform import xy
from rasterio.plot import show
from rasterio.features import geometry_mask

from shapely.geometry import Point, Polygon, LineString, MultiLineString

import statsmodels.formula.api as smf
import tqdm.notebook as tqdm

import seaborn as sns
# from osgeo import gdal
sns.set()
%config InlineBackend.figure_format = 'retina'

In [3]:
data_path = '/home/gleb/Desktop/thesis/data/'

countries_path = data_path + 'world-administrative-boundaries/'
basins_6_path = data_path + 'HydroBASINS Africa 6 level/'
basins_12_path = data_path + 'HydroBASINS Africa 12 level/'
conflict_path = data_path + 'conflicts/'
dams_path = data_path + 'GRanD_Version_1_3/'
sheds_file_3 = data_path + 'hyd_af_dem_3s/af_dem_3s.tif'
sheds_file_15 = data_path + 'hyd_af_dem_15s/hyd_af_dem_15s.tif'
rivers_file = data_path + 'HydroRIVERS_v10_af/HydroRIVERS_v10_af.gdb'

output_path = '/home/gleb/Desktop/thesis/outcomes/'

In [4]:
rivers = gpd.read_file(rivers_file)
levels = [1,2,3,4,5]
rivers_main = rivers[rivers['ORD_CLAS'].apply(lambda x: True if x in levels else False)].copy()
rivers_main.reset_index(inplace=True, drop=True)

In [5]:
with rasterio.open(sheds_file_15) as dem:
    dem_data = dem.read(1)
    transformation = dem.transform

In [4]:
def calculate_riv_gradient(elevation_data, transform, line_string):
    points = [xy for xy in line_string.coords]
    gradient_values = []
    distance_values = []
    for idx in range(len(points) - 1):
        point1, point2 = points[idx], points[idx + 1]
        col1, row1 = rasterio.transform.rowcol(transformation, point1[0], point1[1])
        col2, row2 = rasterio.transform.rowcol(transformation, point2[0], point2[1])
        elevation1 = dem_data[col1, row1]
#         print('Elevation level of point 1: ' + str(elevation1))
        elevation2 = dem_data[col2, row2]
#         print('Elevation level of point 2: ' + str(elevation2))
        distance = haversine(point1[0], point1[1], point2[0], point2[1])
#         print('Distance between points: ' + str(distance))
        gradient = ((elevation1 - elevation2) / distance)*100
#         print('Gradient between points: ' + str(gradient))
        distance_values.append(distance)
        gradient_values.append(gradient)
    return gradient_values, distance_values

def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 * 1000
    return c * r

def gradient_fits(grad, category='suitable'):
    if category=='suitable':
        if (grad >= 1.5) & (grad <= 3):
            return True
        elif grad >= 6:
            return True
        else:
            return False
    elif category=='1.5-3':
        if (grad >= 1.5) & (grad <= 3):
            return True
        else:
            return False
    elif category == '3-6':
        if (grad > 3) & (grad < 6):
            return True
        else:
            return False
    elif category == '>6':
        if grad >= 6:
            return True 
        else:
            return False

# Test for river distances:

In [7]:
# if isinstance(river_geoms, MultiLineString):
#     grads, dists = calculate_gradient(dem_data, transformation, river_geoms[0])
index = 24740
river_geoms = rivers_main.loc[index, 'geometry']
grads, dists = calculate_riv_gradient(dem_data, transformation, river_geoms.geoms[0])
sum(dists)

7221.239512363846

In [8]:
rivers_main.iloc[index:index+1, :]

Unnamed: 0,HYRIV_ID,NEXT_DOWN,MAIN_RIV,LENGTH_KM,DIST_DN_KM,DIST_UP_KM,CATCH_SKM,UPLAND_SKM,ENDORHEIC,DIS_AV_CMS,ORD_STRA,ORD_CLAS,ORD_FLOW,HYBAS_L12,Shape_Length,geometry
24740,10025303,10024963,10018841,7.22,137.100006,11.1,25.48,25.5,0,0.047,1,2,8,1120109000.0,0.07092,"MULTILINESTRING ((-6.55833 33.26250, -6.55208 ..."


# Basins:

In [9]:
bas_6 = gpd.read_file(basins_6_path + 'hybas_af_lev06_v1c.shp')
bas_12 = gpd.read_file(basins_12_path + 'hybas_af_lev12_v1c.shp')
bas_12['PFAF_ID_6l'] = bas_12['PFAF_ID'].apply(lambda x: int(str(x)[:6]))
bas_l12_to_6 = pd.Series(bas_12.PFAF_ID_6l.values,index=bas_12.HYBAS_ID).to_dict()

In [None]:
# Calculate IV and geographic controls for each basin: 
# 1. Revir Gradient = ratio_of_grad, check
# 2. basin size = SUB_AREA (in sq.km), check
# 3. elevation = average_elevation, check
# 4. average basin gradient = average_gradient, check
# 5. river length= tot_riv_dist, check

In [10]:
# Matching rivers to basins of 6th level using PFAF ids:
rivers['HYBAS_L6_PFAFID'] = rivers['HYBAS_L12'].apply(lambda x: bas_l12_to_6[x])
rivers_main['HYBAS_L6_PFAFID'] = rivers_main['HYBAS_L12'].apply(lambda x: bas_l12_to_6[x])

# Dictionary of rivers geometries. Keys: rivers' HYRIV_ID. Values: geometries.
riv_geoms = pd.Series(rivers.geometry.values,index=rivers.HYRIV_ID).to_dict()

# Dictionary basins-rivers. Keys: Basin's PFAF index. Values: lists of 'HYRIV_ID' that belong to a basin.
bas_riv_dict = rivers_main.groupby('HYBAS_L6_PFAFID')['HYRIV_ID'].apply(list).to_dict()
# bas_riv_dict = rivers.groupby('HYBAS_L6_PFAFID')['HYRIV_ID'].apply(list).to_dict()

In [11]:
# Calculate ratios of suitable river gradients and total length of rivers for basins:
ratios_RG = {}
ratios_15_3 = {}
ratios_3_6 = {}
ratios_more_6 = {}
basin_riv_dist = {}

for pfaf_id in set(bas_6.PFAF_ID):
    print('Calculated ratio of IDs: ' + str(len(ratios_RG)/len(set(bas_6.PFAF_ID))), end = '\r')
    if pfaf_id in bas_riv_dict.keys():
        rivers_in_basin = bas_riv_dict[pfaf_id]
        
        total_rivers_distance = 0
        fitted_gradients_distance_RG = 0
        fitted_gradients_distance_15_3 = 0
        fitted_gradients_distance_3_6 = 0
        fitted_gradients_distance_more_6 = 0
        
        for riv in rivers_in_basin:
            river_geom = riv_geoms[riv]
            grads, dists = calculate_riv_gradient(dem_data, transformation, river_geom.geoms[0])
            # Which sections of the river have gradient 1.5-3% or >3%:
            grad_fits_RG = [gradient_fits(grad, category='suitable') for grad in grads]
            grad_fits_15_3 = [gradient_fits(grad, category='1.5-3') for grad in grads]
            grad_fits_3_6 = [gradient_fits(grad, category='3-6') for grad in grads]
            grad_fits_more_6 = [gradient_fits(grad, category='>6') for grad in grads]
            # Calculate distance of sections within needed gradient:
            fitted_gradients_distance_RG += sum([x for x, y in zip(dists, grad_fits_RG) if y])
            fitted_gradients_distance_15_3 += sum([x for x, y in zip(dists, grad_fits_15_3) if y])
            fitted_gradients_distance_3_6 += sum([x for x, y in zip(dists, grad_fits_3_6) if y])
            fitted_gradients_distance_more_6 += sum([x for x, y in zip(dists, grad_fits_more_6) if y])
            # Add length of the river to the distance of all rivers in the basin:
            total_rivers_distance += sum(dists)
        ratio_RG = fitted_gradients_distance_RG/total_rivers_distance
        ratio_15_3 = fitted_gradients_distance_15_3/total_rivers_distance
        ratio_3_6 = fitted_gradients_distance_3_6/total_rivers_distance
        ratio_more_6 = fitted_gradients_distance_more_6/total_rivers_distance
        
        ratios_RG[pfaf_id] = ratio_RG
        ratios_15_3[pfaf_id] = ratio_15_3
        ratios_3_6[pfaf_id] = ratio_3_6
        ratios_more_6[pfaf_id] = ratio_more_6
        basin_riv_dist[pfaf_id] = total_rivers_distance
    else:
        ratios_RG[pfaf_id] = 0
        ratios_15_3[pfaf_id] = 0
        ratios_3_6[pfaf_id] = 0
        ratios_more_6[pfaf_id] = 0
        basin_riv_dist[pfaf_id] = 0

Calculated ratio of IDs: 0.99972199054767864543

In [12]:
bas_6['RG'] = bas_6['PFAF_ID'].apply(lambda x: ratios_RG[x])
bas_6['grad_15_3'] = bas_6['PFAF_ID'].apply(lambda x: ratios_15_3[x])
bas_6['grad_3_6'] = bas_6['PFAF_ID'].apply(lambda x: ratios_3_6[x])
bas_6['grad_more_6'] = bas_6['PFAF_ID'].apply(lambda x: ratios_more_6[x])
bas_6['tot_riv_dist'] = bas_6['PFAF_ID'].apply(lambda x: int(basin_riv_dist[x]))

In [15]:
# bas_6.to_file(output_path + 'hybas_af_lev06_v1c_grads.shp')
bas_6 = gpd.read_file(output_path + 'hybas_af_lev06_v1c_grads.shp')
bas_6.rename(columns={'grad_more_':'grad_more_6', 'tot_riv_di':'tot_riv_dist'}, inplace=True)

In [39]:
# # Calculate average elevation and gradient for basins:
# def calculate_bas_controls(geom, dem_file=dem_data):
#     mask = geometry_mask([geom], out_shape=dem_file.shape, transform=transformation,
#                          all_touched=False, invert=False)
#     # Calculate average elevation:
#     basin_elevations = np.ma.array(dem_data, mask=mask)
#     av_elevation = basin_elevations.mean()
# #     print('Elevation calculated') 
#     # Calculate average gradient: 
#     max_indices = np.unravel_index(np.argmax(basin_elevations), basin_elevations.shape)
#     min_indices = np.unravel_index(np.argmin(basin_elevations), basin_elevations.shape)
#     max_lon, max_lat = xy(transformation, max_indices[0], max_indices[1])
#     min_lon, min_lat = xy(transformation, min_indices[0], min_indices[1])
#     distance = haversine(min_lon, min_lat, max_lon, max_lat)
#     av_gradient = ((basin_elevations[max_indices] - basin_elevations[min_indices]) / distance)*100
# #     print('Gradient calculated')
#     return [av_elevation, av_gradient]

# bas_6[['av_elev','av_grad']] = pd.DataFrame(bas_6['geometry'].apply(lambda x: 
#                                                                     calculate_bas_controls(x)).tolist(),
#                                                                index= bas_6.index)

In [16]:
# Calculate average elevation and gradient for basins:
def calculate_bas_controls(geom, dem_file=dem_data):
    mask = geometry_mask([geom], out_shape=dem_file.shape, transform=transformation,
                         all_touched=False, invert=False)
    # Calculate average elevation:
    basin_elevations = np.ma.array(dem_data, mask=mask)
    av_elevation = basin_elevations.mean()
    basin_elevations = basin_elevations[~basin_elevations.mask].data
    n_pixels = len(basin_elevations)
    pxs_25 = len(basin_elevations[basin_elevations<=250])
    pxs_25_50 = len(basin_elevations[(basin_elevations>250) & (basin_elevations<=500)])
    pxs_50_1k = len(basin_elevations[(basin_elevations>500) & (basin_elevations<=1000)])
    pxs_more_1k = len(basin_elevations[(basin_elevations>1000) & (basin_elevations<=10000)])
    share_less_25 = pxs_25/n_pixels
    share_25_50 = pxs_25_50/n_pixels
    share_50_1k = pxs_50_1k/n_pixels
    share_more_1k = pxs_more_1k/n_pixels
    return [av_elevation, share_less_25, share_25_50, share_50_1k, share_more_1k]

bas_6[['av_elev', 'shr_l_25', 'shr_25_50', 'shr_50_1k', 'shr_m_1k']] = pd.DataFrame(bas_6['geometry'].apply(
    lambda x: calculate_bas_controls(x)).tolist(), index= bas_6.index)



In [23]:
# bas_6.to_file(output_path + '5rivers_controls.shp')
bas_6 = gpd.read_file(output_path + '5rivers_controls.shp')
bas_6.rename(columns={'grad_more_':'grad_more_6', 'tot_riv_di':'tot_riv_dist'}, inplace=True)
bas_6['HYBAS_ID'] = bas_6['HYBAS_ID'].apply(lambda x: str(x))
bas_6['PFAF_ID'] = bas_6['PFAF_ID'].apply(lambda x: str(x))

# Dams for 1st stage:

In [6]:
dams_df = gpd.read_file(dams_path + 'GRanD_dams_v1_3.shp')
# Wherever main year is -99, so is alternative year:
dams_df = dams_df[dams_df['YEAR'] != -99]
dams_df.reset_index(inplace=True,drop=True)

In [6]:
# # dams_df[(dams_df['YEAR'] == -99)&(dams_df['COUNTRY'] == 'Zimbabwe')]
# a = dams_df[(dams_df['YEAR'] == -99)][['RES_NAME', 'DAM_NAME', 'ALT_NAME', 'RIVER',
#                                        'ALT_RIVER', 'GRAND_ID', 'COUNTRY', 'USE_ELEC', 'USE_IRRI']].copy()

# uses = {'Main', 'Sec', 'Major'}
# a['el_ir'] = a.apply(lambda row: 1 if (row.USE_ELEC in uses) or 
#                                               (row.USE_IRRI in uses) else 0, axis=1)

# print(a[a['el_ir']==1]['COUNTRY'].value_counts())

# a = a[a['el_ir']==1]
# a.reset_index(drop=True,inplace=True)

# Nigeria: 8,
# Zimbabwe: 7,
# South Africa: 3,
# Burkina Faso: 2,
# Tunisia: 2,
# Egypt: 2,
# Ethiopia: 1,
# Lesotho: 1,
# Botswana: 2,
# Mozambique: 1,
# Zambia: 1,
# Namibia: 1,
# Togo: 1,
# Ivory Coast: 1

In [7]:
any_dams_97 = dams_df[dams_df['YEAR'] <= 1999].copy()
any_dams_97.reset_index(inplace=True,drop=True)
any_dams_89 = dams_df[dams_df['YEAR'] <= 1989].copy()
any_dams_89.reset_index(inplace=True,drop=True)

# uses = {'Major'}
uses = {'Main', 'Major'}
# uses = {'Main', 'Sec', 'Major'}

any_dams_97['el_ir'] = any_dams_97.apply(lambda row: 1 if (row.USE_ELEC in uses) or 
                                              (row.USE_IRRI in uses) else 0, axis=1)
any_dams_97['el'] = any_dams_97.apply(lambda row: 1 if row.USE_ELEC in uses else 0, axis=1)
any_dams_97['ir'] = any_dams_97.apply(lambda row: 1 if row.USE_IRRI in uses else 0, axis=1)

any_dams_89['el_ir'] = any_dams_89.apply(lambda row: 1 if (row.USE_ELEC in uses) or 
                                              (row.USE_IRRI in uses) else 0, axis=1)
any_dams_89['el'] = any_dams_89.apply(lambda row: 1 if row.USE_ELEC in uses else 0, axis=1)
any_dams_89['ir'] = any_dams_89.apply(lambda row: 1 if row.USE_IRRI in uses else 0, axis=1)

dams_el_ir_97 = any_dams_97[any_dams_97['el_ir'] == 1]
dams_el_ir_97.reset_index(inplace=True,drop=True)
dams_el_97 = any_dams_97[any_dams_97['el'] == 1]
dams_el_97.reset_index(inplace=True,drop=True)
dams_ir_97 = any_dams_97[any_dams_97['ir'] == 1]
dams_ir_97.reset_index(inplace=True,drop=True)

dams_el_ir_89 = any_dams_89[any_dams_89['el_ir'] == 1]
dams_el_ir_89.reset_index(inplace=True,drop=True)
dams_el_89 = any_dams_89[any_dams_89['el'] == 1]
dams_el_89.reset_index(inplace=True,drop=True)
dams_ir_89 = any_dams_89[any_dams_89['ir'] == 1]
dams_ir_89.reset_index(inplace=True,drop=True)

In [10]:
# Hydropower dams before 1997:
joined = gpd.sjoin(dams_el_97[['GRAND_ID', 'YEAR', 'MAIN_USE', 'geometry']],
                   bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']], op='within', how='left')

basin_dam_dict = {}
for hydrobasin_id, dams_in_basin in joined.groupby('HYBAS_ID'):
    dam_ids = dams_in_basin['GRAND_ID'].tolist()
    basin_dam_dict[hydrobasin_id] = dam_ids
    
bas_6['has_dam'] = bas_6['HYBAS_ID'].apply(lambda x: 1 if x in basin_dam_dict.keys() else 0)
bas_6['n_of_dams'] = bas_6['HYBAS_ID'].apply(lambda x: len(basin_dam_dict[x]) if x in 
                                                  basin_dam_dict.keys() else 0)

first_stage1 = smf.ols("has_dam ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()
first_stage2 = smf.ols("n_of_dams ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()

rows, model = [], first_stage1
for idx in range (len(first_stage1.params)-3, len(first_stage1.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

rows, model = [], first_stage2
for idx in range (len(first_stage2.params)-3, len(first_stage2.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

                   Coef:     P>|t|
0    grad_15_3  0.102507  0.047256
1     grad_3_6  0.015465  0.885657
2  grad_more_6  0.153903  0.069869
                   Coef:     P>|t|
0    grad_15_3  0.082220  0.222317
1     grad_3_6  0.128011  0.361449
2  grad_more_6  0.163503  0.139737


In [11]:
# Hydropower dams before 1989:
joined = gpd.sjoin(dams_el_89[['GRAND_ID', 'YEAR', 'MAIN_USE', 'TIMELINE', 'geometry']],
                   bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']], op='within', how='left')

basin_dam_dict = {}
for hydrobasin_id, dams_in_basin in joined.groupby('HYBAS_ID'):
    dam_ids = dams_in_basin['GRAND_ID'].tolist()
    basin_dam_dict[hydrobasin_id] = dam_ids
    
bas_6['has_dam'] = bas_6['HYBAS_ID'].apply(lambda x: 1 if x in basin_dam_dict.keys() else 0)
bas_6['n_of_dams'] = bas_6['HYBAS_ID'].apply(lambda x: len(basin_dam_dict[x]) if x in 
                                                  basin_dam_dict.keys() else 0)

first_stage3 = smf.ols("has_dam ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()

first_stage4 = smf.ols("n_of_dams ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()

rows, model = [], first_stage3
for idx in range (len(first_stage3.params)-3, len(first_stage3.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

rows, model = [], first_stage4
for idx in range (len(first_stage2.params)-3, len(first_stage2.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

                   Coef:     P>|t|
0    grad_15_3  0.116317  0.021539
1     grad_3_6 -0.033116  0.753227
2  grad_more_6  0.153571  0.064769
                   Coef:     P>|t|
0    grad_15_3  0.096030  0.149253
1     grad_3_6  0.079430  0.566657
2  grad_more_6  0.163171  0.135912


In [12]:
# Irrigation dams before 1997:
joined = gpd.sjoin(dams_ir_97[['GRAND_ID', 'YEAR', 'MAIN_USE', 'TIMELINE', 'geometry']],
                   bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']], op='within', how='left')

basin_dam_dict = {}
for hydrobasin_id, dams_in_basin in joined.groupby('HYBAS_ID'):
    dam_ids = dams_in_basin['GRAND_ID'].tolist()
    basin_dam_dict[hydrobasin_id] = dam_ids
    
bas_6['has_dam'] = bas_6['HYBAS_ID'].apply(lambda x: 1 if x in basin_dam_dict.keys() else 0)
bas_6['n_of_dams'] = bas_6['HYBAS_ID'].apply(lambda x: len(basin_dam_dict[x]) if x in 
                                                  basin_dam_dict.keys() else 0)

# Only dams for irrigation or electricity:

first_stage5 = smf.ols("has_dam ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()
first_stage6 = smf.ols("n_of_dams ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()

rows, model = [], first_stage5
for idx in range (len(first_stage5.params)-3, len(first_stage5.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

rows, model = [], first_stage6
for idx in range (len(first_stage6.params)-3, len(first_stage6.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

                   Coef:     P>|t|
0    grad_15_3  0.350065  0.000604
1     grad_3_6  0.429852  0.042999
2  grad_more_6 -0.373268  0.025983
                   Coef:     P>|t|
0    grad_15_3  1.037230  0.000180
1     grad_3_6  0.513601  0.372638
2  grad_more_6 -0.580859  0.201433


In [13]:
# Irrigation dams before 1989:
joined = gpd.sjoin(dams_ir_89[['GRAND_ID', 'YEAR', 'MAIN_USE', 'TIMELINE', 'geometry']],
                   bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']], op='within', how='left')

basin_dam_dict = {}
for hydrobasin_id, dams_in_basin in joined.groupby('HYBAS_ID'):
    dam_ids = dams_in_basin['GRAND_ID'].tolist()
    basin_dam_dict[hydrobasin_id] = dam_ids
    
bas_6['has_dam'] = bas_6['HYBAS_ID'].apply(lambda x: 1 if x in basin_dam_dict.keys() else 0)
bas_6['n_of_dams'] = bas_6['HYBAS_ID'].apply(lambda x: len(basin_dam_dict[x]) if x in 
                                                  basin_dam_dict.keys() else 0)

# Only dams for irrigation or electricity:

first_stage7 = smf.ols("has_dam ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()
first_stage8 = smf.ols("n_of_dams ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()

rows, model = [], first_stage7
for idx in range (len(first_stage7.params)-3, len(first_stage5.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

rows, model = [], first_stage8
for idx in range (len(first_stage5.params)-3, len(first_stage5.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

                   Coef:     P>|t|
0    grad_15_3  0.360826  0.000357
1     grad_3_6  0.389104  0.064302
2  grad_more_6 -0.362933  0.028797
                   Coef:     P>|t|
0    grad_15_3  1.067190  0.000082
1     grad_3_6  0.347510  0.537473
2  grad_more_6 -0.576698  0.194812


In [14]:
# Hyrdopower and irrigation dams before 1997:
joined = gpd.sjoin(dams_el_ir_97[['GRAND_ID', 'YEAR', 'MAIN_USE', 'TIMELINE', 'geometry']],
                   bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']], op='within', how='left')

basin_dam_dict = {}
for hydrobasin_id, dams_in_basin in joined.groupby('HYBAS_ID'):
    dam_ids = dams_in_basin['GRAND_ID'].tolist()
    basin_dam_dict[hydrobasin_id] = dam_ids
    
bas_6['has_dam'] = bas_6['HYBAS_ID'].apply(lambda x: 1 if x in basin_dam_dict.keys() else 0)
bas_6['n_of_dams'] = bas_6['HYBAS_ID'].apply(lambda x: len(basin_dam_dict[x]) if x in 
                                                  basin_dam_dict.keys() else 0)

# Only dams for irrigation or electricity:

first_stage9 = smf.ols("has_dam ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()
first_stage10 = smf.ols("n_of_dams ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()

rows, model = [], first_stage9
for idx in range (len(first_stage9.params)-3, len(first_stage9.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

rows, model = [], first_stage10
for idx in range (len(first_stage10.params)-3, len(first_stage10.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

                   Coef:     P>|t|
0    grad_15_3  0.412424  0.000182
1     grad_3_6  0.481213  0.035862
2  grad_more_6 -0.220129  0.223777
                   Coef:     P>|t|
0    grad_15_3  1.112894  0.000096
1     grad_3_6  0.650249  0.273350
2  grad_more_6 -0.425002  0.364328


In [15]:
# Hyrdopower and irrigation dams before 1989:
joined = gpd.sjoin(any_dams_89[['GRAND_ID', 'YEAR', 'MAIN_USE', 'TIMELINE', 'geometry']],
                   bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']], op='within', how='left')

basin_dam_dict = {}
for hydrobasin_id, dams_in_basin in joined.groupby('HYBAS_ID'):
    dam_ids = dams_in_basin['GRAND_ID'].tolist()
    basin_dam_dict[hydrobasin_id] = dam_ids

bas_6['has_dam'] = bas_6['HYBAS_ID'].apply(lambda x: 1 if x in basin_dam_dict.keys() else 0)
bas_6['n_of_dams'] = bas_6['HYBAS_ID'].apply(lambda x: len(basin_dam_dict[x]) if x in 
                                                  basin_dam_dict.keys() else 0)

# Only dams for irrigation or electricity:

first_stage11 = smf.ols("has_dam ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()
first_stage12 = smf.ols("n_of_dams ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + grad_15_3 + grad_3_6 + grad_more_6",
                      data=bas_6).fit()

rows, model = [], first_stage11
for idx in range (len(first_stage11.params)-3, len(first_stage11.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

rows, model = [], first_stage12
for idx in range (len(first_stage12.params)-3, len(first_stage12.params)):
    rows.append([model.params.index[idx], model.params[idx], float(model.pvalues[idx])])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

                   Coef:     P>|t|
0    grad_15_3  0.497319  0.000042
1     grad_3_6  0.449326  0.075291
2  grad_more_6 -0.346480  0.082237
                   Coef:     P>|t|
0    grad_15_3  1.861760  0.000031
1     grad_3_6  0.224746  0.808960
2  grad_more_6 -1.177234  0.108648


In [16]:
# Hydropower dams before 1997:
joined = gpd.sjoin(dams_el_97[['GRAND_ID', 'YEAR', 'MAIN_USE', 'geometry']],
                   bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']], op='within', how='left')

basin_dam_dict = {}
for hydrobasin_id, dams_in_basin in joined.groupby('HYBAS_ID'):
    dam_ids = dams_in_basin['GRAND_ID'].tolist()
    basin_dam_dict[hydrobasin_id] = dam_ids
    
bas_6['has_dam'] = bas_6['HYBAS_ID'].apply(lambda x: 1 if x in basin_dam_dict.keys() else 0)
bas_6['n_of_dams'] = bas_6['HYBAS_ID'].apply(lambda x: len(basin_dam_dict[x]) if x in 
                                                  basin_dam_dict.keys() else 0)

first_stage1 = smf.ols("has_dam ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + RG",
                      data=bas_6).fit()
first_stage2 = smf.ols("n_of_dams ~ SUB_AREA + av_elev + shr_25_50 + shr_50_1k + shr_m_1k + tot_riv_dist + RG",
                      data=bas_6).fit()

# print("ratio_of_grad parameter estimate:, ", first_stage.params["RG"])
# print("ratio_of_grad p-value:, ", first_stage.pvalues["RG"])
# print(first_stage1.summary())
# print(first_stage2.summary())

# rows, model = [], first_stage1
# for idx in range (len(first_stage1.params)-3, len(first_stage1.params)):
#     rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
# print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

rows, model = [], first_stage2
for idx in range (len(first_stage2.params)-3, len(first_stage2.params)):
    rows.append([model.params.index[idx], model.params[idx], model.pvalues[idx]])
print(pd.DataFrame(rows, columns=[' ', 'Coef:', 'P>|t|']))

                        Coef:         P>|t|
0      shr_m_1k -4.179289e-03  8.851845e-01
1  tot_riv_dist  9.009867e-09  6.398244e-03
2            RG  1.579615e-01  7.058144e-07


## Assign basins to the countries and vice versa:

In [24]:
bas_temp = bas_6[['HYBAS_ID', 'geometry']].copy()
# Create centroids of basins:
bas_temp["geometry"] = bas_temp["geometry"].centroid
# Read boundaries of countries:
countries = gpd.read_file(countries_path + 'world-administrative-boundaries.shp')
cntry_name_match = {'Bosnia & Herzegovina':'Bosnia and Herzegovina',
                    'Democratic Republic of the Congo':'Congo (DRC)',
                    'Iran (Islamic Republic of)':'Iran',
                    "CÃ´te d'Ivoire":'Ivory Coast',"Lao People's Democratic Republic":'Laos',
                    'Libyan Arab Jamahiriya':'Libya', "Ma'tan al-Sarra":'Libya',
                    'The former Yugoslav Republic of Macedonia':'Macedonia',
                    'Moldova, Republic of':'Moldova', 'Myanmar':'Myanmar (Burma)',
                    "Democratic People's Republic of Korea": 'North Korea',
                    'Russian Federation':'Russia', 'Republic of Korea':'South Korea',
                    'Syrian Arab Republic':'Syria', 'United Republic of Tanzania':'Tanzania',
                    'U.K. of Great Britain and Northern Ireland':'United Kingdom','United States of America':'United States'}

countries['name'] = countries['name'].apply(lambda x: cntry_name_match[x] if x in cntry_name_match.keys() else x)
afr_countries = countries[countries.continent == 'Africa'].copy()
afr_countries.reset_index(inplace=True, drop=True)

# Match centroids of basins with countries:
joined = gpd.sjoin(bas_temp, afr_countries, how='left', op='within')
joined = joined[~joined[['index_right']].isna().any(axis=1)].reset_index(drop=True)
bas_country_dict = pd.Series(joined.name.values,index=joined.HYBAS_ID).to_dict()

# Create lists of basins for each corresponding country:
country_bas_dict = {}
for country, basins_in_country in joined.groupby('name'):
    bas_ids = basins_in_country['HYBAS_ID'].to_list()
    country_bas_dict[country] = bas_ids
    
# Hold accounts of not assigned basins
not_assigned_bas = set(joined[joined[['iso3']].isna().any(axis=1)]['HYBAS_ID'])

In [46]:
# # Manually add coastal basins for which centroid lies outside of the country borders:
# Extend not assigned basins dict:
not_assigned_bas = {'1060008110': 'Somalia',
                    # '1060003780':,
                    '1060020500': 'Gabon',
                    '1060023020': 'Nigeria',
                    '1060032860': 'Libyan Arab Jamahiriya',
                    # '1060034490': 'Sao Tome and Principe',
                    # '1060034610': ,
                    # '1060034900': ,
                    '1060035090': 'United Republic of Tanzania',
                    # '1060040030': 'Madagascar',
                    # '1060040040': 'Madagascar',
                    # '1060040050': 'Comoros',
                    # '1060040110': 'Seychelles',
                    # '1060040140': 'Seychelles',
                    # '1060040160': ,
                    # '1060040180':
                    }

# Apply dicts to fill the countries for basins:
bas_6['Country'] = bas_6['HYBAS_ID'].apply(lambda x: bas_country_dict.get(x, 'None'))
bas_6['Country'] = bas_6['Country'].apply(lambda x: not_assigned_bas.get(x, x))

# Drop basins w/o assigned countries (these basins have coastal and island nature): 
bas_6 = bas_6[bas_6['Country'] != 'None']
bas_6.reset_index(inplace=True,drop=True)

In [50]:
# Adjust the disputed territories:
country_adjust = {"Hala'ib Triangle": 'Egypt',
                  'Ilemi Triangle': 'Kenya'}
bas_6['Country'] = bas_6['Country'].apply(lambda x: country_adjust[x] if x in country_adjust.keys() else x)

# # Following the identification strategy, I remove countries which occupy only 1 hydrobasin
# Find countries with 1 hydrobasin:
countries_to_remove = set()
for country in country_bas_dict.keys():
    if len(country_bas_dict[country]) == 1:
        countries_to_remove.add(country)

# {'Burundi', 'Djibouti', 'Ilemi Triangle'}

bas_6 = bas_6[~bas_6['Country'].isin(countries_to_remove)].copy()
bas_6.reset_index(drop=True,inplace=True)

In [9]:
# bas_6.to_file(output_path + '5rivers_all_controls.shp')
bas_6 = gpd.read_file(output_path + '5rivers_all_controls.shp')
bas_6.rename(columns={'grad_more_':'grad_more_6', 'tot_riv_di':'tot_riv_dist'}, inplace=True)
bas_6['HYBAS_ID'] = bas_6['HYBAS_ID'].apply(lambda x: str(x))
bas_6['PFAF_ID'] = bas_6['PFAF_ID'].apply(lambda x: str(x))

In [56]:
# Explode the basin data frame by month (1-12) and year (1997-2023) to a panel dataset:
bas_6['year'] = bas_6['HYBAS_ID'].apply(lambda x: [year for year in range(1999, 2024)])
bas_6_exp = bas_6.explode('year', ignore_index=True)
# bas_6_exp['month'] = bas_6_exp['HYBAS_ID'].apply(lambda x: [month for month in range(1, 13)])
# bas_6_exp = bas_6_exp.explode('month', ignore_index=True)

## Match dams per country per year with basins:

In [57]:
# IDs of dams that were removed in corresponding years:
rem_dams = dams_df[dams_df['REM_YEAR'] != -99].copy().reset_index(drop=True)
# rem_dams = pd.Series(rem_dams.REM_YEAR.values,index=rem_dams.GRAND_ID).to_dict()
year_rem_dict = {}
for year, data in rem_dams.groupby('REM_YEAR'):
    dam_ids = data['GRAND_ID'].to_list()
    year_rem_dict[year] = dam_ids
year_rem_dict

{1987: [1918],
 2002: [2085],
 2003: [769],
 2005: [2006],
 2007: [820, 2844],
 2008: [772, 2882],
 2013: [6315],
 2016: [6285]}

In [63]:
# # Calculate total number of dams in country c in year t:
# Split the dams df:
year_to_use = 1999
dams_before_start = dams_df[dams_df['YEAR'] < year_to_use]

# Create a dict of countries and dams:
dams_p_cntr_at_start = dams_before_start.groupby('COUNTRY').count().reset_index()
dams_p_cntr_at_start = pd.Series(dams_p_cntr_at_start.GRAND_ID.values,
                                index=dams_p_cntr_at_start.COUNTRY).to_dict()

# Create column in for dams per country in 1999:
bas_6_exp['dams_per_count_991'] = bas_6_exp['Country'].apply(lambda x: dams_p_cntr_at_start.get(x, 0))

In [65]:
# # Calculate dams for country c for year t:
# Split the df:
dams_after_start = dams_df[dams_df['YEAR'] >= year_to_use]
dams_after_start = dams_after_start.groupby(['COUNTRY', 'YEAR']).count().reset_index()

# Create the dict where key is tuple of country and year:
new_dams_after_start = dict()
for index, row in dams_after_start.iterrows():
    key = (row['COUNTRY'], row['YEAR'])
    new_dams_after_start[key] = row['GRAND_ID']

# Create the dict of total amount of dams in country c for year t:
counts = set(bas_6_exp.Country)
tot_dams_per_country_year = {}
idx = 0
for country in counts:
    idx += 1
    print(idx/len(counts), end='\r')
    tot_dams_per_year = {}
    for year in range(1999, 2024):
        if year == 1999:
            if country in dams_p_cntr_at_start.keys():
                tot_dams_per_year[year] = dams_p_cntr_at_start[country]
            else:
                tot_dams_per_year[year] = 0
        else:
            key = (country, year)
            if key in new_dams_after_start.keys():
                tot_dams_per_year[year] = tot_dams_per_year[year-1] + new_dams_after_start[key]
            else:
                tot_dams_per_year[year] = tot_dams_per_year[year-1]
    tot_dams_per_country_year[country] = tot_dams_per_year
    
# Use the created dictionary to fill in the basins dataframe:
bas_6_exp['n_of_dam_in_c_per_y'] = 0
for idx, row in bas_6_exp.iterrows():
    print(idx/len(bas_6_exp), end='\r')
    bas_6_exp.loc[idx, 'n_of_dam_in_c_per_y'] = tot_dams_per_country_year[row['Country']][row['year']]

0.99998882369376924646

In [None]:
# Left to-dos:
# 1. Account for removed dams

In [4]:
# bas_6_exp.to_file(output_path + '5rivers_all_controls_panel.shp')
bas_6_exp = gpd.read_file(output_path + '5rivers_all_controls_panel.shp')
bas_6_exp.rename(columns={'grad_more_':'grad_more_6', 'tot_riv_di':'tot_riv_dist',
                      'n_of_dam_i':'dams_in_c_per_y', 'dams_per_c':'dams_per_c_99'}, inplace=True)
bas_6_exp['HYBAS_ID'] = bas_6_exp['HYBAS_ID'].apply(lambda x: str(x))
bas_6_exp['PFAF_ID'] = bas_6_exp['PFAF_ID'].apply(lambda x: str(x))
bas_6_exp['year'] = bas_6_exp['year'].apply(lambda x: int(x))

# Conflicts:

In [5]:
# CRS of df is EPSG:3857
df = gpd.read_file(conflict_path + '1997-01-01-2023-09-30.csv', sep=';', dtype={'timestamp': 'object'})
gdf_conf = gpd.GeoDataFrame(df, geometry = gpd.points_from_xy(df.longitude, df.latitude))
gdf_conf.set_crs('epsg:4326', inplace=True)
# gdf_conf.set_crs('epsg:3857', inplace=True)
# gdf_conf.to_crs(epsg=4326, inplace = True)

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,tags,timestamp,geometry
0,BFO10686,30 September 2023,2023,1,Political violence,Battles,Armed clash,JNIM: Group for Support of Islam and Muslims,,2,...,13.8478,-2.4172,1,Whatsapp,New media,"On 30 September 2023, presumed JNIM militants ...",0,,1696869398,POINT (-2.41720 13.84780)
1,BFO10693,30 September 2023,2023,1,Political violence,Battles,Armed clash,JNIM: Group for Support of Islam and Muslims,,2,...,12.7301,-4.0967,1,Facebook; Signal; Whatsapp,New media,"On 30 September 2023, presumed JNIM militants ...",0,,1696869398,POINT (-4.09670 12.73010)
2,BFO10708,30 September 2023,2023,1,Political violence,Violence against civilians,Attack,JNIM: Group for Support of Islam and Muslims,,2,...,13.7602,-2.426,1,Facebook,New media,"On 30 September 2023, presumed JNIM militants ...",3,,1696869398,POINT (-2.42600 13.76020)
3,CAO7889,30 September 2023,2023,2,Political violence,Violence against civilians,Attack,Islamic State (West Africa) and/or Boko Haram ...,,2,...,10.7424,13.8023,2,Twitter; Xinhua,New media-International,"Around 30 September 2023, ISWAP or Boko Haram ...",1,,1696869398,POINT (13.80230 10.74240)
4,CDI2876,30 September 2023,2023,1,Political violence,Violence against civilians,Attack,Police Forces of the Ivory Coast (2011-),,1,...,7.4125,-7.5538,1,Koaci News; Soir Info,National,"On 30 September 2023, policemen beat a civilia...",1,,1696869398,POINT (-7.55380 7.41250)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984432,SIE2,01 January 1997,1997,3,Political violence,Battles,Government regains territory,Military Forces of Sierra Leone (1996-1997),,1,...,8.4642,-10.9332,2,No Peace Without Justice; SL-LED,Local partner-New media,"Around 1 January 1997 (month of), Military For...",0,,1670286851,POINT (-10.93320 8.46420)
984433,SIE3,01 January 1997,1997,3,Political violence,Battles,Armed clash,Kamajor Militia,,3,...,8.1221,-11.7047,2,No Peace Without Justice; SL-LED,Local partner-New media,"Around 1 January 1997 (month of), Kamajor Mili...",0,,1670286851,POINT (-11.70470 8.12210)
984434,SIE6,01 January 1997,1997,3,Political violence,Violence against civilians,Attack,Military Forces of Sierra Leone (1996-1997),,1,...,7.5317,-12.4694,2,No Peace Without Justice; SL-LED,Local partner-New media,"Around 1 January 1997 (month of), Military For...",0,,1670286851,POINT (-12.46940 7.53170)
984435,SIE7,01 January 1997,1997,3,Political violence,Battles,Armed clash,Kamajor Militia,,3,...,8.231,-12.338,2,SL-LED; No Peace Without Justice,Local partner-New media,"Around 1 January 1997 (month of), Kamajor Mili...",0,,1670286851,POINT (-12.33800 8.23100)


In [6]:
# Filter by region:
regions = ['Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa']
gdf_conf = gdf_conf[gdf_conf['region'].apply(lambda x: True if x in regions else False)]
gdf_conf.reset_index(drop=True, inplace=True)

# Filter by battles:
battles_gdf = gdf_conf[gdf_conf.event_type == 'Battles']
battles_gdf.reset_index(drop=True, inplace=True)
battles_gdf = gdf_conf.copy()

del gdf_conf
del df

In [10]:
# Match geography of conflicts with hydrobasins:
joined = gpd.sjoin(bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']],
                   battles_gdf[['event_id_cnty', 'event_date', 'geometry', 'year', 'fatalities', 'timestamp']], how='right')
joined.drop(['index_left'], axis=1, inplace=True)
joined[['day', 'month', 'year']] = joined['event_date'].str.split(' ', expand = True)
joined['month_year'] = joined[['month', 'year']].apply(lambda x: ' '.join(x), axis=1)
joined['month_year'] = joined['month_year'].apply(lambda x: datetime.strptime(x, '%B %Y'))
joined['month'] = joined['month_year'].apply(lambda x: x.month)
joined['year'] = joined['month_year'].apply(lambda x: x.year)

# Create dict of lists of conflicts for every combination with of basin ID, month and year:
basin_conf_dict = {}
# for hydrobasin_id, confs_in_basin in joined.groupby(['HYBAS_ID', 'month', 'year']):
for hydrobasin_id, confs_in_basin in joined.groupby(['HYBAS_ID', 'year']):
    confs_ids = confs_in_basin['event_id_cnty'].tolist()
    basin_conf_dict[hydrobasin_id] = confs_ids

In [11]:
basin_conf_dict

{('1060000010', 2007): ['EGY322'],
 ('1060000010', 2011): ['EGY1414',
  'EGY1257',
  'EGY1184',
  'EGY1165',
  'EGY1163',
  'EGY9422',
  'EGY9423',
  'EGY715',
  'EGY694',
  'EGY674',
  'EGY675',
  'EGY658',
  'EGY660',
  'EGY9415',
  'EGY657'],
 ('1060000010', 2012): ['EGY1879',
  'EGY1830',
  'EGY1775',
  'EGY1717',
  'EGY1602',
  'EGY1582',
  'EGY1487',
  'EGY1476',
  'EGY1453'],
 ('1060000010', 2013): ['EGY4400',
  'EGY4349',
  'EGY4302',
  'EGY4155',
  'EGY4116',
  'EGY4035',
  'EGY3963',
  'EGY3964',
  'EGY3918',
  'EGY3920',
  'EGY3773',
  'EGY3728',
  'EGY3722',
  'EGY3700',
  'EGY3657',
  'EGY3621',
  'EGY3535',
  'EGY3364',
  'EGY3337',
  'EGY3330',
  'EGY3310',
  'EGY3222',
  'EGY3246',
  'EGY3005',
  'EGY2827',
  'EGY2801',
  'EGY2740',
  'EGY2060',
  'EGY1938',
  'EGY1923',
  'EGY1922',
  'EGY1934',
  'EGY1909'],
 ('1060000010', 2014): ['EGY5750',
  'EGY5602',
  'EGY5523',
  'EGY5111',
  'EGY5098',
  'EGY5013',
  'EGY4930',
  'EGY4848',
  'EGY4675',
  'EGY4648',
  'EGY4662

In [13]:
# Match number of conflicts with basins, country and year:
bas_6_exp['battles_per_y_loop'] = 0
for idx, row in bas_6_exp.iterrows():
    print(idx/len(bas_6_exp), end='\r')
    # key = (row['HYBAS_ID'], row['month'], row['year'])
    key = (row['HYBAS_ID'], row['year'])
    if key in basin_conf_dict.keys():
        bas_6_exp.loc[idx, 'battles_per_y_loop'] = int(len(basin_conf_dict[key]))
    else:
        bas_6_exp.loc[idx, 'battles_per_y_loop'] = int(0)
        
# bas_6_exp.battles_per_m_loop.value_counts()
bas_6_exp['had_fight'] = bas_6_exp['battles_per_y_loop'].apply(lambda x: 1 if x > 0 else 0)
bas_6_exp['RGxD_hat'] = bas_6_exp['RG']*bas_6_exp['dams_in_c_per_y']

0.99998882369376924646

In [7]:
# bas_6_exp.to_file(output_path + '5rivers_prepared_data.shp')
bas_6_exp = gpd.read_file(output_path + '5rivers_prepared_data.shp')
bas_6_exp.rename(columns={'grad_more_':'grad_more_6', 'tot_riv_di':'tot_riv_dist',
                      'dams_in_c_':'dams_in_c_per_y', 'dams_per_c':'dams_per_c_99',
                      'battles_pe':'battles_per_y_loop'}, inplace=True)
# In case of weird column: dams_per_1 = dams_per_c_99

## Add dams as endogenous variable:

In [15]:
year_to_use = 1999
dams_df = gpd.read_file(dams_path + 'GRanD_dams_v1_3.shp')

# # Wherever main year is -99, so is alternative year:
dams_df = dams_df[dams_df['YEAR'] != -99]
dams_df.reset_index(inplace=True,drop=True)

uses = {'Main', 'Major'}
# uses = {'Main', 'Sec', 'Major'}

dams_df['el_ir'] = dams_df.apply(lambda row: 1 if (row.USE_ELEC in uses) or 
                                 (row.USE_IRRI in uses) else 0, axis=1)
dams_df = dams_df[dams_df['el_ir'] == 1]
dams_df.reset_index(inplace=True, drop=True)

dams_1900_1999 = dams_df[(dams_df['YEAR'] >= 1900) & (dams_df['YEAR'] < year_to_use)].copy()
dams_1900_1999.reset_index(inplace=True, drop=True)

dams_after_99 = dams_df[dams_df['YEAR'] >= year_to_use].copy()
dams_after_99.reset_index(inplace=True, drop=True)

In [16]:
# Get the amount of dams per basin by 1999:
joined_1900_1999 = gpd.sjoin(dams_1900_1999[['GRAND_ID', 'YEAR', 'MAIN_USE', 'geometry']],
                   bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']], op='within', how='left')
joined_1900_1999 = joined_1900_1999[~joined_1900_1999['HYBAS_ID'].isna()]
joined_1900_1999.reset_index(inplace=True, drop=True)

dam_bas_1900_99 = {}
for bas_id, data in joined_1900_1999.groupby('HYBAS_ID'):
    dam_ids = data['GRAND_ID'].to_list()
    dam_bas_1900_99[bas_id] = dam_ids

# Fill number of dams by 1999 for each basin with created dictionary values:
bas_6_exp['basdams_99'] = bas_6_exp['HYBAS_ID'].apply(lambda x: len(dam_bas_1900_99[x]) if 
                                                      x in dam_bas_1900_99.keys() else 0)

In [17]:
# Get the amount of dams per basin per year after 1999:
joined_after_99 = gpd.sjoin(dams_after_99[['GRAND_ID', 'YEAR', 'MAIN_USE', 'geometry']],
                   bas_6[['HYBAS_ID', 'PFAF_ID', 'geometry', 'RG']], op='within', how='left')
joined_after_99 = joined_after_99[~joined_after_99['HYBAS_ID'].isna()]
joined_after_99.reset_index(inplace=True, drop=True)

dam_bas_after_99 = {}
for bas_id, data in joined_after_99.groupby(['HYBAS_ID', 'YEAR']):
    dam_ids = data['GRAND_ID'].to_list()
    dam_bas_after_99[bas_id] = dam_ids

In [18]:
# removed_dams = {1987: [1918],
#                 2002: [2085],
#                 2003: [769],
#                 2005: [2006],
#                 2007: [820, 2844],
#                 2008: [772, 2882],
#                 2013: [6315],
#                 2016: [6285]}

# dams_df[dams_df.REM_YEAR != -99].COUNTRY.value_counts()

dam_bas_after_99
# dam_bas_1900_99

{('1060025060', 2010): [7139],
 ('1060030760', 2009): [6875],
 ('1060030980', 2005): [6872],
 ('1060030980', 2009): [6873],
 ('1060041070', 2000): [6874],
 ('1060097780', 2015): [7149],
 ('1060100170', 2005): [7150],
 ('1060121400', 2001): [7147],
 ('1060512190', 2009): [7136],
 ('1060649730', 2016): [7137],
 ('1060675820', 2010): [6942],
 ('1060709280', 2017): [6917],
 ('1060729980', 2010): [6918],
 ('1060769850', 2014): [6943],
 ('1060850580', 2015): [6941],
 ('1060890730', 2004): [6939],
 ('1060894490', 2013): [6944],
 ('1060963480', 2016): [6940],
 ('1061208950', 2011): [6933],
 ('1061331240', 2004): [4043],
 ('1061431950', 2001): [7212],
 ('1061532060', 2015): [7213],
 ('1061608140', 2008): [7138]}

In [19]:
# Create the dict of total amount of dams in country c for year t:
basins = set(bas_6_exp.HYBAS_ID)
tot_dams_per_basin_year = {}
idx = 0
for basin in basins:
    idx += 1
    print(idx/len(basins), end='\r')
    tot_dams_per_year = {}
    for year in range(1999, 2024):
        if year == 1999:
            if basin in dam_bas_1900_99.keys():
                tot_dams_per_year[year] = len(dam_bas_1900_99[basin])
            else:
                tot_dams_per_year[year] = 0
        else:
            key = (basin, year)
            if key in dam_bas_after_99.keys():
                tot_dams_per_year[year] = tot_dams_per_year[year-1] + len(dam_bas_after_99[key])
            else:
                tot_dams_per_year[year] = tot_dams_per_year[year-1]
    tot_dams_per_basin_year[basin] = tot_dams_per_year
    
# Use the created dictionary to fill in the basins dataframe:
bas_6_exp['dams_in_b_per_y'] = 0
for idx, row in bas_6_exp.iterrows():
    print(idx/len(bas_6_exp), end='\r')
    bas_6_exp.loc[idx, 'dams_in_b_per_y'] = tot_dams_per_basin_year[row['HYBAS_ID']][row['year']]

0.99998882369376924646

In [81]:
# bas_6_exp.drop(columns=['basdams_py'], inplace=True)

col_names = list(bas_6_exp.columns)
for col in col_names:
    if len(col) > 10:
        print(col)

grad_more_6
tot_riv_dist
dams_per_c_99
dams_in_c_per_y
battles_per_y_loop
dams_in_b_per_y


In [20]:
cols_to_rename = {'grad_more_6': 'grad_m_6',
                  'tot_riv_dist': 'tot_riv_di',
                  'dams_per_c_99': 'dams_pc_99',
                  'dams_in_b_per_y': 'dams_pby',
                  'dams_in_c_per_y': 'dams_pcy',
                  'battles_per_y_loop': 'btl_p_y'}

bas_6_exp.rename(columns=cols_to_rename, inplace=True)

In [88]:
bas_6_exp.columns

Index(['HYBAS_ID', 'NEXT_DOWN', 'NEXT_SINK', 'MAIN_BAS', 'DIST_SINK',
       'DIST_MAIN', 'SUB_AREA', 'UP_AREA', 'PFAF_ID', 'ENDO', 'COAST', 'ORDER',
       'SORT', 'RG', 'grad_15_3', 'grad_3_6', 'grad_m_6', 'tot_riv_di',
       'av_elev', 'shr_l_25', 'shr_25_50', 'shr_50_1k', 'shr_m_1k', 'Country',
       'year', 'dams_pc_99', 'dams_per_1', 'dams_pcy', 'btl_p_y', 'had_fight',
       'RGxD_hat', 'geometry', 'basdams_99', 'dams_pby'],
      dtype='object')

In [21]:
# Interaction between geographic controls and D_hat
bas_6_exp['z_ieygg'] = bas_6_exp['grad_15_3']*bas_6_exp['dams_pcy']
bas_6_exp['z_tkdgw'] = bas_6_exp['grad_3_6']*bas_6_exp['dams_pcy']
bas_6_exp['z_sobhm'] = bas_6_exp['grad_m_6']*bas_6_exp['dams_pcy']
bas_6_exp['z_fobki'] = bas_6_exp['tot_riv_di']*bas_6_exp['dams_pcy']
bas_6_exp['z_hyeah'] = bas_6_exp['av_elev']*bas_6_exp['dams_pcy']
bas_6_exp['z_vcjei'] = bas_6_exp['shr_l_25']*bas_6_exp['dams_pcy']
bas_6_exp['z_nlvsk'] = bas_6_exp['shr_25_50']*bas_6_exp['dams_pcy']
bas_6_exp['z_ahjvn'] = bas_6_exp['shr_50_1k']*bas_6_exp['dams_pcy']
bas_6_exp['z_zgjij'] = bas_6_exp['shr_m_1k']*bas_6_exp['dams_pcy']

columns_names_meanings = {
    'z_ieygg': 'Share of river gradient between 1,5% and 3% in a basin multiplied by total number of dams in a corresponding country in a corresponding year.',
    'z_tkdgw': 'Share of river gradient between 3% and 6% in a basin multiplied by total number of dams in a corresponding country in a corresponding year.',
    'z_sobhm': 'Share of river gradient more than 6% in a basin multiplied by total number of dams in a corresponding country in a corresponding year.',
    'z_fobki': 'Total distance of rivers in a basin multiplied by total number of dams in a corresponding country in a corresponding year.',
    'z_hyeah': 'Average elevation of a basin multiplied by total number of dams in a corresponding country in a corresponding year.',
    'z_vcjei': 'Area of elevation lower than 250m as a share of basin area, multiplied by total number of dams in a corresponding country in a corresponding year.',
    'z_nlvsk': 'Area of elevation between 250m and 500m as a share of basin area, multiplied by total number of dams in a corresponding country in a corresponding year.',
    'z_ahjvn': 'Area of elevation between 500m and 1000m as a share of basin area, multiplied by total number of dams in a corresponding country in a corresponding year.',
    'z_zgjij': 'Area of elevation between 500m and 1000m as a share of basin area, multiplied by total number of dams in a corresponding country in a corresponding year.'
    }

In [12]:
# bas_6_exp["centroid"] = bas_6_exp["geometry"].centroid

In [22]:
bas_6_exp.to_file(output_path + '5rivers_fully_prepared_data_all_conflicts.shp')