## Setup

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [63]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_latimes as lat

In [4]:
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

In [64]:
# Load the three source tables from the local ./data directory.
# wells: one row per well completion report; later cells read its wcr_no, use,
# year_work_ended, county, TownshipRange and completed_depth_updated columns.
wells = pd.read_csv('./data/well_completion.csv') 
# shortage: household water shortage reports; later cells read CreateDateYear and County.
shortage = pd.read_csv('./data/shortages.csv') 
# levels: spring water-level measurements; later cells read site, county,
# TownshipRange, year and gse_gwe.
levels = pd.read_csv('./data/spring_water_levels.csv') 

In [7]:
main_counties = ["Fresno", "Kern", "Kings", "Madera", "Merced", "San Joaquin", "Stanislaus", "Tulare"]

## Wells drilled in 2021

In [8]:
# Count completion reports (non-null wcr_no) for every (completion year, use) pair,
# then flatten to a plain three-column frame.
use_counts_by_year = wells.groupby(['year_work_ended', 'use'])['wcr_no'].count()
grouped_date_use = use_counts_by_year.reset_index()
grouped_date_use.columns = ["date", "use", "count"]

In [9]:
grouped_date_use_pivot = grouped_date_use.pivot(index='date', columns='use', values='count').reset_index()

In [11]:
grouped_date_use_pivot.tail(1)

use,date,agriculture,domestic,public
97,2021.0,619.0,671.0,12.0


## Total agriculture wells

In [482]:
grouped_date_use_pivot['agriculture'].sum()

34352.0

## Total domestic wells

In [483]:
grouped_date_use_pivot['domestic'].sum()

59456.0

## Well construction over time

In [14]:
# Drought periods, drawn as light grey bands behind the lines.
drought_years = pd.DataFrame([
    {"start": 1976, "end": 1977},
    {"start": 1987, "end": 1992},
    {"start": 2001, "end": 2002},
    {"start": 2007, "end": 2009},
    {"start": 2012, "end": 2016},
])

# Single annotation row: the year SGMA was passed.
sgma_year = pd.DataFrame([
    {"year": 2014, "title": 'SGMA passed'}
])

# Hovering near a line selects its `use`; agriculture is selected initially.
highlight = alt.selection(type='single',
                          on='mouseover',
                          fields=['use'],
                          nearest=True,
                          init={'use': 'agriculture'})

# Shared encoding for the yearly counts from 1960 onward.
# NOTE(review): the axis `values` below look like placeholders on a temporal
# axis; kept as-is, confirm the intended tick values.
base = alt.Chart(grouped_date_use[grouped_date_use['date'] >= 1960]).encode(
    x=alt.X('date:T', axis=alt.Axis(values=[.995,.020], format='%Y', title='Year')),
    y='count:Q',
    color='use:N',
    tooltip=['date', 'use', 'count']
)

# Invisible points carry the hover selection (nearest-point lookup).
points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_selection(
    highlight
).properties(
    width=600,
    height=300
)

# The highlighted series is drawn thicker than the others.
lines = base.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(3))
)

rect = alt.Chart(drought_years).mark_rect(color="#EEE").encode(
    x='start:T',
    x2='end:T',
)

# The annotation frame stores its year in the `year` column (there is no
# `date` column), so the rule must be positioned on `year`.
rules = alt.Chart(sgma_year).mark_rule(color="#000").encode(
    x='year:T',
)

rect + points + lines + rules

## Agriculture well construction by county and year

In [18]:
# Agriculture wells only: number of rows per (completion year, county),
# with the year column renamed and cast to integer.
agriculture_wells = wells.loc[wells['use'] == 'agriculture']
grouped_year_county = (
    agriculture_wells
    .groupby(by=['year_work_ended', 'county'])
    .size()
    .reset_index(name="count")
    .rename(columns={'year_work_ended': 'year'})
    .astype({'year': 'int'})
)

In [19]:
# Small multiples: yearly agriculture well counts since 1980, one panel per main county.
main_counties_since_1980 = grouped_year_county[
    (grouped_year_county['year'] >= 1980)
    & (grouped_year_county['county'].isin(main_counties))
]

alt.Chart(main_counties_since_1980).mark_line().encode(
    x='year:O',
    y='count:Q',
).properties(
    width=160,
    height=160
).facet(
    facet='county:N',
    columns=4
)

## 2021 agriculture well drilling by county

In [20]:
grouped_year_county[(grouped_year_county['county'].isin(main_counties)) & (grouped_year_county['year'] == 2021)]

Unnamed: 0,year,county,count
765,2021,Fresno,179
766,2021,Kern,60
767,2021,Kings,39
768,2021,Madera,35
769,2021,Merced,32
771,2021,San Joaquin,9
772,2021,Stanislaus,17
773,2021,Tulare,243


## Percent of 2021 agriculture drilling in Tulare County

In [21]:
# Share of all 2021 agriculture wells (every county in the data) drilled in Tulare.
in_2021 = grouped_year_county['year'] == 2021
in_tulare = grouped_year_county['county'] == 'Tulare'
drilling_2021 = grouped_year_county.loc[in_2021, 'count'].sum()
drilling_2021_tulare = grouped_year_county.loc[in_2021 & in_tulare, 'count'].sum()
drilling_2021_tulare / drilling_2021

0.3925686591276252

## Wells completed since 2014

In [22]:
# Completion reports per use for wells finished in 2014 or later.
completed_since_2014 = wells[wells['year_work_ended'] >= 2014]
grouped_use_2014 = completed_since_2014.groupby(['use'])['wcr_no'].count().reset_index()
grouped_use_2014.columns = ["use", "count"]
grouped_use_2014

Unnamed: 0,use,count
0,agriculture,7057
1,domestic,6652
2,public,279


## Wells completed since 2020

In [23]:
# Completion reports per use for wells finished in 2020 or later.
completed_since_2020 = wells[wells['year_work_ended'] >= 2020]
grouped_use_2020 = completed_since_2020.groupby(['use'])['wcr_no'].count().reset_index()
grouped_use_2020.columns = ["use", "count"]
grouped_use_2020

Unnamed: 0,use,count
0,agriculture,1134
1,domestic,1512
2,public,41


In [370]:
# Running total of wells completed per use.
grouped_date_use_pivot['Agriculture_Cum'] = grouped_date_use_pivot['agriculture'].cumsum()
grouped_date_use_pivot['Domestic_Cum'] = grouped_date_use_pivot['domestic'].cumsum()
grouped_date_use_pivot['Public_Cum'] = grouped_date_use_pivot['public'].cumsum()

# "Active" wells: sum over the trailing 50 rows (one row per year present in the
# data). Where the 50-row window is not yet full the rolling sum is NaN, so fall
# back to the cumulative total.
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on `frame[col]`: that chained in-place form mutates
# a temporary and leaves the frame unchanged under pandas Copy-on-Write.
grouped_date_use_pivot['Agriculture_Active'] = (
    grouped_date_use_pivot['agriculture'].rolling(50).sum()
    .fillna(grouped_date_use_pivot['Agriculture_Cum'])
)
grouped_date_use_pivot['Domestic_Active'] = (
    grouped_date_use_pivot['domestic'].rolling(50).sum()
    .fillna(grouped_date_use_pivot['Domestic_Cum'])
)
grouped_date_use_pivot['Public_Active'] = (
    grouped_date_use_pivot['public'].rolling(50).sum()
    .fillna(grouped_date_use_pivot['Public_Cum'])
)

In [371]:
grouped_date_use_pivot.tail()

use,date,agriculture,domestic,public,Agriculture_Cum,Domestic_Cum,Public_Cum,Agriculture_Active,Domestic_Active,Public_Active
93,2017.0,569.0,636.0,23.0,32405.0,56793.0,2354.0,25750.0,49308.0,1910.0
94,2018.0,344.0,470.0,25.0,32749.0,57263.0,2379.0,25839.0,49359.0,1921.0
95,2019.0,469.0,681.0,27.0,33218.0,57944.0,2406.0,26111.0,49660.0,1933.0
96,2020.0,515.0,841.0,29.0,33733.0,58785.0,2435.0,26387.0,50082.0,1945.0
97,2021.0,619.0,671.0,12.0,34352.0,59456.0,2447.0,26715.0,50170.0,1940.0


In [372]:
# One line chart per use showing the "active" well estimate from 2000 onward.
active_since_2000 = grouped_date_use_pivot[grouped_date_use_pivot['date'] >= 2000]
active_columns = ['Agriculture_Active', 'Domestic_Active', 'Public_Active']

alt.Chart(active_since_2000).mark_line().encode(
    x=alt.X('date:O'),
    y=alt.Y(alt.repeat("column"), type='quantitative'),
).properties(
    width=250,
    height=250
).repeat(
    column=active_columns
)

In [374]:
# `lifespanYear` was referenced here without being defined anywhere in the
# notebook, so a fresh-kernel run failed with NameError. Define it explicitly.
# NOTE(review): 50 years is assumed from the 50-row rolling window used for the
# *_Active columns and the `2021-lifespan` expression in the county cell — confirm.
lifespan = 50
lifespanYear = 2021 - lifespan

# Completion reports per use, all years (including rows with no completion year).
grouped_use = wells.groupby(['use']).agg({
    'wcr_no': ['count'],
}).reset_index()
grouped_use.columns = ["use","all wells"]

# Completion reports per use for wells completed in or after lifespanYear.
grouped_use_active = wells[wells['year_work_ended'] >= lifespanYear].groupby(['use']).agg({
    'wcr_no': ['count'],
}).reset_index()
grouped_use_active.columns = ["use","active wells"]

In [375]:
grouped_use = grouped_use.merge(grouped_use_active, on="use")

In [376]:
grouped_use['pct active'] = grouped_use['active wells'] / grouped_use['all wells']

In [377]:
grouped_use

Unnamed: 0,use,all wells,active wells,pct active
0,agriculture,36892,27006,0.732029
1,domestic,64583,50753,0.785857
2,public,2600,1957,0.752692


In [378]:
grouped_use['active wells'].sum()

79716

In [380]:
# `lifespan` was referenced here without being defined anywhere in the notebook,
# so a fresh-kernel run failed with NameError. Define it explicitly.
# NOTE(review): 50 years is assumed from the 50-row rolling window used for the
# *_Active columns — confirm.
lifespan = 50

# Completion reports per (use, county), all years.
grouped_use_county = wells.groupby(['use','county']).agg({
    'wcr_no': ['count'],
}).reset_index()
grouped_use_county.columns = ["use","county","all wells"]

# Completion reports per (use, county) for wells completed within `lifespan` years of 2021.
grouped_use_county_active = wells[wells['year_work_ended'] >= 2021-lifespan].groupby(['use','county']).agg({
    'wcr_no': ['count'],
}).reset_index()
grouped_use_county_active.columns = ["use","county","active wells"]

In [381]:
grouped_use_county = grouped_use_county.merge(grouped_use_county_active, on=["use","county"])

In [382]:
grouped_use_county['pct active'] = grouped_use_county['active wells'] / grouped_use_county['all wells']

In [383]:
grouped_use_county[grouped_use_county["use"] == "agriculture"]

Unnamed: 0,use,county,all wells,active wells,pct active
0,agriculture,Alameda,4,4,1.0
1,agriculture,Amador,17,15,0.882353
2,agriculture,Calaveras,72,61,0.847222
3,agriculture,Contra Costa,148,126,0.851351
4,agriculture,Fresno,9599,6982,0.727367
5,agriculture,Kern,3983,2467,0.619382
6,agriculture,Kings,3196,2387,0.746871
7,agriculture,Madera,2825,2225,0.787611
8,agriculture,Mariposa,32,25,0.78125
9,agriculture,Merced,3527,2645,0.749929


In [384]:
def active_by_county(county, lifespan=50):
    """Build yearly well counts with cumulative and "active" totals for one county.

    Parameters
    ----------
    county : str
        Value matched exactly against the ``county`` column of ``wells``.
    lifespan : int, default 50
        Size of the trailing window, in rows (one row per completion year
        present for the county), summed to estimate active wells.

    Returns
    -------
    pandas.DataFrame
        One row per completion year with columns ``date``, ``agriculture``,
        ``domestic``, ``public``, then ``<Use>_Cum`` (running total) and
        ``<Use>_Active`` (trailing-window total) for each use.

    Raises
    ------
    KeyError
        If the county has no wells for one of the three uses, because the
        pivot then lacks that column.
    """
    use_labels = (
        ('agriculture', 'Agriculture'),
        ('domestic', 'Domestic'),
        ('public', 'Public'),
    )

    county_wells = wells[wells['county'] == county]
    df_group = county_wells.groupby(['year_work_ended', 'use'])['wcr_no'].count().reset_index()
    df_group.columns = ['date', 'use', 'count']
    df_group = df_group.pivot(index='date', columns='use', values='count').reset_index()

    # All cumulative columns first, then all active columns, to keep the
    # established column order.
    for use, label in use_labels:
        df_group[label + '_Cum'] = df_group[use].cumsum()

    for use, label in use_labels:
        # Until the window is full the rolling sum is NaN; fall back to the
        # running total. Assign the result back instead of calling
        # fillna(inplace=True) on a column selection, which is a no-op under
        # pandas Copy-on-Write.
        trailing_total = df_group[use].rolling(lifespan).sum()
        df_group[label + '_Active'] = trailing_total.fillna(df_group[label + '_Cum'])

    return df_group

In [385]:
grouped_date_use_tulare = active_by_county('Tulare')

In [386]:
grouped_date_use_tulare.tail(10)

use,date,agriculture,domestic,public,Agriculture_Cum,Domestic_Cum,Public_Cum,Agriculture_Active,Domestic_Active,Public_Active
64,2012.0,101.0,39.0,4.0,5234.0,5355.0,417.0,4470.0,4600.0,381.0
65,2013.0,121.0,51.0,2.0,5355.0,5406.0,419.0,4527.0,4609.0,382.0
66,2014.0,290.0,124.0,11.0,5645.0,5530.0,430.0,4762.0,4666.0,392.0
67,2015.0,652.0,332.0,44.0,6297.0,5862.0,474.0,5370.0,4928.0,433.0
68,2016.0,336.0,322.0,44.0,6633.0,6184.0,518.0,5626.0,5206.0,474.0
69,2017.0,166.0,141.0,11.0,6799.0,6325.0,529.0,5762.0,5299.0,484.0
70,2018.0,90.0,108.0,4.0,6889.0,6433.0,533.0,5817.0,5369.0,486.0
71,2019.0,158.0,105.0,2.0,7047.0,6538.0,535.0,5949.0,5435.0,487.0
72,2020.0,160.0,104.0,5.0,7207.0,6642.0,540.0,6081.0,5486.0,490.0
73,2021.0,243.0,137.0,1.0,7450.0,6779.0,541.0,6262.0,5535.0,490.0


In [387]:
def agriculture_active_chart(active_frame, county, start_year=2000):
    """Line chart of Agriculture_Active by year for one county.

    Parameters
    ----------
    active_frame : pandas.DataFrame
        Output of ``active_by_county`` (needs ``date`` and ``Agriculture_Active``).
    county : str
        Used as the panel title so the concatenated charts can be told apart.
    start_year : int, default 2000
        Only rows with ``date >= start_year`` are plotted.
    """
    return alt.Chart(active_frame[active_frame['date'] >= start_year]).mark_line().encode(
        x='date:O',
        y='Agriculture_Active:Q'
    ).properties(
        title=county,
        height=200,
        width=200
    )


# grouped_date_use_tulare is built in an earlier cell; build the other counties here.
grouped_date_use_fresno = active_by_county('Fresno')
grouped_date_use_merced = active_by_county('Merced')
grouped_date_use_kings = active_by_county('Kings')
grouped_date_use_madera = active_by_county('Madera')

chart_tulare = agriculture_active_chart(grouped_date_use_tulare, 'Tulare')
chart_fresno = agriculture_active_chart(grouped_date_use_fresno, 'Fresno')
chart_merced = agriculture_active_chart(grouped_date_use_merced, 'Merced')
chart_kings = agriculture_active_chart(grouped_date_use_kings, 'Kings')
chart_madera = agriculture_active_chart(grouped_date_use_madera, 'Madera')

chart_tulare | chart_madera | chart_fresno | chart_merced | chart_kings

## Overall water table changes from given start year

In [24]:
levels.sample()

Unnamed: 0,site,MTRS,TownshipRange,county,year,gse_gwe
234633,374327N1208504W001,MDM-T06S-R10E-10,T06S R10E,Merced,1959,7.55


In [25]:
# A site can have several measurements in one spring; collapse them to a
# single mean gse_gwe per (site, county, year).
levels_group = (
    levels
    .groupby(['site', 'county', 'year'])['gse_gwe']
    .mean()
    .reset_index()
)
levels_group.columns = ['site', 'county', 'year', 'gse_gwe']

In [26]:
def get_level_change_overall(start_year):
    """Yearly mean gse_gwe from ``start_year`` on, compared with the start year.

    Reads the module-level ``levels_group`` frame. Returns one row per year
    with columns ``year``, ``level`` (mean gse_gwe), ``count`` (number of
    rows averaged), ``start_level`` (the start year's mean) and ``diff_start``
    (``start_level - level``). Raises IndexError when ``start_year`` has no rows.
    """
    # Keep only the years of interest.
    since_start = levels_group.loc[levels_group['year'] >= start_year]

    yearly = since_start.groupby('year')['gse_gwe'].agg(['mean', 'count']).reset_index()
    yearly.columns = ['year', 'level', 'count']

    # Baseline is the mean for the start year itself.
    baseline = yearly.loc[yearly['year'] == start_year, 'level'].iloc[0]
    yearly['start_level'] = baseline
    yearly['diff_start'] = yearly['start_level'] - yearly['level']
    return yearly

In [27]:
overall_level_change_1950 = get_level_change_overall(1950)
overall_level_change_1950.tail(1)

Unnamed: 0,year,level,count,start_level,diff_start
71,2021,172.283911,1398,67.813887,-104.470024


In [28]:
overall_level_change_2011 = get_level_change_overall(2011)
overall_level_change_2011.tail(1)

Unnamed: 0,year,level,count,start_level,diff_start
10,2021,172.283911,1398,128.768708,-43.515203


In [29]:
alt.Chart(overall_level_change_1950).mark_line().encode(
    x='year:O',
    y=alt.Y('diff_start:Q'),
    tooltip=['year', 'diff_start']
)

## Dry wells from 2020-2021 water level changes

In [30]:
# Mean spring gse_gwe for each township/range and year.
range_levels = (
    levels
    .groupby(['TownshipRange', 'year'])['gse_gwe']
    .mean()
    .reset_index()
)
range_levels.columns = ['TownshipRange', 'year', 'gse_gwe']

In [33]:
# create datasets for each year
range_levels_2020 = range_levels[range_levels['year'] == 2020]
range_levels_2021 = range_levels[range_levels['year'] == 2021]
range_levels_2021.sample()

Unnamed: 0,TownshipRange,year,gse_gwe
1314,T02S R05E,2021,76.44875


In [34]:
# Domestic wells only, joined to the 2020 mean level of their township/range.
# Wells whose township/range has no 2020 measurement keep NaN levels (left join);
# repeated wcr_no rows are reduced to the first occurrence.
domestic_columns = ['wcr_no', 'TownshipRange', 'completed_depth_updated', 'year_work_ended', 'county']
wells_depth = (
    wells.loc[wells['use'] == 'domestic', domestic_columns]
    .merge(range_levels_2020, on=["TownshipRange"], how="left")
    .drop_duplicates(subset="wcr_no")
)
wells_depth.sample()

Unnamed: 0,wcr_no,TownshipRange,completed_depth_updated,year_work_ended,county,year,gse_gwe
30544,WCR1984-005102,T13S R22E,124.0,1984.0,Fresno,2020.0,47.46875


In [35]:
# Label each well for 2020, allowing for 20 feet of pump placement:
#   unknown - depth or level is missing
#   dry     - completed depth minus 20 is at or above the measured level value
#   not_dry - everything else
# Comparisons against NaN are False, so unknown rows are never relabelled dry.
well_depth = wells_depth['completed_depth_updated']
level_2020 = wells_depth['gse_gwe']
missing_2020 = well_depth.isna() | level_2020.isna()
dry_2020 = (well_depth - 20) <= level_2020

wells_depth['condition'] = 'not_dry'
wells_depth.loc[missing_2020, 'condition'] = 'unknown'
wells_depth.loc[dry_2020, 'condition'] = 'dry'

In [36]:
# get all of the 'not dry' 2020 wells
not_dry_2020 = wells_depth.loc[wells_depth['condition'] == 'not_dry']
not_dry_2020.sample()

Unnamed: 0,wcr_no,TownshipRange,completed_depth_updated,year_work_ended,county,year,gse_gwe,condition
26738,WCR0175700,T10S R11E,125.0,1973.0,Merced,2020.0,14.7375,not_dry


In [490]:
# how many not dry wells are in each county?
not_dry_2020[not_dry_2020['county'].isin(main_counties)].groupby('county').size().reset_index(name='count')

Unnamed: 0,county,count
0,Fresno,12358
1,Kern,1445
2,Kings,1938
3,Madera,2346
4,Merced,5897
5,San Joaquin,8122
6,Stanislaus,7546
7,Tulare,5365


In [37]:
# Take the wells that were not dry in 2020 and attach the 2021 mean level of
# their township/range; the merge suffixes (_x = 2020, _y = 2021) are renamed
# to explicit names and the redundant year columns dropped.
dry_wells_2021 = (
    not_dry_2020
    .merge(range_levels_2021, on=["TownshipRange"], how="left")
    .drop_duplicates(subset="wcr_no")
    .rename(columns={"gse_gwe_x":"level_2020", "gse_gwe_y":"level_2021", "condition":"condition_2020"})
    .drop(columns=['year_x', 'year_y'])
)
dry_wells_2021.sample()

Unnamed: 0,wcr_no,TownshipRange,completed_depth_updated,year_work_ended,county,level_2020,condition_2020,level_2021
32981,WCR1979-002372,T13S R22E,140.0,1979.0,Fresno,47.46875,not_dry,50.327273


In [38]:
# Label each previously not-dry well for 2021 with the same rule used for 2020
# (allowing for 20 feet of pump placement): unknown when depth or level is
# missing, dry when depth minus 20 is at or above the 2021 level value.
depth_2021 = dry_wells_2021['completed_depth_updated']
level_2021 = dry_wells_2021['level_2021']
missing_2021 = depth_2021.isna() | level_2021.isna()
went_dry_2021 = (depth_2021 - 20) <= level_2021

dry_wells_2021['condition_2021'] = 'not_dry'
dry_wells_2021.loc[missing_2021, 'condition_2021'] = 'unknown'
dry_wells_2021.loc[went_dry_2021, 'condition_2021'] = 'dry'
dry_wells_2021.sample()

Unnamed: 0,wcr_no,TownshipRange,completed_depth_updated,year_work_ended,county,level_2020,condition_2020,level_2021,condition_2021
3283,WCR1998-004245,T02S R07E,195.0,1998.0,San Joaquin,36.7,not_dry,10.0,not_dry


In [39]:
dry_wells_2021_range = dry_wells_2021.groupby(['TownshipRange','condition_2021']).size().reset_index(name='count')
dry_wells_2021_county = dry_wells_2021.groupby(['county','condition_2021']).size().reset_index(name='count')

In [40]:
# Reshape to one row per township/range with a count column per 2021 condition;
# combinations that never occur become 0.
# NOTE: this overwrites the long-form frame, so the cell cannot be re-run
# without first re-running the cell that builds dry_wells_2021_range.
dry_wells_2021_range = (
    dry_wells_2021_range
    .pivot(index=['TownshipRange'], columns='condition_2021', values='count')
    .reset_index()
    .fillna(0)
)
dry_wells_2021_range.head()

condition_2021,TownshipRange,dry,not_dry,unknown
0,T01N R02E,5.0,204.0,0.0
1,T01N R03E,2.0,559.0,0.0
2,T01N R06E,6.0,140.0,0.0
3,T01N R07E,0.0,308.0,0.0
4,T01N R08E,0.0,205.0,0.0


In [41]:
dry_wells_2021_range['dry'].sum()

1010.0

## Household water shortages by year

In [42]:
# Number of shortage reports per creation year, with the year as an integer.
shortage_by_year = (
    shortage
    .groupby(by=['CreateDateYear'])
    .size()
    .reset_index(name='Count')
    .rename(columns={'CreateDateYear': 'Year'})
    .astype({'Year': 'int'})
)

In [43]:
shortage_by_year

Unnamed: 0,Year,Count
0,2012,1
1,2014,166
2,2015,297
3,2016,1334
4,2017,168
5,2018,30
6,2019,29
7,2020,29
8,2021,465


In [44]:
# Relative change in shortage reports from 2020 to 2021, as a fraction
# (multiply by 100 for a percentage).
# Look both years up by label instead of by position: the table skips years
# with no reports, so "last two rows" is not guaranteed to mean 2020 and 2021.
shortage_counts = shortage_by_year.set_index('Year')['Count']
(shortage_counts.loc[2021] - shortage_counts.loc[2020]) / shortage_counts.loc[2020]

15.03448275862069

## Shortages by year and county

In [46]:
# Shortage reports per (year, county), restricted to the main counties.
main_county_shortages = shortage[shortage['County'].isin(main_counties)]
shortage_by_year_county = (
    main_county_shortages
    .groupby(by=['CreateDateYear', 'County'])
    .size()
    .reset_index(name='Count')
    .rename(columns={'CreateDateYear': 'Year'})
    .astype({'Year': 'int'})
)

In [54]:
shortage_by_year_county[shortage_by_year_county['Year']==2021]

Unnamed: 0,Year,County,Count
42,2021,Fresno,113
43,2021,Kern,3
44,2021,Kings,15
45,2021,Madera,127
46,2021,Merced,44
47,2021,San Joaquin,19
48,2021,Stanislaus,9
49,2021,Tulare,133


In [61]:
# Share of all 2021 shortage reports (every county) that came from Tulare County.
shortages_2021 = shortage_by_year.loc[shortage_by_year['Year'] == 2021, 'Count'].iloc[0]
tulare_2021_rows = (shortage_by_year_county['Year'] == 2021) & (shortage_by_year_county['County'] == 'Tulare')
shortages_2021_Tulare = shortage_by_year_county.loc[tulare_2021_rows, 'Count'].iloc[0]
shortages_2021_Tulare / shortages_2021

0.2860215053763441