In [8]:
import pandas as pd
import seaborn as sns
import numpy as np
from numpy import nan

In [9]:
import warnings
warnings.filterwarnings("ignore")

In [10]:
# Show frames in full in cell output: no column or row truncation, no line
# wrapping, and no clipping of long cell values or long sequences.
# Every option is spelled with its fully-qualified name. pandas resolves
# abbreviated names (such as 'max_seq_item') by regex match, which stops
# working as soon as a second option happens to match the same pattern.
for option_name in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
    'display.max_seq_items',
):
    pd.set_option(option_name, None)

In [11]:
# DATA SOURCE: Information and Condition of Schools (ICOS), Pre-Disaster Mitigation module.
# Received from the WA Office of Superintendent of Public Instruction (OSPI) on 8/14/24
# via public records request.
# https://www.k12.wa.us/policy-funding/school-buildings-facilities/information-and-condition-schools-icos
df_original = pd.read_csv(filepath_or_buffer="data/df_icos_clean_8.14.24.csv")

In [12]:
# Work on an independent copy so the raw frame stays exactly as it was loaded.
# A plain `df = df_original` only binds a second name to the same object, so
# every later in-place edit of `df` would silently change `df_original` too.
df = df_original.copy()

In [13]:
# Summarise the working frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6678 entries, 0 to 6677
Data columns (total 32 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unique_Identifier                    6678 non-null   object 
 1   District                             6678 non-null   object 
 2   Site                                 6678 non-null   object 
 3   Site_Latitude                        6678 non-null   float64
 4   Site_Longitude                       6678 non-null   float64
 5   Site_Type                            6678 non-null   object 
 6   Site_EQHazardLevel                   6678 non-null   object 
 7   Site_EQRiskLevel                     2838 non-null   object 
 8   Site_Soil                            6638 non-null   object 
 9   Site_Liquefaction                    6638 non-null   object 
 10  Site_GroundMotionPercentile          6638 non-null   float64
 11  Site_HasGeotechnicalStudy     

In [14]:
# Store the construction year with pandas' nullable integer dtype ('Int64').
year_built = df['Building_YearBuilt'].astype('Int64')
df['Building_YearBuilt'] = year_built

In [15]:
# Keep only the rows whose recorded construction year is earlier than 1998.
mask = df['Building_YearBuilt'].lt(1998)
df_old = df.loc[mask]
df_old.shape

(5148, 32)

In [16]:
# Count the distinct Unique_Identifier values (schools) among the pre-1998 rows.
df_old['Unique_Identifier'].nunique()

1458

In [17]:
# Tally the Site_EQRiskLevel ratings among the pre-1998 rows; dropna=False keeps
# the rows with no rating visible as NaN.
df_old['Site_EQRiskLevel'].value_counts(dropna=False)

Site_EQRiskLevel
NaN          2837
Very High    1224
Low           496
Moderate      310
High          281
Name: count, dtype: int64

In [18]:
# Narrow the frame to the identifier, location and risk columns used below.
assessment_columns = [
    'Unique_Identifier', 'District', 'Site',
    'Site_Latitude', 'Site_Longitude',
    'Site_EQRiskLevel', 'BuildingArea_ExistingRisk',
]
df_old = df_old[assessment_columns]

In [19]:
# Collapse every recorded site-level risk rating into a single 'complete' flag;
# rows without a rating stay NaN.
# 'complete' maps to itself on purpose: Series.map turns any value missing from
# the dict into NaN, so without that key a second run of this cell would wipe
# the whole column.
site_status_map = dict.fromkeys(['Very High', 'High', 'Moderate', 'Low', 'complete'], 'complete')
# .assign builds a new frame instead of writing a column into a frame that was
# sliced out of `df`, so no chained-assignment (SettingWithCopy) write happens.
df_old = df_old.assign(Site_EQRiskLevel=df_old['Site_EQRiskLevel'].map(site_status_map))

In [20]:
# Check the recode: only 'complete' and NaN should remain in the column.
df_old['Site_EQRiskLevel'].value_counts(dropna=False)

Site_EQRiskLevel
NaN         2837
complete    2311
Name: count, dtype: int64

In [21]:
# Rows belonging to sites that carry a site-level rating ('complete').
mask = df_old['Site_EQRiskLevel'].eq('complete')
df_old_site = df_old.loc[mask]
df_old_site.shape

(2311, 7)

In [22]:
# Distinct schools among the rows flagged 'complete'.
df_old_site['Unique_Identifier'].nunique() # 621 schools with pre-1998 buildings have a site risk assessment

621

In [23]:
# Complement of the 'complete' rows: every pre-1998 row whose site has no
# site-level rating (NaN compares unequal to 'complete', so it lands here).
mask = df_old['Site_EQRiskLevel'].eq('complete')
df_old_area = df_old.loc[~mask]
df_old_area.shape

(2837, 7)

In [24]:
# Distinct schools among the rows without a site-level rating.
df_old_area['Unique_Identifier'].nunique() # 837 schools with pre-1998 buildings have no site risk assessment

837

In [25]:
# Tally the BuildingArea_ExistingRisk ratings for rows without a site-level rating;
# dropna=False keeps unrated rows visible as NaN.
df_old_area['BuildingArea_ExistingRisk'].value_counts(dropna=False)

BuildingArea_ExistingRisk
NaN          2231
Low           252
Moderate      169
High          119
Very High      66
Name: count, dtype: int64

In [26]:
# Collapse every recorded building-area risk rating into a single 'partial' flag;
# rows without a rating stay NaN.
# 'partial' maps to itself on purpose: Series.map turns any value missing from
# the dict into NaN, so without that key a second run of this cell would wipe
# the whole column.
area_status_map = dict.fromkeys(['Very High', 'High', 'Moderate', 'Low', 'partial'], 'partial')
# .assign builds a new frame instead of writing a column into a frame that was
# sliced out of `df_old`, so no chained-assignment (SettingWithCopy) write happens.
df_old_area = df_old_area.assign(
    BuildingArea_ExistingRisk=df_old_area['BuildingArea_ExistingRisk'].map(area_status_map)
)

In [27]:
# Check the recode: only 'partial' and NaN should remain in the column.
df_old_area['BuildingArea_ExistingRisk'].value_counts(dropna=False)

BuildingArea_ExistingRisk
NaN        2231
partial     606
Name: count, dtype: int64

In [28]:
# Rows that carry a building-area rating even though their site has no site-level rating.
mask = df_old_area['BuildingArea_ExistingRisk'].eq('partial')
df_old_area_partial = df_old_area.loc[mask]
df_old_area_partial.shape

(606, 7)

In [29]:
# Distinct schools with at least one rated building area but no site-level rating.
df_old_area_partial['Unique_Identifier'].nunique() # 261 schools have partial assessments

261

In [30]:
# One entry per school that has a partial assessment.
df_old_area_partial_ids = df_old_area_partial.loc[:, 'Unique_Identifier'].drop_duplicates(keep='first')
df_old_area_partial_ids.shape

(261,)

In [31]:
# Everything in df_old_area that does not belong to one of the 261 'partial'
# schools, i.e. schools with neither a site-level nor a building-area rating.
has_partial = df_old_area['Unique_Identifier'].isin(df_old_area_partial_ids)
df_old_area_none = df_old_area.loc[~has_partial]
df_old_area_none.shape

(1888, 7)

In [32]:
# Distinct schools left once the 'complete' and 'partial' schools are removed.
df_old_area_none['Unique_Identifier'].nunique() # 576 schools have neither a partial nor a full assessment

576

In [33]:
# Of the 1,458 schools with one or more pre-1998 buildings, 621 have a site assessment and 837 do not.
# Of those 837, 261 have partial (building-area) assessments and the remaining 576 have no assessment at all.

In [34]:
# Reduce the 'complete' rows to one record per distinct combination of these columns.
site_columns = ['Unique_Identifier', 'District', 'Site', 'Site_EQRiskLevel', 'Site_Latitude', 'Site_Longitude']
df_old_site_small = df_old_site[site_columns].drop_duplicates()

In [35]:
# Sanity check: de-duplication should leave the distinct-school count unchanged.
df_old_site_small['Unique_Identifier'].nunique()

621

In [36]:
# Inspect row count, columns, non-null counts and dtypes of the de-duplicated 'complete' frame.
df_old_site_small.info()

<class 'pandas.core.frame.DataFrame'>
Index: 621 entries, 0 to 6676
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unique_Identifier  621 non-null    object 
 1   District           621 non-null    object 
 2   Site               621 non-null    object 
 3   Site_EQRiskLevel   621 non-null    object 
 4   Site_Latitude      621 non-null    float64
 5   Site_Longitude     621 non-null    float64
dtypes: float64(2), object(4)
memory usage: 34.0+ KB


In [37]:
# Inspect the 'partial' rows before de-duplication (row count, non-null counts, dtypes).
df_old_area_partial.info()

<class 'pandas.core.frame.DataFrame'>
Index: 606 entries, 12 to 6669
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unique_Identifier          606 non-null    object 
 1   District                   606 non-null    object 
 2   Site                       606 non-null    object 
 3   Site_Latitude              606 non-null    float64
 4   Site_Longitude             606 non-null    float64
 5   Site_EQRiskLevel           0 non-null      object 
 6   BuildingArea_ExistingRisk  606 non-null    object 
dtypes: float64(2), object(5)
memory usage: 37.9+ KB


In [38]:
# Reduce the 'partial' rows to one record per distinct combination of these columns.
partial_columns = [
    'Unique_Identifier', 'District', 'Site',
    'BuildingArea_ExistingRisk', 'Site_Latitude', 'Site_Longitude',
]
df_old_area_partial_small = df_old_area_partial[partial_columns].drop_duplicates()

In [39]:
# Inspect row count, columns, non-null counts and dtypes of the de-duplicated 'partial' frame.
df_old_area_partial_small.info()

<class 'pandas.core.frame.DataFrame'>
Index: 261 entries, 12 to 6667
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unique_Identifier          261 non-null    object 
 1   District                   261 non-null    object 
 2   Site                       261 non-null    object 
 3   BuildingArea_ExistingRisk  261 non-null    object 
 4   Site_Latitude              261 non-null    float64
 5   Site_Longitude             261 non-null    float64
dtypes: float64(2), object(4)
memory usage: 14.3+ KB


In [40]:
# Reduce the unassessed rows to one record per distinct combination of these columns.
none_columns = [
    'Unique_Identifier', 'District', 'Site',
    'BuildingArea_ExistingRisk', 'Site_Latitude', 'Site_Longitude',
]
df_old_area_none_small = df_old_area_none[none_columns].drop_duplicates()

In [41]:
# Inspect the de-duplicated unassessed frame; BuildingArea_ExistingRisk holds no
# non-null values here, which is what marks these schools as unassessed.
df_old_area_none_small.info()

<class 'pandas.core.frame.DataFrame'>
Index: 576 entries, 13 to 6568
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unique_Identifier          576 non-null    object 
 1   District                   576 non-null    object 
 2   Site                       576 non-null    object 
 3   BuildingArea_ExistingRisk  0 non-null      object 
 4   Site_Latitude              576 non-null    float64
 5   Site_Longitude             576 non-null    float64
dtypes: float64(2), object(4)
memory usage: 31.5+ KB


In [42]:
# Give the status column the shared name used when the three groups are stacked.
df_old_site_small = df_old_site_small.rename({'Site_EQRiskLevel': 'assessment status'}, axis='columns')

In [43]:
# Give the status column the shared name used when the three groups are stacked.
df_old_area_partial_small = df_old_area_partial_small.rename(
    {'BuildingArea_ExistingRisk': 'assessment status'}, axis='columns'
)

In [44]:
# Give the (all-NaN) status column the shared name used when the three groups are stacked.
df_old_area_none_small = df_old_area_none_small.rename(
    {'BuildingArea_ExistingRisk': 'assessment status'}, axis='columns'
)

In [45]:
# Stack the three groups (complete, partial, unassessed) into one frame, row-wise.
status_frames = [df_old_site_small, df_old_area_partial_small, df_old_area_none_small]
df_icos_assessments = pd.concat(status_frames, axis=0)

In [46]:
# Schools from the unassessed group arrive with NaN in the status column; label them 'none'.
df_icos_assessments = df_icos_assessments.fillna({'assessment status': 'none'})

In [47]:
# Inspect the combined frame: every column should now be fully populated.
df_icos_assessments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1458 entries, 0 to 6568
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unique_Identifier  1458 non-null   object 
 1   District           1458 non-null   object 
 2   Site               1458 non-null   object 
 3   assessment status  1458 non-null   object 
 4   Site_Latitude      1458 non-null   float64
 5   Site_Longitude     1458 non-null   float64
dtypes: float64(2), object(4)
memory usage: 79.7+ KB


In [48]:
# Number of schools in each assessment status.
df_icos_assessments['assessment status'].value_counts()

assessment status
complete    621
none        576
partial     261
Name: count, dtype: int64

In [49]:
# Share of schools in each assessment status, shown as a percentage with one decimal place.
status_share = df_icos_assessments['assessment status'].value_counts(normalize=True)
(status_share * 100).round(1).astype(str) + '%'

assessment status
complete    42.6%
none        39.5%
partial     17.9%
Name: proportion, dtype: object

In [50]:
# Keep only the columns written to the export: status plus site coordinates.
export_columns = ['assessment status', 'Site_Latitude', 'Site_Longitude']
df_icos_assessments_small = df_icos_assessments.loc[:, export_columns]

In [52]:
# Write the status/coordinate table to disk as comma-separated UTF-8, without the index column.
df_icos_assessments_small.to_csv(
    path_or_buf='data/df_icos_assessments.csv',
    index=False,
    sep=',',
    encoding='utf-8',
)

In [None]:
# The resulting map is published here: https://datawrapper.dwcdn.net/kkORl/1/