# Data Processing

In [1]:
import pandas as pd
import geopandas as gpd

# Load data

In [2]:
directory = 'data/NZ_landslide_inventory/'
geo_filepath = f'{directory}/raw_data.gpkg'
gdf = gpd.read_file(geo_filepath, layer='landslides')
gdf.head()

Unnamed: 0,OBJECTID,landslide_point_type,landslide_point_description,Landslide_GUID_point,name,activitytype,areamaximum,description,isvalidlandslide,sourcedescription,...,widthrupture,xcoordinate,ycoordinate,dateoccurence,recentdateoccurrence,modifiedon,originaldatabaseid,primarymovementtypesubclass,GlobalID,geometry
0,1,Centre of Evacuation,,714143f7-4ae2-ed11-8847-00224818a32c,South-eastern end of Blockhouse Bay Beach Reserve,,,Slip debris blocks public path and extends ove...,Yes,Auckland Council,...,19.0,1751568.999,5911395.1893,2023-02-13 23:00:00,,2024-12-04 20:38:18,,,cb20f8d7-abeb-4ce5-bafa-a75c01b03136,POINT (174.70188 -36.93131)
1,2,Centre of Evacuation,,67def7c6-4de2-ed11-8847-00224818ad3f,Gittos Domain A,,,Minor landslide/scouring has occurred on north...,Yes,Auckland Council,...,4.0,1751927.3675,5912169.8622,2023-01-30 23:00:00,,2024-12-04 20:38:18,,,9da581bc-ff20-454f-b6fb-721279cd347a,POINT (174.70575 -36.92427)
2,3,Centre of Evacuation,,b012b8df-46e2-ed11-8847-00224818a32c,Arch Hill Scenic Reserve,,,Landslip undermined part of concrete walkway &...,Yes,Auckland Council,...,12.0,1754943.6098,5918430.289,2023-01-31 23:00:00,,2024-12-04 20:38:18,,,2a361da4-7adf-4f4d-b94c-f3cf9ccccc46,POINT (174.73832 -36.86737)
3,4,Centre of Evacuation,,fe1e4a15-b7e3-ed11-8847-00224818ad3f,"Rotary Reserve, Te Atahu",,,Landslide has occurred on eastern side of exis...,Yes,Auckland Council,...,6.0,1747554.2439,5919202.5713,2023-02-13 23:00:00,,2024-12-04 20:38:18,,,4470b30b-6c80-46be-b0f7-e4ec0a5d72fd,POINT (174.6553 -36.86159)
4,5,Centre of Evacuation,,043c5354-c2e3-ed11-8847-00224818a32c,Telephone Rd Reserve,,,Slip scarp head undercutting stairs + deck at ...,Yes,Auckland Council,...,5.0,1754134.7853,5923952.9585,2023-01-31 23:00:00,,2024-12-04 20:38:18,,,cfcff56a-7cc6-41f5-9e7e-8da0daa5dbde,POINT (174.72813 -36.81774)


In [3]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 146813 entries, 0 to 146812
Data columns (total 92 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   OBJECTID                     146813 non-null  int64   
 1   landslide_point_type         146813 non-null  object  
 2   landslide_point_description  23 non-null      object  
 3   Landslide_GUID_point         146813 non-null  object  
 4   name                         146806 non-null  object  
 5   activitytype                 146621 non-null  object  
 6   areamaximum                  146789 non-null  float64 
 7   description                  66 non-null      object  
 8   isvalidlandslide             146813 non-null  object  
 9   sourcedescription            146809 non-null  object  
 10  parentid_value               0 non-null       object  
 11  parentid_name                0 non-null       object  
 12  primarymovementid_value      146799 

## Columns Selection

In [4]:
gdf = gdf[
    ['GlobalID', 'geometry', 'latitude', 'longitude', 'region', 'dateoccurence', 'dateconfidence', 'geometrymodifiedon','createdon']
]

## Feature Processing

In [5]:
display(gdf.info())

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 146813 entries, 0 to 146812
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   GlobalID            146813 non-null  object  
 1   geometry            146813 non-null  geometry
 2   latitude            146812 non-null  float64 
 3   longitude           146812 non-null  float64 
 4   region              146812 non-null  object  
 5   dateoccurence       194 non-null     object  
 6   dateconfidence      50 non-null      object  
 7   geometrymodifiedon  34 non-null      object  
 8   createdon           146813 non-null  object  
dtypes: float64(2), geometry(1), object(6)
memory usage: 10.1+ MB


None

### Closest date of occurence

In [6]:
gdf[['dateoccurence', 'createdon', 'dateconfidence', 'geometrymodifiedon']]

Unnamed: 0,dateoccurence,createdon,dateconfidence,geometrymodifiedon
0,2023-02-13 23:00:00,2023-04-24 02:51:58,2023-03-16T12:00:00+13:00,4/26/2023 2:11 AM
1,2023-01-30 23:00:00,2023-04-24 03:12:02,,4/24/2023 3:16 AM
2,2023-01-31 23:00:00,2023-04-24 02:22:43,,4/25/2023 9:21 PM
3,2023-02-13 23:00:00,2023-04-25 22:18:23,,4/25/2023 10:21 PM
4,2023-01-31 23:00:00,2023-04-25 23:38:50,,4/25/2023 11:41 PM
...,...,...,...,...
146808,2023-01-27 11:00:00,2024-08-06 21:50:12,2023-01-27T21:00:00+13:00,
146809,1900-01-01 00:00:00,2023-11-03 02:57:09,,
146810,2009-09-30 00:00:00,2024-03-04 22:33:57,2023-02-06T12:00:00+13:00,
146811,2023-01-26 11:00:00,2024-11-12 00:44:16,2023-01-27T12:00:00+13:00,


In [7]:
# Convert columns to datetime
for col in [
    'dateoccurence', 'dateconfidence', 'createdon', 'geometrymodifiedon'
    ]:
    # Convert invalid values to NaT and use GMT timezone
    gdf[col] = pd.to_datetime(gdf[col], errors='coerce', utc=True)
    gdf[col] = gdf[col].dt.tz_localize(None)  #

Check if timezone make change in date

In [8]:
gdf[
    gdf['dateconfidence'].notna() &
    (gdf['dateconfidence'] < gdf['dateoccurence'])]
[['dateconfidence','dateoccurence']]

[['dateconfidence', 'dateoccurence']]

In [9]:
gdf['closestdate'] = gdf[
    ['dateoccurence', 'createdon', 'dateconfidence', 'geometrymodifiedon']
    ].min(axis=1)

In [10]:
gdf.drop(
    columns=['createdon', 'dateconfidence', 'geometrymodifiedon']
    , inplace=True)

In [11]:
gdf.drop(gdf[gdf['closestdate']<'2022-01-01'].index, inplace=True)

In [12]:
gdf.head()

Unnamed: 0,GlobalID,geometry,latitude,longitude,region,dateoccurence,closestdate
0,cb20f8d7-abeb-4ce5-bafa-a75c01b03136,POINT (174.70188 -36.93131),-36.931306,174.70188,Auckland Region,2023-02-13 23:00:00,2023-02-13 23:00:00
1,9da581bc-ff20-454f-b6fb-721279cd347a,POINT (174.70575 -36.92427),-36.924269,174.705747,Auckland Region,2023-01-30 23:00:00,2023-01-30 23:00:00
2,2a361da4-7adf-4f4d-b94c-f3cf9ccccc46,POINT (174.73832 -36.86737),-36.867368,174.738318,Auckland Region,2023-01-31 23:00:00,2023-01-31 23:00:00
3,4470b30b-6c80-46be-b0f7-e4ec0a5d72fd,POINT (174.6553 -36.86159),-36.861593,174.655298,Auckland Region,2023-02-13 23:00:00,2023-02-13 23:00:00
4,cfcff56a-7cc6-41f5-9e7e-8da0daa5dbde,POINT (174.72813 -36.81774),-36.817738,174.728127,Auckland Region,2023-01-31 23:00:00,2023-01-31 23:00:00


### Areas

In [13]:
gdf['region'].value_counts()

region
Hawke's Bay Region           104221
Gisborne Region               26895
Auckland Region               11916
Manawatu-Whanganui Region      3530
Waikato Region                  167
Northland Region                 44
Bay of Plenty Region              5
Name: count, dtype: int64

### Export Cleanned data

In [14]:
gdf[gdf['region']=='Auckland Region'].to_file(
    f'{directory}/cleaned_auckland.gpkg', driver='GPKG', layer='landslides')

gdf[gdf['region']!='Auckland Region'].to_file(
    f'{directory}/cleaned_non_auckland.gpkg', driver='GPKG', layer='landslides')

  write(
  write(
