# Data Processing

In [None]:
import pandas as pd
import geopandas as gpd

## Load data

In [None]:
# Folder holding the NZ landslide inventory. Deliberately no trailing slash:
# every path built from it joins with an explicit '/', so a trailing slash
# here would produce 'data/NZ_landslide_inventory//...'.
directory = 'data/NZ_landslide_inventory'
geo_filepath = f'{directory}/inventory.gpkg'

# Read the 'points', 'lines' and 'polygons' layers of the GeoPackage into
# separate GeoDataFrames.
gdf_points = gpd.read_file(geo_filepath, layer='points')
gdf_lines = gpd.read_file(geo_filepath, layer='lines')
gdf_polygons = gpd.read_file(geo_filepath, layer='polygons')

In [None]:
# directory = 'data/NZ_landslide_inventory/inventory_'

# gdf_points = gpd.read_file(f'{directory}points.geojson')
# gdf_lines = gpd.read_file(f'{directory}lines.geojson')
# gdf_polygons = gpd.read_file(f'{directory}polygons.geojson')

## Check data

In [None]:
# Print column names, dtypes and non-null counts for the points layer.
gdf_points.info()

In [None]:
# Print column names, dtypes and non-null counts for the lines layer.
gdf_lines.info()

In [None]:
# Print column names, dtypes and non-null counts for the polygons layer.
gdf_polygons.info()

## Columns Selection

In [None]:
# NOTE(review): no earlier cell assigns `gdf`, so selecting from `gdf` itself
# fails with NameError on a top-to-bottom run. The points layer is assumed to
# be the intended source — confirm, and swap in gdf_lines / gdf_polygons (or a
# concatenation of the layers) if that assumption is wrong.
selected_columns = [
    'GlobalID', 'geometry', 'latitude', 'longitude', 'region',
    'dateoccurence', 'dateconfidence', 'geometrymodifiedon', 'createdon',
]

# .copy() detaches the selection from the loaded layer, so the column
# assignments and in-place drops in later cells operate on an independent
# frame instead of a flagged slice (avoids SettingWithCopyWarning).
gdf = gdf_points[selected_columns].copy()

## Feature Processing

In [None]:
# info() prints its summary itself and returns None, so wrapping it in
# display() only appended a stray "None" to the cell output.
gdf.info()

### Closest date of occurrence

In [None]:
# Show the date-related columns side by side for visual inspection.
date_columns = ['dateoccurence', 'createdon', 'dateconfidence', 'geometrymodifiedon']
gdf[date_columns]

In [None]:
# Parse the occurrence date into datetimes.
#   * errors='coerce' turns unparseable values into NaT.
#   * utc=True normalises every timestamp to UTC.
#   * tz_localize(None) then drops the timezone, leaving naive UTC datetimes.
# Only 'dateoccurence' is converted; 'dateconfidence', 'createdon' and
# 'geometrymodifiedon' are left exactly as loaded.
occurrence_utc = pd.to_datetime(gdf['dateoccurence'], errors='coerce', utc=True)
gdf['dateoccurence'] = occurrence_utc.dt.tz_localize(None)

Check whether the timezone conversion changes the date.

In [None]:
# gdf[
#     gdf['dateconfidence'].notna() &
#     (gdf['dateconfidence'] < gdf['dateoccurence'])]
# [['dateconfidence','dateoccurence']]

In [None]:
# gdf['closestdate'] = gdf[
#     ['dateoccurence', 'createdon', 'dateconfidence', 'geometrymodifiedon']
#     ].min(axis=1)

In [None]:
# gdf.drop(
#     columns=['createdon', 'dateconfidence', 'geometrymodifiedon']
#     , inplace=True)

In [None]:
# Data error: occurrence dates earlier than 2022-01-01 are treated as invalid
# and their rows are removed in place. Rows whose date is missing (NaT)
# compare False against the cutoff and are therefore kept.
before_2022 = gdf['dateoccurence'] < '2022-01-01'
gdf.drop(index=gdf.loc[before_2022].index, inplace=True)

In [None]:
# Count landslides per calendar month of occurrence. The result has one row
# per month with columns 'year_month' (monthly period) and 'counts'.
year_month = gdf['dateoccurence'].dt.to_period('M').rename('year_month')
landslide_counts = (
    year_month.groupby(year_month)
    .size()
    .reset_index(name='counts')
)

landslide_counts

In [None]:
# Show the first rows of the frame as it stands after the steps above.
gdf.head()

### Areas

In [None]:
# Number of records per value of the 'region' column.
gdf['region'].value_counts()

### Export cleaned data

In [None]:
# Split the records on the 'region' column and write each part to its own
# GeoPackage, both under a layer named 'landslides'. Rows whose region is
# anything other than 'Auckland Region' go to the second file.
is_auckland = gdf['region'] == 'Auckland Region'

auckland_path = f'{directory}/cleaned_auckland.gpkg'
gdf[is_auckland].to_file(auckland_path, driver='GPKG', layer='landslides')

non_auckland_path = f'{directory}/cleaned_non_auckland.gpkg'
gdf[~is_auckland].to_file(non_auckland_path, driver='GPKG', layer='landslides')