In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import config

In [2]:
# Load the trips table from the path held in the project config module.
trips = pd.read_csv(config.TRIPS_1216_PRC, low_memory=False)
# Show (rows, columns) as a quick sanity check on the load.
trips.shape

(44351, 17)

In [3]:
# Preview the first rows: trip/person/household IDs, mode, purpose, times and origin/destination SA1 codes.
trips.head()

Unnamed: 0,TRIPID,PERSID,HHID,TRIPNO,TRAVDOW,LINKMODE,TRIPPURP,STARTIME,ARRTIME,TRIPTIME,TRAVTIME,WAITIME,ORIGSA1,DESTSA1,CUMDIST,HomeSA1,Mode
0,Y12H0000107P01T01,Y12H0000107P01,Y12H0000107,1,Sunday,Vehicle Driver,Social,630,650,20,20,0,20904122028,20904122002,2.6,20904122028,Car
1,Y12H0000107P01T02,Y12H0000107P01,Y12H0000107,2,Sunday,Vehicle Driver,Social,750,760,10,10,0,20904122002,20904122028,2.6,20904122028,Car
2,Y12H0000107P01T03,Y12H0000107P01,Y12H0000107,3,Sunday,Walking,Recreational,810,840,30,30,0,20904122028,20904122031,3.23,20904122028,Walk
3,Y12H0000107P01T04,Y12H0000107P01,Y12H0000107,4,Sunday,Walking,Recreational,855,885,30,30,0,20904122031,20904122028,3.23,20904122028,Walk
4,Y12H0000107P02T01,Y12H0000107P02,Y12H0000107,1,Sunday,Vehicle Passenger,Social,630,650,20,20,0,20904122028,20904122002,2.6,20904122028,Car


### Get SA1 shapefile

In [4]:
# Zipped SA1 boundary shapefile, read directly from the archive.
# NOTE(review): the file name suggests the 2016 Australia-wide SA1 release - confirm the source.
shape_file = "../data/raw/1270055001_sa1_2016_aust_shape.zip"
gdf = gpd.read_file(shape_file)

In [5]:
# Number of SA1 rows and attribute columns read from the shapefile.
gdf.shape

(57523, 15)

In [6]:
# Preview the SA1 attributes (SA1-SA4, GCC and state fields, area) and the polygon geometry.
gdf.head()

Unnamed: 0,SA1_MAIN16,SA1_7DIG16,SA2_MAIN16,SA2_5DIG16,SA2_NAME16,SA3_CODE16,SA3_NAME16,SA4_CODE16,SA4_NAME16,GCC_CODE16,GCC_NAME16,STE_CODE16,STE_NAME16,AREASQKM16,geometry
0,10102100701,1100701,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,362.8727,"POLYGON ((149.71174 -35.12318, 149.71184 -35.1..."
1,10102100702,1100702,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,229.7459,"POLYGON ((149.73421 -35.36758, 149.74475 -35.3..."
2,10102100703,1100703,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,2.391,"POLYGON ((149.77998 -35.44067, 149.77996 -35.4..."
3,10102100704,1100704,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,1.2816,"POLYGON ((149.79744 -35.44149, 149.79778 -35.4..."
4,10102100705,1100705,101021007,11007,Braidwood,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,1.1978,"POLYGON ((149.79498 -35.44902, 149.79517 -35.4..."


In [7]:
# Count SA1 rows per state/territory name to see how many belong to Victoria.
gdf.STE_NAME16.value_counts()

New South Wales                 18399
Victoria                        14073
Queensland                      11563
Western Australia                5984
South Australia                  4245
Tasmania                         1464
Australian Capital Territory     1147
Northern Territory                626
Other Territories                  22
Name: STE_NAME16, dtype: int64

In [8]:
# Keep only the SA1s located in Victoria.
# The explicit .copy() makes gdf_vic an independent frame rather than a slice of
# gdf, so modifying it later does not raise SettingWithCopyWarning.
gdf_vic = gdf.loc[gdf["STE_NAME16"] == "Victoria"].copy()
gdf_vic.shape

(14073, 15)

In [9]:
# List the available columns before choosing which ones to keep.
gdf_vic.columns

Index(['SA1_MAIN16', 'SA1_7DIG16', 'SA2_MAIN16', 'SA2_5DIG16', 'SA2_NAME16',
       'SA3_CODE16', 'SA3_NAME16', 'SA4_CODE16', 'SA4_NAME16', 'GCC_CODE16',
       'GCC_NAME16', 'STE_CODE16', 'STE_NAME16', 'AREASQKM16', 'geometry'],
      dtype='object')

In [10]:
# Remove SA1s that do not have a geometry.
# Only the geometry column is checked, because that is what the centroid step
# needs; rows are no longer dropped for a missing value in an unrelated column.
# Re-assigning the result (instead of inplace=True on a filtered slice) also
# avoids SettingWithCopyWarning.
gdf_vic = gdf_vic.dropna(subset=["geometry"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
# Count remaining missing values per column; every count should be 0 after the drop above.
gdf_vic.isna().sum()

SA1_MAIN16    0
SA1_7DIG16    0
SA2_MAIN16    0
SA2_5DIG16    0
SA2_NAME16    0
SA3_CODE16    0
SA3_NAME16    0
SA4_CODE16    0
SA4_NAME16    0
GCC_CODE16    0
GCC_NAME16    0
STE_CODE16    0
STE_NAME16    0
AREASQKM16    0
geometry      0
dtype: int64

In [12]:
# Columns carried forward: the SA1 code, the SA2-SA4 identifiers and names,
# the state name, the area and the polygon geometry.
columns = [
    "SA1_MAIN16",
    "SA2_MAIN16",
    "SA2_NAME16",
    "SA3_CODE16",
    "SA3_NAME16",
    "SA4_CODE16",
    "SA4_NAME16",
    "STE_NAME16",
    "AREASQKM16",
    "geometry",
]
gdf_vic = gdf_vic[columns]
gdf_vic.shape

(14069, 10)

In [13]:
# Compute each SA1's centroid in a projected CRS (EPSG:3111, GDA94 / Vicgrid) and
# convert the points back to the layer's own CRS so they stay comparable with
# the geometry column. Centroids computed directly on longitude/latitude degrees
# are distorted (geopandas warns about this), and the vectorised GeoSeries call
# replaces the slower per-row apply.
centroids_projected = gdf_vic.geometry.to_crs(epsg=3111).centroid
gdf_vic["centroid"] = centroids_projected.to_crs(gdf_vic.crs)

In [14]:
# Preview the trimmed Victorian SA1 table, now including the centroid column.
gdf_vic.head()

Unnamed: 0,SA1_MAIN16,SA2_MAIN16,SA2_NAME16,SA3_CODE16,SA3_NAME16,SA4_CODE16,SA4_NAME16,STE_NAME16,AREASQKM16,geometry,centroid
18399,20101100101,201011001,Alfredton,20101,Ballarat,201,Ballarat,Victoria,0.1805,"POLYGON ((143.80089 -37.55384, 143.80039 -37.5...",POINT (143.80268 -37.55164)
18400,20101100102,201011001,Alfredton,20101,Ballarat,201,Ballarat,Victoria,0.0728,"POLYGON ((143.79772 -37.55073, 143.79780 -37.5...",POINT (143.79768 -37.55221)
18401,20101100105,201011001,Alfredton,20101,Ballarat,201,Ballarat,Victoria,0.6349,"POLYGON ((143.81562 -37.55871, 143.81687 -37.5...",POINT (143.81432 -37.56436)
18402,20101100106,201011001,Alfredton,20101,Ballarat,201,Ballarat,Victoria,0.4161,"POLYGON ((143.80094 -37.55384, 143.80239 -37.5...",POINT (143.80248 -37.55616)
18403,20101100107,201011001,Alfredton,20101,Ballarat,201,Ballarat,Victoria,0.2292,"POLYGON ((143.81294 -37.55644, 143.81300 -37.5...",POINT (143.81556 -37.55658)


### Merge origin/destination (OD) SA1 centroids into the trips dataset

In [15]:
# Re-check the trips table; ORIGSA1 and DESTSA1 are the keys used for the centroid merges below.
trips.head()

Unnamed: 0,TRIPID,PERSID,HHID,TRIPNO,TRAVDOW,LINKMODE,TRIPPURP,STARTIME,ARRTIME,TRIPTIME,TRAVTIME,WAITIME,ORIGSA1,DESTSA1,CUMDIST,HomeSA1,Mode
0,Y12H0000107P01T01,Y12H0000107P01,Y12H0000107,1,Sunday,Vehicle Driver,Social,630,650,20,20,0,20904122028,20904122002,2.6,20904122028,Car
1,Y12H0000107P01T02,Y12H0000107P01,Y12H0000107,2,Sunday,Vehicle Driver,Social,750,760,10,10,0,20904122002,20904122028,2.6,20904122028,Car
2,Y12H0000107P01T03,Y12H0000107P01,Y12H0000107,3,Sunday,Walking,Recreational,810,840,30,30,0,20904122028,20904122031,3.23,20904122028,Walk
3,Y12H0000107P01T04,Y12H0000107P01,Y12H0000107,4,Sunday,Walking,Recreational,855,885,30,30,0,20904122031,20904122028,3.23,20904122028,Walk
4,Y12H0000107P02T01,Y12H0000107P02,Y12H0000107,1,Sunday,Vehicle Passenger,Social,630,650,20,20,0,20904122028,20904122002,2.6,20904122028,Car


In [16]:
# Attach the centroid of each trip's origin SA1 as ORIG_CENTROID.
# how="left" keeps every trip, including those whose ORIGSA1 has no match.
# validate="many_to_one" makes pandas raise if an SA1 code appears more than once
# in gdf_vic, which would otherwise silently duplicate trip rows.
# The helper key SA1_MAIN16 is dropped afterwards (columns= alone is sufficient;
# the extra axis=1 was redundant).
trips = (
    trips
    .merge(
        gdf_vic[["SA1_MAIN16", "centroid"]],
        how="left",
        left_on="ORIGSA1",
        right_on="SA1_MAIN16",
        validate="many_to_one",
    )
    .rename(columns={"centroid": "ORIG_CENTROID"})
    .drop(columns=["SA1_MAIN16"])
)

In [17]:
# Attach the centroid of each trip's destination SA1 as DEST_CENTROID.
# how="left" keeps every trip, including those whose DESTSA1 has no match.
# validate="many_to_one" makes pandas raise if an SA1 code appears more than once
# in gdf_vic, which would otherwise silently duplicate trip rows.
# The helper key SA1_MAIN16 is dropped afterwards (columns= alone is sufficient;
# the extra axis=1 was redundant).
trips = (
    trips
    .merge(
        gdf_vic[["SA1_MAIN16", "centroid"]],
        how="left",
        left_on="DESTSA1",
        right_on="SA1_MAIN16",
        validate="many_to_one",
    )
    .rename(columns={"centroid": "DEST_CENTROID"})
    .drop(columns=["SA1_MAIN16"])
)

In [18]:
# Confirm that ORIG_CENTROID and DEST_CENTROID were added to each trip.
trips.head()

Unnamed: 0,TRIPID,PERSID,HHID,TRIPNO,TRAVDOW,LINKMODE,TRIPPURP,STARTIME,ARRTIME,TRIPTIME,TRAVTIME,WAITIME,ORIGSA1,DESTSA1,CUMDIST,HomeSA1,Mode,ORIG_CENTROID,DEST_CENTROID
0,Y12H0000107P01T01,Y12H0000107P01,Y12H0000107,1,Sunday,Vehicle Driver,Social,630,650,20,20,0,20904122028,20904122002,2.6,20904122028,Car,POINT (145.08730 -37.65974),POINT (145.07152 -37.65148)
1,Y12H0000107P01T02,Y12H0000107P01,Y12H0000107,2,Sunday,Vehicle Driver,Social,750,760,10,10,0,20904122002,20904122028,2.6,20904122028,Car,POINT (145.07152 -37.65148),POINT (145.08730 -37.65974)
2,Y12H0000107P01T03,Y12H0000107P01,Y12H0000107,3,Sunday,Walking,Recreational,810,840,30,30,0,20904122028,20904122031,3.23,20904122028,Walk,POINT (145.08730 -37.65974),POINT (145.06223 -37.66497)
3,Y12H0000107P01T04,Y12H0000107P01,Y12H0000107,4,Sunday,Walking,Recreational,855,885,30,30,0,20904122031,20904122028,3.23,20904122028,Walk,POINT (145.06223 -37.66497),POINT (145.08730 -37.65974)
4,Y12H0000107P02T01,Y12H0000107P02,Y12H0000107,1,Sunday,Vehicle Passenger,Social,630,650,20,20,0,20904122028,20904122002,2.6,20904122028,Car,POINT (145.08730 -37.65974),POINT (145.07152 -37.65148)


In [19]:
# Number of trips recorded for each day of the week.
trips.TRAVDOW.value_counts()

Thursday     7119
Wednesday    7024
Friday       6920
Monday       6798
Tuesday      6529
Saturday     5233
Sunday       4728
Name: TRAVDOW, dtype: int64

### Save trip OD locations to CSV

In [20]:
# Write the trips table, including the ORIG_CENTROID and DEST_CENTROID columns, to the processed-data folder.
# index=False leaves the DataFrame row index out of the file.
trips.to_csv("../data/processed/trip_od_locations.csv", index=False)

In [21]:
# Final sanity check: the row count should equal the number of trips originally loaded
# (left merges must not add rows), with two extra columns for the centroids.
trips.shape

(44351, 19)