# Creating geojson file for choropleth map

To create the geojson file, a shapefile containing Federal Electorate polygon data will be merged with a DataFrame containing statistical information about the electorates.
The merged GeoDataFrame will then be exported as GeoJSON.
The GeoJSON file will contain the polygon data for mapping, as well as the statistical variable which determines the choropleth colours (% yes votes). It will also contain electorate information which will be used in map markers.

### Imports

In [1]:
import pandas as pd
import geopandas as gp

### Reading in shapefile

The shapefile was sourced from the [Australian Electoral Commission](https://aec.gov.au/Electorates/gis/gis_datadownload.htm) and contains polygon data for the Federal Electorates.

In [2]:
# Reading in the AEC Federal Electorate boundaries shapefile with geopandas.
# The path is relative to the notebook's working directory.
myshpfile = gp.read_file('national-esri-16122011/COM20111216_ELB_region.shp')

In [3]:
# Viewing the shapefile: prints the plain-text representation of its rows and columns
print(myshpfile)

         ELECT_DIV STATE  NUMCCDS  ACTUAL  PROJECTED  POPULATION  OVER_18  \
0         Lingiari    NT      335       0          0           0        0   
1          Solomon    NT      180       0          0           0        0   
2         Canberra   ACT      283       0          0           0        0   
3           Fraser   ACT      270       0          0           0        0   
4            Brand    WA      243       0          0           0        0   
..             ...   ...      ...     ...        ...         ...      ...   
145          Makin    SA      232       0          0           0        0   
146           Mayo    SA      276       0          0           0        0   
147  Port Adelaide    SA      299       0          0           0        0   
148          Sturt    SA      255       0          0           0        0   
149      Wakefield    SA      277       0          0           0        0   

        AREA_SQKM       SORTNAME  \
0    1.352034e+06       Lingiari   
1  

In [4]:
# Checking the data type of the object returned by gp.read_file
type(myshpfile)

geopandas.geodataframe.GeoDataFrame

### Reading in CSVs and merging DataFrames

In [5]:
# Reading the marriage postal results CSV into a DataFrame and previewing the first rows.
# The yes_count and total_responses columns are used below to compute the percentage of yes votes.
marriage_postal_results_df = pd.read_csv("../data/05-output_marriage_postal_results/marriage_postal_results.csv")
marriage_postal_results_df.head()

Unnamed: 0,division_id,yes_count,no_count,total_responses,response_unclear,non_responding
0,103,37736,46343,84079,247,20928
1,104,37153,47984,85137,226,24008
2,105,42943,43215,86158,244,19973
3,106,48471,40369,88840,212,16038
4,107,20406,57926,78332,220,25883


In [6]:
# Calculating percentage of yes votes: yes_count as a share of total_responses,
# scaled to the 0-100 range. The columns are used directly so that no throwaway
# single-letter variables are left behind in the kernel namespace.
percentage_yes = (
    marriage_postal_results_df["yes_count"]
    / marriage_postal_results_df["total_responses"]
) * 100

In [7]:
# Pairing each division_id with its percentage of yes votes in a new DataFrame
percentage_yes_df = marriage_postal_results_df[["division_id"]].assign(
    percentage_yes=percentage_yes
)
# Previewing the first rows
percentage_yes_df.head()

Unnamed: 0,division_id,percentage_yes
0,103,44.881599
1,104,43.639076
2,105,49.84215
3,106,54.559883
4,107,26.050656


In [8]:
# Reading the marriage postal turnout CSV into a DataFrame and previewing the first rows
marriage_postal_turnout_df = pd.read_csv("../data/06-output_marriage_postal_turnout/marriage_postal_turnout.csv")
marriage_postal_turnout_df.head()

Unnamed: 0,division_id,total_eligible,total_participants,turnout_percent
0,179,108708,88422,81.339
1,197,95253,77706,81.578533
2,198,113368,92583,81.665902
3,103,104682,83846,80.09591
4,180,105297,81124,77.043031


In [9]:
# Narrowing the turnout data to the merge key (division_id) and the turnout percentage
marriage_postal_turnout_df = marriage_postal_turnout_df.loc[
    :, ["division_id", "turnout_percent"]
]
marriage_postal_turnout_df.head()

Unnamed: 0,division_id,turnout_percent
0,179,81.339
1,197,81.578533
2,198,81.665902
3,103,80.09591
4,180,77.043031


In [10]:
# Outer-joining the yes percentages with the turnout figures on division_id
combined_df = percentage_yes_df.merge(
    marriage_postal_turnout_df,
    on="division_id",
    how="outer",
)
combined_df.head()

Unnamed: 0,division_id,percentage_yes,turnout_percent
0,103,44.881599,80.09591
1,104,43.639076,78.018658
2,105,49.84215,81.20865
3,106,54.559883,84.733731
4,107,26.050656,75.200192


In [11]:
# Reading the election results CSV into a DataFrame and previewing the first rows
election_results_df = pd.read_csv("../data/02-output_election_results/02-election_results.csv")
election_results_df.head()

Unnamed: 0,division_id,enrolment,demographic,previous_party,previous_seat_status,successful_party,seat_status
0,179,109217,Inner Metropolitan,Australian Labor Party,Marginal,Australian Labor Party,Marginal
1,197,96043,Outer Metropolitan,Liberal,Fairly Safe,Liberal,Fairly Safe
2,198,110755,Provincial,Australian Labor Party,Marginal,Australian Labor Party,Fairly Safe
3,103,104891,Inner Metropolitan,Liberal,Marginal,Liberal,Marginal
4,180,105600,Rural,Liberal,Safe,Liberal,Marginal


In [12]:
# Narrowing the election results to the merge key (division_id) and the winning party
election_results_df = election_results_df.loc[
    :, ["division_id", "successful_party"]
]
election_results_df.head()

Unnamed: 0,division_id,successful_party
0,179,Australian Labor Party
1,197,Liberal
2,198,Australian Labor Party
3,103,Liberal
4,180,Liberal


In [13]:
# Outer-joining the winning party onto the combined statistics by division_id
second_combined_df = combined_df.merge(
    election_results_df,
    on="division_id",
    how="outer",
)
second_combined_df.head()

Unnamed: 0,division_id,percentage_yes,turnout_percent,successful_party
0,103,44.881599,80.09591,Liberal
1,104,43.639076,78.018658,Australian Labor Party
2,105,49.84215,81.20865,Liberal
3,106,54.559883,84.733731,Liberal
4,107,26.050656,75.200192,Australian Labor Party


In [14]:
# Reading the electoral division CSV (division_id to electorate name lookup) and previewing the first rows
electoral_division_df = pd.read_csv("../data/01-output_electoral_division/electoral_division.csv")
electoral_division_df.head()

Unnamed: 0,division_id,electoral_division,state
0,179,Adelaide,SA
1,197,Aston,VIC
2,198,Ballarat,VIC
3,103,Banks,NSW
4,180,Barker,SA


In [15]:
# Narrowing the division lookup to the merge key (division_id) and the electorate name
electoral_division_df = electoral_division_df.loc[
    :, ["division_id", "electoral_division"]
]
electoral_division_df.head()

Unnamed: 0,division_id,electoral_division
0,179,Adelaide
1,197,Aston
2,198,Ballarat
3,103,Banks
4,180,Barker


In [16]:
# Outer-joining the electorate names onto the combined statistics by division_id
third_combined_df = second_combined_df.merge(
    electoral_division_df,
    on="division_id",
    how="outer",
)
third_combined_df.head()

Unnamed: 0,division_id,percentage_yes,turnout_percent,successful_party,electoral_division
0,103,44.881599,80.09591,Liberal,Banks
1,104,43.639076,78.018658,Australian Labor Party,Barton
2,105,49.84215,81.20865,Liberal,Bennelong
3,106,54.559883,84.733731,Liberal,Berowra
4,107,26.050656,75.200192,Australian Labor Party,Blaxland


In [17]:
# Renaming electoral_division to ELECT_DIV, the name of the electorate column in the shapefile,
# so the two tables share a merge key
final_df = third_combined_df.rename(
    {"electoral_division": "ELECT_DIV"},
    axis="columns",
)
final_df.head()

Unnamed: 0,division_id,percentage_yes,turnout_percent,successful_party,ELECT_DIV
0,103,44.881599,80.09591,Liberal,Banks
1,104,43.639076,78.018658,Australian Labor Party,Barton
2,105,49.84215,81.20865,Liberal,Bennelong
3,106,54.559883,84.733731,Liberal,Berowra
4,107,26.050656,75.200192,Australian Labor Party,Blaxland


In [18]:
# Merging final_df with myshpfile on the electorate name (ELECT_DIV).
# Calling .merge() on the GeoDataFrame itself is the form geopandas recommends:
# the result stays a GeoDataFrame (and can therefore be exported as GeoJSON)
# without relying on the argument order passed to pd.merge().
# how="inner" keeps only electorates that appear in both sources.

geojson_df = myshpfile.merge(final_df, how="inner", on="ELECT_DIV")

In [19]:
# Viewing the first rows of the merged GeoDataFrame (shapefile columns followed by the statistics)
geojson_df.head()

Unnamed: 0,ELECT_DIV,STATE,NUMCCDS,ACTUAL,PROJECTED,POPULATION,OVER_18,AREA_SQKM,SORTNAME,geometry,division_id,percentage_yes,turnout_percent,successful_party
0,Lingiari,NT,335,0,0,0,0,1352034.05,Lingiari,"MULTIPOLYGON (((138.00117 -18.62066, 138.00120...",306,54.478296,50.112256,Australian Labor Party
1,Solomon,NT,180,0,0,0,0,336.686,Solomon,"MULTIPOLYGON (((130.81592 -12.40270, 130.81747...",307,65.255654,66.766021,Australian Labor Party
2,Canberra,ACT,283,0,0,0,0,1920.48,Canberra,"POLYGON ((149.10952 -35.28433, 149.11030 -35.2...",101,74.071318,83.221551,Australian Labor Party
3,Brand,WA,243,0,0,0,0,410.53,Brand,"MULTIPOLYGON (((115.88708 -32.25541, 115.88710...",235,67.093267,76.007266,Australian Labor Party
4,Canning,WA,268,0,0,0,0,6178.04,Canning,"POLYGON ((116.65681 -32.80318, 116.65772 -32.8...",236,60.227315,78.438074,Liberal


In [20]:
# Confirming the merge result is still a GeoDataFrame, which is needed for the GeoJSON export
type(geojson_df)

geopandas.geodataframe.GeoDataFrame

In [21]:
# Checking for null values: info() lists the row count plus the non-null count and dtype of every column
geojson_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 145 entries, 0 to 144
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   ELECT_DIV         145 non-null    object  
 1   STATE             145 non-null    object  
 2   NUMCCDS           145 non-null    int64   
 3   ACTUAL            145 non-null    int64   
 4   PROJECTED         145 non-null    int64   
 5   POPULATION        145 non-null    int64   
 6   OVER_18           145 non-null    int64   
 7   AREA_SQKM         145 non-null    float64 
 8   SORTNAME          145 non-null    object  
 9   geometry          145 non-null    geometry
 10  division_id       145 non-null    int64   
 11  percentage_yes    145 non-null    float64 
 12  turnout_percent   145 non-null    float64 
 13  successful_party  145 non-null    object  
dtypes: float64(3), geometry(1), int64(6), object(4)
memory usage: 17.0+ KB


The combined dataframe contains 145 rows, rather than the 150 we require. When testing how the dataframes would merge, we discovered that the sources contain slightly different sets of electorates. This is most likely because electorate names or boundaries changed between the dates on which the two datasets were created.

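We decided to use an inner merge so that every row in the exported file has both polygon data and statistics, with no null values. The trade-off is that electorates present in only one of the sources are left out of the map.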

In [22]:
# Dropping unneeded columns.
# errors="ignore" keeps this cell safe to re-run: geojson_df is reassigned here,
# so on a second run the columns are already gone and drop() would otherwise
# raise a KeyError.
geojson_df = geojson_df.drop(
    columns=['ACTUAL', 'PROJECTED', 'POPULATION', 'OVER_18'],
    errors="ignore",
)
geojson_df.head()

Unnamed: 0,ELECT_DIV,STATE,NUMCCDS,AREA_SQKM,SORTNAME,geometry,division_id,percentage_yes,turnout_percent,successful_party
0,Lingiari,NT,335,1352034.05,Lingiari,"MULTIPOLYGON (((138.00117 -18.62066, 138.00120...",306,54.478296,50.112256,Australian Labor Party
1,Solomon,NT,180,336.686,Solomon,"MULTIPOLYGON (((130.81592 -12.40270, 130.81747...",307,65.255654,66.766021,Australian Labor Party
2,Canberra,ACT,283,1920.48,Canberra,"POLYGON ((149.10952 -35.28433, 149.11030 -35.2...",101,74.071318,83.221551,Australian Labor Party
3,Brand,WA,243,410.53,Brand,"MULTIPOLYGON (((115.88708 -32.25541, 115.88710...",235,67.093267,76.007266,Australian Labor Party
4,Canning,WA,268,6178.04,Canning,"POLYGON ((116.65681 -32.80318, 116.65772 -32.8...",236,60.227315,78.438074,Liberal


In [23]:
# Checking the datatype of each remaining column
geojson_df.dtypes

ELECT_DIV             object
STATE                 object
NUMCCDS                int64
AREA_SQKM            float64
SORTNAME              object
geometry            geometry
division_id            int64
percentage_yes       float64
turnout_percent      float64
successful_party      object
dtype: object

In [24]:
# Converting division_id datatype from int64 to object.
# NOTE(review): presumably so the ID is handled as a label rather than a number in the exported file - TODO confirm
geojson_df['division_id'] = geojson_df['division_id'].astype(object)

In [25]:
# Viewing the first rows of the GeoDataFrame after the column drop and dtype conversion
geojson_df.head()

Unnamed: 0,ELECT_DIV,STATE,NUMCCDS,AREA_SQKM,SORTNAME,geometry,division_id,percentage_yes,turnout_percent,successful_party
0,Lingiari,NT,335,1352034.05,Lingiari,"MULTIPOLYGON (((138.00117 -18.62066, 138.00120...",306,54.478296,50.112256,Australian Labor Party
1,Solomon,NT,180,336.686,Solomon,"MULTIPOLYGON (((130.81592 -12.40270, 130.81747...",307,65.255654,66.766021,Australian Labor Party
2,Canberra,ACT,283,1920.48,Canberra,"POLYGON ((149.10952 -35.28433, 149.11030 -35.2...",101,74.071318,83.221551,Australian Labor Party
3,Brand,WA,243,410.53,Brand,"MULTIPOLYGON (((115.88708 -32.25541, 115.88710...",235,67.093267,76.007266,Australian Labor Party
4,Canning,WA,268,6178.04,Canning,"POLYGON ((116.65681 -32.80318, 116.65772 -32.8...",236,60.227315,78.438074,Liberal


In [26]:
# Exporting the GeoDataFrame as a GeoJSON file.
# The file is written to 'ourJson.geojson', a path relative to the working directory.
geojson_df.to_file('ourJson.geojson', driver='GeoJSON')