**The objective of this notebook is to collate the codes for cleaning below data:**
1. Bond yields (F)
2. Ineterest rate (F)
3. Population (K)
4. Construction (K)
5. Weekly income (A)
6. Household size (A)

(please add features to the list if there's any additional ones)

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

### 1. Bond Yields ###
`yields_join`

In [2]:
yields_data = "Files/Bond Yields/f02hist.xls"
yields = pd.read_excel(yields_data, sheet_name='Data', usecols='A:B,E', header=None, skiprows=range(0,12))
yields.columns = ['Date', '2yBonds%', '10yBonds%']

dates = pd.to_datetime(yields["Date"])

yields["Year"] = dates.dt.year
yields["Quarter"] = dates.dt.quarter

# set datetime as index
yields.set_index('Date', inplace = True)

# calculate average
yields_quarter_rates = yields.resample('QS').mean()

# convert year and quarter to int
yields_quarter_rates["Year"] = yields_quarter_rates["Year"].astype(int)
yields_quarter_rates["Quarter"] = yields_quarter_rates["Quarter"].astype(int)

# create time period column for join
yields_quarter_rates["time_period"] = yields_quarter_rates["Year"].map(str) + " Q" + yields_quarter_rates["Quarter"].map(str)

# create subset before joining
yields_join = yields_quarter_rates[["time_period", "2yBonds%", "10yBonds%"]]

yields_join.head()

Unnamed: 0_level_0,time_period,2yBonds%,10yBonds%
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-07-01,2013 Q3,2.575,3.868333
2013-10-01,2013 Q4,2.729167,4.1125
2014-01-01,2014 Q1,2.7,4.135
2014-04-01,2014 Q2,2.685,3.835833
2014-07-01,2014 Q3,2.583333,3.474167


### 2. Interest Rate ###
`quarter_rates`  
  
Please paste cleaning codes below

In [3]:
# read data in
interest = "Files/Interest Rates/f01d.xls"
interest = pd.read_excel(interest, sheet_name = "Data", usecols = "A:B", header = None, skiprows = range(0,12))
interest.columns = ['Date', 'Rate']

# get date time
dates = pd.to_datetime(interest["Date"])
interest["Year"] = dates.dt.year
interest["Quarter"] = dates.dt.quarter

# set date as index
interest.set_index('Date', inplace = True)


# calculate average
quarter_rates = interest.resample('QS').mean()

# create new column with average rate per quarter
quarter_rates["Year"] = quarter_rates["Year"].astype(int)
quarter_rates["Quarter"] = quarter_rates["Quarter"].astype(int)

# time period
quarter_rates["time_period"] = quarter_rates["Year"].map(str) + " Q" + quarter_rates["Quarter"].map(str)

# remove Year and Quarter
quarter_rates = quarter_rates[['Rate','time_period']]
quarter_rates.head()

Unnamed: 0_level_0,Rate,time_period
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-01,4.75,2011 Q1
2011-04-01,4.75,2011 Q2
2011-07-01,4.75,2011 Q3
2011-10-01,4.52381,2011 Q4
2012-01-01,4.25,2012 Q1


### 3. Population / Age bands 

#### Household count
`df_hhold_wide`  
  
Options:
 - Use 2016, 2021, and delta
 - Use 2016 and 2021 data only
 - Use delta only

In [4]:
popfile = "Files/Population/2019 NSW Population Projections ASGS 2019 LGA.xlsx"
df_hhold_wide = pd.read_excel(popfile,sheet_name='LGA Household Totals',header=6,usecols="A:C",skipfooter=3)
df_hhold_wide.columns=['LGA','hhold_count_2016','hhold_count_2021']
df_hhold_wide['LGA'] = df_hhold_wide.LGA.str.split('(').str.get(0)

df_hhold_wide['hhold_count_delta'] = df_hhold_wide.hhold_count_2021 - df_hhold_wide.hhold_count_2016

df_hhold_wide.head()

Unnamed: 0,LGA,hhold_count_2016,hhold_count_2021,hhold_count_delta
0,Albury,21940,23227,1287
1,Armidale Regional,11755,13041,1286
2,Ballina,18178,19080,902
3,Balranald,963,1015,52
4,Bathurst Regional,16105,17351,1246


#### Population movement in 5 year period
`df_move_pivot`

In [5]:
df_move = pd.read_excel(popfile,sheet_name='LGA population accounts', header=5, skipfooter=3, usecols="A:C")
df_move.columns=['LGA','pop_move','2016-2021']

df_move_melt = pd.melt(df_move,id_vars=['LGA','pop_move'], value_vars=['2016-2021'], var_name='year')
df_move_pivot = df_move_melt.pivot(index=['LGA','year'], columns='pop_move', values='value').reset_index()
df_move_pivot['LGA'] = df_move_pivot.LGA.str.split('(').str.get(0)
df_move_pivot['pop_delta'] = df_move_pivot['Population at End of Period'] - df_move_pivot['Population at Start of Period']
df_move_pivot.head()

pop_move,LGA,year,Births,Deaths,Natural change,Net Migration (all sources),Population at End of Period,Population at Start of Period,pop_delta
0,Albury,2016-2021,3390.0,2219.0,1171.0,1031.0,54374.0,52171.0,2203.0
1,Armidale Regional,2016-2021,1768.0,1266.0,502.0,1921.0,32736.0,30313.0,2423.0
2,Ballina,2016-2021,1790.0,2491.0,-701.0,1945.0,44237.0,42993.0,1244.0
3,Balranald,2016-2021,194.0,96.0,98.0,8.0,2437.0,2330.0,107.0
4,Bathurst Regional,2016-2021,2500.0,1710.0,790.0,1277.0,44310.0,42244.0,2066.0


#### Population Age
`age_bracket_delta`
- Are age brackets reasonable?
- Check dataframe style

In [6]:
popfile = "Files/Population/2019 NSW Population Projections ASGS 2019 LGA.xlsx"
df_age = pd.read_excel(popfile,sheet_name='LGA Sex Age projections',header=5,usecols="A:E",skipfooter=3)
df_age.columns=['LGA','sex','age','2016','2021']
df_age['age_delta'] = df_age['2021'] - df_age['2016']
df_age['LGA'] = df_age.LGA.str.split('(').str.get(0)

df_age_pivot = pd.pivot_table(df_age,index=['LGA','age'], values=['2016','2021','age_delta'], 
               aggfunc=({'2016':np.sum, '2021':np.sum, 'age_delta':np.sum})).reset_index()
df_age_pivot.head()

Unnamed: 0,LGA,age,2016,2021,age_delta
0,Albury,00-04,3505,3401,-104
1,Albury,05-09,3279,3510,231
2,Albury,10-14,3228,3370,142
3,Albury,15-19,3381,3306,-75
4,Albury,20-24,3744,3448,-296


In [7]:
#clusters

Child = df_age_pivot.age.unique()[:3]
Youth = df_age_pivot.age.unique()[3:5]
Adult = df_age_pivot.age.unique()[5:9]
MiddleAge = df_age_pivot.age.unique()[9:13]
Senior = df_age_pivot.age.unique()[13:]

print('Child',Child)
print('Youth',Youth)
print('Adult',Adult)
print('MiddleAge',MiddleAge)
print('Senior',Senior)

Child ['00-04' '05-09' '10-14']
Youth ['15-19' '20-24']
Adult ['25-29' '30-34' '35-39' '40-44']
MiddleAge ['45-49' '50-54' '55-59' '60-64']
Senior ['65-69' '70-74' '75-79' '80-84' '85+']


In [8]:
age_categ = [df_age_pivot['age'].isin(Child),
             df_age_pivot['age'].isin(Youth),
             df_age_pivot['age'].isin(Adult),
             df_age_pivot['age'].isin(MiddleAge),
             df_age_pivot['age'].isin(Senior)]
age_output = ['Child','Youth','Adult','MiddleAge','Senior']

df_age_pivot['age_bracket'] = np.select(age_categ,age_output)
df_age_pivot.head(20)

Unnamed: 0,LGA,age,2016,2021,age_delta,age_bracket
0,Albury,00-04,3505,3401,-104,Child
1,Albury,05-09,3279,3510,231,Child
2,Albury,10-14,3228,3370,142,Child
3,Albury,15-19,3381,3306,-75,Youth
4,Albury,20-24,3744,3448,-296,Youth
5,Albury,25-29,3485,3505,20,Adult
6,Albury,30-34,3400,3543,143,Adult
7,Albury,35-39,3143,3526,383,Adult
8,Albury,40-44,3206,3145,-61,Adult
9,Albury,45-49,3330,3284,-46,MiddleAge


In [16]:
age_bracket_delta = pd.pivot_table(df_age_pivot, index=['LGA'], columns=['age_bracket'], 
                                   values='age_delta', aggfunc=np.sum)
age_bracket_delta

age_bracket,Adult,Child,MiddleAge,Senior,Youth
LGA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Albury,485,269,302,1519,-371
Armidale Regional,1501,195,211,630,-114
Ballina,-62,-186,5,1535,-48
Balranald,14,36,-29,106,-20
Bathurst Regional,716,-73,476,1062,-112
...,...,...,...,...,...
Wingecarribee,-149,-580,86,1517,177
Wollondilly,748,553,986,1804,192
Wollongong,4643,1772,2339,4078,-483
Woollahra,-1552,820,181,478,-309


### 4. Construction  
`df_cons_clean`

In [9]:
# --read file, --rename columns
construction_file = "Files/Construction/Quarterly, Building construction prices rose, due to Homebuilder grants and government infrastructure investment.xlsx"
df_cons = pd.read_excel(construction_file,header=1,usecols="A:B", skipfooter=2)
df_cons.columns=['date','constr_index']

# --convert to datetime
df_cons['date'] = pd.to_datetime(df_cons['date'],format='%b-%y')

# --get year and quarter, --concatenate as time_period format, --drop other columns
df_cons['year'] = df_cons.date.dt.year
df_cons['quarter'] = df_cons.date.dt.quarter
df_cons['time_period'] = df_cons.year.map(str) + " Q" + df_cons.quarter.map(str)
df_cons_clean = df_cons.drop(columns=['date','year','quarter'],axis=1)
df_cons_clean.head()

Unnamed: 0,constr_index,time_period
0,100.1,2012 Q2
1,100.3,2012 Q3
2,100.2,2012 Q4
3,101.0,2013 Q1
4,101.6,2013 Q2


### 5. Weekly Income

In [10]:
# Read data in to the raw da
census_INCP = "Files/Census/POA (UR) by INCP Toal Personal Income (Weekly).csv"

incp_raw = pd.read_csv(census_INCP, skiprows=9, nrows=11142,
                       usecols=['POA (UR)', 'INCP Total Personal Income (weekly)', 'Count'])

# Rename column for easier referencing
incp_cols = {'POA (UR)':'postcode', 'INCP Total Personal Income (weekly)':'INCP_WK'}
incp_raw.rename(columns=incp_cols, inplace=True)

# Unstack
incp = incp_raw.groupby(['postcode','INCP_WK'])['Count'].sum().unstack()

# Remove the last row (grand total)
incp = incp[:-1]

incp.head(2)

INCP_WK,"$1,000-$1,249 ($52,000-$64,999)","$1,250-$1,499 ($65,000-$77,999)","$1,500-$1,749 ($78,000-$90,999)","$1,750-$1,999 ($91,000-$103,999)","$1-$149 ($1-$7,799)","$150-$299 ($7,800-$15,599)","$2,000-$2,999 ($104,000-$155,999)","$3,000 or more ($156,000 or more)","$300-$399 ($15,600-$20,799)","$400-$499 ($20,800-$25,999)","$500-$649 ($26,000-$33,799)","$650-$799 ($33,800-$41,599)","$800-$999 ($41,600-$51,999)",Negative income,Nil income,Not applicable,Not stated,Total
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
"2000, NSW",1676,1028,977,677,502,1112,1456,1796,1978,2201,1915,1599,1638,204,3297,1251,4115,27411
"2006, NSW",11,9,9,3,275,291,4,11,98,47,31,5,3,9,267,15,156,1261


In [11]:
# Remove 'NSW' in the index and cast postcode to int64
incp.reset_index(inplace=True)
incp['postcode'] = incp['postcode'].str.split(",", n=1, expand=True)
incp['postcode'] = incp['postcode'].astype('int64')
incp = incp.set_index('postcode')

ValueError: Columns must be same length as key

In [None]:
# Clean column names
income_cols= {'$1,000-$1,249 ($52,000-$64,999)' : '$1000-1249', 
            '$1,250-$1,499 ($65,000-$77,999)' : '$1250-1499',
            '$1,500-$1,749 ($78,000-$90,999)' : '$1500-1749 ', 
            '$1,750-$1,999 ($91,000-$103,999)': '$1750-1999',
            '$1-$149 ($1-$7,799)': '$1-149', 
            '$150-$299 ($7,800-$15,599)' : '$150-299',
            '$2,000-$2,999 ($104,000-$155,999)':'$2000-2999',
            '$3,000 or more ($156,000 or more)':'>=$3000', 
            '$300-$399 ($15,600-$20,799)':'$300-399',
            '$400-$499 ($20,800-$25,999)':'$400-499', 
            '$500-$649 ($26,000-$33,799)':'$500-649',
            '$650-$799 ($33,800-$41,599)':'$650-799', 
            '$800-$999 ($41,600-$51,999)':'$800-999'}
incp.rename(columns=income_cols, inplace=True)

# Combine 'not applicable' and 'not stated' into 'total_na'
incp['total_na'] = incp['Not applicable'] + incp['Not stated']

# Drop the 'Total column'
incp = incp.drop(columns=['Not applicable', 'Not stated', 'Total'], axis=1)

# Reorder columns
cols = incp.columns.tolist()
cols = ['$1-149','$150-299','$300-399','$400-499','$500-649','$650-799',
        '$800-999','$1000-1249','$1250-1499','$1500-1749 ',
        '$1750-1999','$2000-2999','>=$3000',
        'Negative income','Nil income','total_na']
incp=incp[cols]

incp.head(1)

In [None]:
# Create income buckets and save into incp_gr
incp['INCP_LOW'] = incp.iloc[:, 0:6].sum(axis=1)
incp['INCP_MID'] = incp.iloc[:, 6:10].sum(axis=1)
incp['INCP_HIGH'] = incp.iloc[:, 10:13].sum(axis=1)
incp['INCP_NEG_NIL'] = incp.iloc[:, 13:15].sum(axis=1)
incp_gr = incp[['INCP_LOW', 'INCP_MID', 'INCP_HIGH', 'INCP_NEG_NIL']]

# Reset index
incp_gr.reset_index(inplace=True)

incp_gr.head(1)

*The resulting cleanead df is <b>incp_gr</b>*

----

### 6. Household size

In [None]:
# Read data
census_cprf = "Files/Census/POA by CPRF Count of Persons in Family by STATE.xlsx"
cprf = pd.read_excel(census_cprf, sheet_name="Data Sheet 0", skiprows=9, nrows=619)

# Remove redundant rows and columns 
cprf = cprf[1:] #remove the first row
cprf = cprf.drop(columns='CPRF Count of Persons in Family') # remove the first column

# Rename columns
cprf_cols= {'Unnamed: 1' : 'postcode', 
            'Two persons in family' : 'CPRF_2',
            'Three persons in family' : 'CPRF_3', 
            'Four persons in family': 'CPRF_4',
            'Five persons in family': 'CPRF_5', 
            'Six or more persons in family' : 'CPRF_6+',
            'Not applicable':'CPRF_na',
            'Total' :'CPRF_TOTAL_FAM_NO'}
cprf.rename(columns=cprf_cols, inplace=True)

cprf.head(1)

In [None]:
# Remove 'NSW' in the index and cast postcode to int64
cprf.reset_index(inplace=True)
cprf['postcode'] = cprf['postcode'].str.split(",", n=1, expand=True)
cprf['postcode'] = cprf['postcode'].astype('int64')
cprf = cprf.set_index('postcode')
cprf = cprf.drop(columns='index', axis=1)

In [None]:
# Reset index for merging
cprf.reset_index(inplace=True)
cprf.head(1)

*The resulting cleanead df is <b>cprf</b>*

----

### 7. Additional Feature 1 
Please paste cleaning codes below

### 8. Additional Feature 2
Please paste cleaning codes below

### 9. Additional Feature 3
Please paste cleaning codes below

## USE BELOW CELL TO MERGE FEATURES INTO THE MASTER DF, IGNORE FOR NOW

--------