In [7]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Anchor every path on the directory the notebook is executed from.
filepath = os.path.abspath('')
utilpath = f'{filepath}/../util/'
data_dir = f'{filepath}/../data/'

# Put the sibling `util` directory on the import path.
sys.path.append(utilpath)

# Read the `melb_data` table from its HDF5 file.
melb_data_file = f'{data_dir}/melb_data.h5'
melb_data = pd.read_hdf(melb_data_file)

# Columns inspected in this notebook; `CouncilArea` is the one examined for
# missing values in the cells below.
relevant_features = ['Distance', 'Postcode', 'Latitude', 'Longitude', 'Regionname', 'CouncilArea']

council_area = melb_data[relevant_features]
council_area.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13580 entries, 0 to 13579
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Distance     13580 non-null  float64
 1   Postcode     13580 non-null  float64
 2   Latitude     13580 non-null  float64
 3   Longitude    13580 non-null  float64
 4   Regionname   13580 non-null  object 
 5   CouncilArea  12211 non-null  object 
dtypes: float64(4), object(2)
memory usage: 742.7+ KB


In [9]:
# Summarise the boolean missing-value mask column by column: a column with
# `unique == 2` contains both missing and present entries.
na_mask = council_area.isna()
na_mask.describe()

Unnamed: 0,Distance,Postcode,Latitude,Longitude,Regionname,CouncilArea
count,13580,13580,13580,13580,13580,13580
unique,1,1,1,1,1,2
top,False,False,False,False,False,False
freq,13580,13580,13580,13580,13580,12211


In [20]:
# Print the sorted distinct values of every column together with their count.
for column_name, column in council_area.items():
    distinct = column.sort_values().unique()
    print(f'''
        feature: {column_name}          len: {len(distinct)}

        {distinct}
    ''')


        feature: Distance          len: 202

        [ 0.   0.7  1.2  1.3  1.5  1.6  1.8  1.9  2.   2.1  2.3  2.4  2.5  2.6
  2.7  2.8  3.   3.1  3.2  3.3  3.4  3.5  3.6  3.7  3.8  4.   4.1  4.2
  4.3  4.4  4.5  4.6  5.   5.1  5.2  5.3  5.4  5.5  5.6  5.7  5.8  5.9
  6.1  6.2  6.3  6.4  6.5  6.6  6.7  6.8  6.9  7.   7.2  7.3  7.4  7.5
  7.7  7.8  7.9  8.   8.1  8.2  8.4  8.5  8.6  8.7  8.8  8.9  9.   9.1
  9.2  9.3  9.4  9.5  9.7  9.8  9.9 10.1 10.2 10.3 10.4 10.5 10.6 10.7
 10.8 10.9 11.  11.1 11.2 11.4 11.5 11.7 11.8 12.  12.1 12.2 12.3 12.4
 12.5 12.6 12.7 12.8 12.9 13.  13.1 13.3 13.4 13.5 13.6 13.7 13.8 13.9
 14.  14.2 14.3 14.5 14.6 14.7 14.8 14.9 15.  15.2 15.3 15.4 15.5 16.
 16.1 16.2 16.3 16.5 16.6 16.7 17.2 17.3 17.4 17.5 17.6 17.9 18.  18.4
 18.8 19.6 19.9 20.  20.4 20.5 20.6 20.8 21.1 21.3 21.5 21.8 22.2 22.7
 23.  23.2 23.3 23.5 23.8 24.7 24.8 25.  25.2 25.5 25.9 26.  26.1 26.5
 27.  27.1 27.2 27.7 28.5 28.8 29.3 29.8 31.2 31.4 31.6 31.7 32.3 33.3
 34.1 34.6 34.7 34.9 35.

In [191]:
# Test the hypothesis that `Postcode` uniquely determines `CouncilArea`:
# for each postcode that has at least one missing council area, list the
# distinct council areas recorded for it and the share of non-missing rows.
# A postcode listing more than one named council area refutes the hypothesis.
missing_council_area = council_area.CouncilArea.isna()
postcodes_of_nan_council_areas = council_area.loc[missing_council_area, 'Postcode'].sort_values().unique()
for postcode in postcodes_of_nan_council_areas:
    areas = council_area.loc[council_area.Postcode == postcode, 'CouncilArea']
    values = areas.sort_values().unique()
    non_na = areas.count()
    na = int(areas.isna().sum())
    fraction_non_na = non_na / (non_na + na) * 100
    print(f'percent non_na:  {fraction_non_na:4.0f} %      postcode:  {postcode:.0f}      length:  {len(values)}      {values}\n')

percent non_na:    94 %      postcode:  3003      length:  2      ['Melbourne' nan]

percent non_na:    96 %      postcode:  3011      length:  2      ['Maribyrnong' nan]

percent non_na:    93 %      postcode:  3012      length:  4      ['Brimbank' 'Hobsons Bay' 'Maribyrnong' nan]

percent non_na:    94 %      postcode:  3013      length:  2      ['Maribyrnong' nan]

percent non_na:    95 %      postcode:  3015      length:  2      ['Hobsons Bay' nan]

percent non_na:    94 %      postcode:  3016      length:  2      ['Hobsons Bay' nan]

percent non_na:    94 %      postcode:  3018      length:  2      ['Hobsons Bay' nan]

percent non_na:    96 %      postcode:  3019      length:  2      ['Maribyrnong' nan]

percent non_na:    94 %      postcode:  3020      length:  2      ['Brimbank' nan]

percent non_na:    82 %      postcode:  3021      length:  2      ['Brimbank' nan]

percent non_na:    67 %      postcode:  3022      length:  2      ['Brimbank' nan]

percent non_na:    76 %      

In [259]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, train_test_split, cross_validate

# Only rows without any missing value can be used for training and scoring.
council_area_cv = council_area.dropna()

# Predictors for `CouncilArea`; `Regionname` and `Distance` are left out.
train_features = ['Latitude', 'Longitude', 'Postcode']

# Select the predictors with a boolean column mask, so the columns of X keep
# the frame's own column order rather than the order of `train_features`.
is_train_column = council_area_cv.columns.isin(train_features)
X = council_area_cv.loc[:, is_train_column]
y = council_area_cv.CouncilArea

# Fixed seed so the forest is reproducible between runs.
model = RandomForestClassifier(random_state=7)

cross_validate(model, X, y)



{'fit_time': array([0.77447391, 0.76894999, 0.78539109, 0.76905799, 0.71593809]),
 'score_time': array([0.03256011, 0.03353119, 0.03397179, 0.03528571, 0.03846192]),
 'test_score': array([0.98690135, 0.97461097, 0.98648649, 0.98484848, 0.98321048])}

In [260]:
# Train on every row with a known council area.
model.fit(X, y)

# Rows whose council area is missing, restricted to the training columns.
is_missing = council_area.CouncilArea.isna()
council_area_impute = council_area.loc[is_missing, council_area.columns.isin(train_features)]
y_impute = model.predict(council_area_impute)

# Fill the gaps in a copy so that `council_area` itself stays untouched.
imputed_council_area = council_area.copy()
imputed_council_area.loc[is_missing, 'CouncilArea'] = y_impute
imputed_council_area

Unnamed: 0,Distance,Postcode,Latitude,Longitude,Regionname,CouncilArea
0,2.5,3067.0,-37.79960,144.99840,Northern Metropolitan,Yarra
1,2.5,3067.0,-37.80790,144.99340,Northern Metropolitan,Yarra
2,2.5,3067.0,-37.80930,144.99440,Northern Metropolitan,Yarra
3,2.5,3067.0,-37.79690,144.99690,Northern Metropolitan,Yarra
4,2.5,3067.0,-37.80720,144.99410,Northern Metropolitan,Yarra
...,...,...,...,...,...,...
13575,16.7,3150.0,-37.90562,145.16761,South-Eastern Metropolitan,Monash
13576,6.8,3016.0,-37.85927,144.87904,Western Metropolitan,Hobsons Bay
13577,6.8,3016.0,-37.85274,144.88738,Western Metropolitan,Hobsons Bay
13578,6.8,3016.0,-37.85908,144.89299,Western Metropolitan,Hobsons Bay


# Check the plausibility of the imputation

In [261]:
# Plausibility check of the imputation: for every postcode that had missing
# council areas, compare the council areas present after imputation with the
# ones that were already known for that postcode, and report any imputed
# council area that was never observed for the postcode before.
was_missing = council_area.CouncilArea.isna()
postcodes_of_nan_council_areas = council_area.loc[was_missing, 'Postcode'].sort_values().unique()
for postcode in postcodes_of_nan_council_areas:
    in_postcode = council_area.Postcode == postcode
    imputed_areas = imputed_council_area.loc[in_postcode, 'CouncilArea']
    known_areas = council_area.loc[in_postcode, 'CouncilArea']

    imputed_values = imputed_areas.sort_values().unique()
    # Missing entries are not a council area of their own, so drop them.
    values = list(known_areas.dropna().sort_values().unique())

    diff = set(imputed_values).difference(values)
    if not diff:
        continue

    # Ratio of originally known vs. originally missing rows in this postcode.
    non_na = known_areas.count()
    na = int(known_areas.isna().sum())

    print('==> WARNING: CouncilArea imputed that previously did not belong to this postcode! <==')
    print(f'postcode:  {postcode:.0f}      ratio non_na | na:            {non_na:3.0f} | {na:3.0f}    =    {100 * non_na / (non_na + na) :3.0f} %')
    print(f'length:              {len(imputed_values)}      {imputed_values}')
    print(f'length:              {len(values)}      {values}')
    # Sort so the report order does not depend on set iteration order.
    for outlier in sorted(diff, key=str):
        cnt = int((imputed_areas == outlier).sum())
        print(f'number of outliers:  {cnt}')
        print('......................')
    # Label the two columns; both source Series are named `CouncilArea`.
    compare = pd.concat([imputed_areas, known_areas], axis=1, keys=['imputed', 'original'])
    print(compare)
    print('\n=====================================================================================\n')

postcode:  3020      ratio non_na | na:            288 |  18    =     94 %
length:              2      ['Brimbank' 'Maribyrnong']
length:              1      ['Brimbank']
number of outliers:  1
......................
      CouncilArea CouncilArea
5924     Brimbank    Brimbank
5925     Brimbank    Brimbank
5926     Brimbank    Brimbank
5927     Brimbank    Brimbank
5928     Brimbank    Brimbank
...           ...         ...
13306    Brimbank         NaN
13549    Brimbank         NaN
13550    Brimbank         NaN
13551    Brimbank         NaN
13552    Brimbank         NaN

[306 rows x 2 columns]


postcode:  3047      ratio non_na | na:             43 |  11    =     80 %
length:              2      ['Hume' 'Moreland']
length:              1      ['Hume']
number of outliers:  1
......................
      CouncilArea CouncilArea
7077         Hume        Hume
7078         Hume        Hume
7079         Hume        Hume
7080         Hume        Hume
7081         Hume        Hume
7082       

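The results look trustworthy: the check above flags only two imputed values whose council area had not been seen before for their postcode. These two outliers, for `Postcode`s `3020` and `3047`, are corrected manually to `Brimbank` and `Hume`, respectively.
Finally, we write the corrected data back to disk.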

In [268]:
# Manual corrections for the outliers found by the plausibility check:
# postcode -> (council area predicted by the model, council area to use instead).
manual_corrections = {
    3020: ('Maribyrnong', 'Brimbank'),
    3047: ('Moreland', 'Hume'),
}
for postcode, (predicted_area, corrected_area) in manual_corrections.items():
    is_outlier = (imputed_council_area.Postcode == postcode) & (imputed_council_area.CouncilArea == predicted_area)
    imputed_council_area.loc[is_outlier, 'CouncilArea'] = corrected_area

# Fill the missing council areas of the full dataset from the *corrected*
# imputation. Filling from the raw model predictions (`y_impute`) would write
# the outliers corrected above back into the exported file.
was_missing = melb_data.CouncilArea.isna()
melb_data_imputed_council_area = melb_data.copy()
melb_data_imputed_council_area.loc[was_missing, 'CouncilArea'] = imputed_council_area.loc[was_missing, 'CouncilArea']

# The export must not contain any missing council area.
assert melb_data_imputed_council_area.CouncilArea.notna().all()

melb_data_imputed_council_area.to_hdf(f'{data_dir}/melb_data_imputed_council_area.h5', key='melb_data_imputed_council_area', mode='w')