In [1]:
%store -r Census_2011

Census_2011.head()

Unnamed: 0_level_0,Dwelling Type,Apartments,Detached house,Semi-detached house,Terraced house,Not stated
Small Area,Period Built,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
268038003,before 1919,0,0,0,0,0
268038003,1919 - 1945,0,0,0,0,0
268038003,1946 - 1960,0,0,0,0,0
268038003,1961 - 1970,0,0,0,0,0
268038003,1971 - 1980,0,0,0,0,0


Example of method: <br>

For row indexed by building period 2001-2005 there are:
- 114 Apartments
- 1 Semi-detached house
- 1 detached house
- 1 Not stated 

Therefore, the one Not stated value is most likely to be an apartment. To capture this in an equation, the Not stated count is redistributed across the dwelling types in proportion to their relative weights, so:

    New Apartment value = Old Apartment value + round(1 x 114/117) 
    
The round function rounds the result to the nearest integer. In this case the equation returns 1, so the New Apartment value is 115.

In [2]:
## Function to redistribute not stated
# NOTE(review): the same definition appears again in the next cell, and that
# later definition is the one in effect. Keep both in sync or delete one.

def distribute_not_stated_column(old_value, not_stated_value, row_total):
    """Add a category's proportional share of the row's 'Not stated' count.

    The 'Not stated' count is redistributed in proportion to the category's
    relative weight in the row:

        new_value = old_value + round(not_stated_value * old_value / row_total)

    Example, for the row indexed by building period 2001 - 2005:
        - 114 Apartments
        - 1 Semi-detached house
        - 1 Detached house
        - 1 Not stated

        New Apartment value = 114 + round(1 x 114/117) = 115

    round() rounds to the nearest integer (exact ties go to the even integer).

    Args:
        old_value: Count for one category in the row.
        not_stated_value: The row's 'Not stated' count.
        row_total: Row total used as the denominator (117 in the example,
            i.e. including the 'Not stated' count itself).

    Returns:
        old_value plus its rounded share of not_stated_value. old_value is
        returned unchanged when there is nothing to redistribute or when
        row_total is not positive.
    """
    # NOTE(review): because the denominator includes the 'Not stated' count,
    # the redistributed shares can sum to less than not_stated_value --
    # confirm this is the intended weighting.
    if not_stated_value > 0 and row_total > 0:
        # Round the share itself. Rounding old_value / row_total first yields
        # only 0 or 1 (for 0 <= old_value <= row_total), which hands the whole
        # 'Not stated' count to a majority category or drops it entirely.
        return old_value + round(not_stated_value * old_value / row_total)

    # Nothing to redistribute (or no usable total): return old_value unchanged
    return old_value


# Element-wise version for use on whole DataFrame columns
distribute_not_stated_column_vec = np.vectorize(distribute_not_stated_column)

In [3]:
## Function to redistribute not stated
# NOTE(review): this cell repeats the definition from the previous cell. As
# the later definition it is the one in effect; consider deleting one copy.

def distribute_not_stated_column(old_value, not_stated_value, row_total):
    """Add a category's proportional share of the row's 'Not stated' count.

    The 'Not stated' count is redistributed in proportion to the category's
    relative weight in the row:

        new_value = old_value + round(not_stated_value * old_value / row_total)

    Example, for the row indexed by building period 2001 - 2005:
        - 114 Apartments
        - 1 Semi-detached house
        - 1 Detached house
        - 1 Not stated

        New Apartment value = 114 + round(1 x 114/117) = 115

    round() rounds to the nearest integer (exact ties go to the even integer).

    Args:
        old_value: Count for one category in the row.
        not_stated_value: The row's 'Not stated' count.
        row_total: Row total used as the denominator (117 in the example,
            i.e. including the 'Not stated' count itself).

    Returns:
        old_value plus its rounded share of not_stated_value. old_value is
        returned unchanged when there is nothing to redistribute or when
        row_total is not positive.
    """
    # NOTE(review): because the denominator includes the 'Not stated' count,
    # the redistributed shares can sum to less than not_stated_value --
    # confirm this is the intended weighting.
    if not_stated_value > 0 and row_total > 0:
        # Round the share itself. Rounding old_value / row_total first yields
        # only 0 or 1 (for 0 <= old_value <= row_total), which hands the whole
        # 'Not stated' count to a majority category or drops it entirely.
        return old_value + round(not_stated_value * old_value / row_total)

    # Nothing to redistribute (or no usable total): return old_value unchanged
    return old_value


# Element-wise version for use on whole DataFrame columns
distribute_not_stated_column_vec = np.vectorize(distribute_not_stated_column)

In [4]:
def redistribute_not_stated_Dwelling_Type(df):
    """Redistribute each row's 'Not stated' dwelling count across the stated types.

    For every column other than 'Not stated', a new column named
    ``<column>*`` is created by passing the column, the 'Not stated' column
    and a per-row 'Total' to ``distribute_not_stated_column_vec``. The input
    columns (including 'Not stated' and the helper 'Total') are then dropped,
    leaving only the starred columns.

    Args:
        df: DataFrame with one column per dwelling type plus a 'Not stated'
            column. Column labels must be strings.

    Returns:
        The same DataFrame object, modified in place, holding only the
        starred columns.

    Raises:
        KeyError: if ``df`` has no 'Not stated' column (for example when the
            function is applied twice to the same frame).
    """
    not_stated, total = "Not stated", "Total"

    # Row total over every input column, 'Not stated' included. Fill before
    # casting: the integer cast is the step that cannot tolerate NaN.
    df[total] = df.sum(axis=1).fillna(0).astype(np.int64)

    # Select the dwelling-type columns by name rather than by position, so the
    # result does not depend on 'Not stated' being the last input column or on
    # there being exactly four dwelling types.
    input_columns = df.columns.tolist()
    dwelling_columns = [c for c in input_columns if c not in (not_stated, total)]

    for old_column in dwelling_columns:
        df[old_column + "*"] = distribute_not_stated_column_vec(
            df[old_column].fillna(0).values,
            df[not_stated].values,
            df[total].values,
        )

    # Delete the un-redistributed inputs and the helper 'Total' column
    df.drop(columns=input_columns, inplace=True)

    return df

# Rebinds Census_2011 to the frame with the 'Not stated' dwelling column
# redistributed. Not idempotent: the 'Not stated' column is dropped by the
# call, so running this cell a second time raises KeyError.
Census_2011 = redistribute_not_stated_Dwelling_Type(Census_2011)

# First ten rows = every 'Period Built' label of the first Small Area shown
Census_2011.head(10)

Unnamed: 0_level_0,Dwelling Type,Apartments*,Detached house*,Semi-detached house*,Terraced house*
Small Area,Period Built,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
268038003,before 1919,0,0,0,0
268038003,1919 - 1945,0,0,0,0
268038003,1946 - 1960,0,0,0,0
268038003,1961 - 1970,0,0,0,0
268038003,1971 - 1980,0,0,0,0
268038003,1981 - 1990,0,0,0,0
268038003,1991 - 2000,3,0,0,0
268038003,2001 - 2005,115,0,1,1
268038003,2006 or later,0,0,0,0
268038003,Not stated,3,0,0,1


In [5]:
def redistribute_not_stated_Period_Built(df):
    """Redistribute the 'Not stated' Period Built counts across the stated periods.

    The frame is pivoted so every 'Period Built' label becomes a column, each
    stated period receives its share of the 'Not stated' column through
    ``distribute_not_stated_column_vec`` (written to a new ``<period>*``
    column), and the frame is pivoted back so 'Period Built' is an index
    level again with 'before 1919*' first.

    Args:
        df: DataFrame indexed by ('Small Area', 'Period Built') with one
            column per dwelling type. 'Period Built' labels must be strings
            and include 'Not stated'.

    Returns:
        A new DataFrame indexed by ('Small Area', 'Period Built') whose
        'Period Built' labels are the starred period names; the 'Not stated'
        period is no longer present.
    """
    not_stated, total, earliest = "Not stated", "Total", "before 1919"

    ## Initialise data for manipulation:

    # Keep a single row per ('Small Area', 'Period Built') pair
    df = (
        df.reset_index()
        .drop_duplicates(subset=["Small Area", "Period Built"])
        .set_index(["Small Area", "Period Built"])
    )

    # Set Dwelling type as index labels with .stack() and Period Built as new
    # column labels with .unstack()
    df = df.unstack(-1).stack(0)

    # Calculate the Total across each row ('Not stated' included)
    df[total] = df.sum(axis=1)

    # Pick the period columns by name instead of by position, so the result
    # does not depend on the number of periods or on their sort order.
    input_columns = df.columns.tolist()
    period_columns = [c for c in input_columns if c not in (not_stated, total)]
    if earliest in period_columns:
        # 'before 1919' sorts after the numeric labels; move it to the front
        period_columns.remove(earliest)
        period_columns.insert(0, earliest)
    redistributed_columns = [c + "*" for c in period_columns]

    ## Redistribute 'Not stated' column:

    for old_column, new_column in zip(period_columns, redistributed_columns):
        df[new_column] = distribute_not_stated_column_vec(
            df[old_column].values,
            df[not_stated].values,
            df[total].values,
        )

    ## Delete irrelevant columns (every input column plus the helper Total):

    df = df.drop(columns=input_columns)

    # Pivot back so Period Built becomes index labels again, ordered with
    # 'before 1919*' as the first label
    df = df.unstack(level=-1).stack(level=0)
    df = df.reindex(level="Period Built", labels=redistributed_columns)

    return df


# Redistribute the 'Not stated' Period Built rows across the stated periods.
# Census_2011 is rebound to the result, so this cell is meant to be run once
# per kernel session, after the Dwelling Type redistribution cell.
Census_2011 = redistribute_not_stated_Period_Built(Census_2011)

# Nine rows = the nine starred periods of the first Small Area
Census_2011.head(9)

Unnamed: 0_level_0,Dwelling Type,Apartments*,Detached house*,Semi-detached house*,Terraced house*
Small Area,Period Built,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
267001001,before 1919*,0.0,0.0,0.0,0.0
267001001,1919 - 1945*,0.0,0.0,0.0,0.0
267001001,1946 - 1960*,0.0,0.0,0.0,0.0
267001001,1961 - 1970*,0.0,0.0,0.0,0.0
267001001,1971 - 1980*,0.0,0.0,0.0,0.0
267001001,1981 - 1990*,0.0,0.0,0.0,0.0
267001001,1991 - 2000*,8.0,0.0,0.0,0.0
267001001,2001 - 2005*,47.0,0.0,1.0,0.0
267001001,2006 or later*,8.0,1.0,0.0,0.0


In [6]:
# Save the redistributed Census_2011 frame with IPython's %store so it can be
# restored later with `%store -r Census_2011`
%store Census_2011

Stored 'Census_2011' (DataFrame)
