In [12]:
import pandas as pd
import numpy as np

In [13]:
## Get access to the input data folder via a path 

from os.path import dirname
from os import getcwd

# NOTE: despite its name, `filepath` holds the current working directory
# returned by getcwd(), not the path of a file.
filepath = getcwd()

# Climb two directory levels, one step at a time.
parent_dir_of_file = dirname(filepath)
parent_of_parent_dir_of_file = dirname(parent_dir_of_file)

# The data folder path is that grandparent directory, held as a plain string.
path_to_data_folder = str(parent_of_parent_dir_of_file)

print(path_to_data_folder)
# Source: https://codereview.stackexchange.com/questions/181068/getting-the-grandparent-directory-of-the-current-code

C:\Users\RowanM\Documents\Masterplan_Code\Residential_Code


In [14]:
# Build the Census 2011 table: one row per (Small Area, Period Built) pair and
# one column per Dwelling Type, combining the sheets listed in `sheet_names`.

# Imported here so this cell is self-contained; join() picks the correct path
# separator for the current operating system.
from os.path import join

sheet_names = [
    "DCC",
    "SD",
    "DLR",
    "Fingal",
]

# For these sheets the OUTER column-header level (level 0) is stacked into the
# index; for every other sheet the innermost level is stacked.
# NOTE(review): presumably the two header rows are in the opposite order in
# these sheets -- confirm against the workbook.
outer_header_sheets = ("DLR", "Fingal")

# Frames are collected in a list and concatenated once after the loop:
# DataFrame.append copied the whole accumulated table on every iteration and
# no longer exists in pandas >= 2.0.
census_frames = []

# The context manager closes the workbook's file handle once all sheets are read.
with pd.ExcelFile(
    join(path_to_data_folder, "raw_inputs", "Census_2011.xlsx")
) as excel_file:

    for sheet_name in sheet_names:

        # NOTE! Non-numeric cells are replaced by numbers:
        #   - "<3"        -> 1
        #   - ">3"        -> 3
        #   - "."         -> 0
        #   - blank (NaN) -> 0
        temp_df = (
            pd.read_excel(
                excel_file, header=[0, 1], index_col=[0], sheet_name=sheet_name,
            )
            .replace(to_replace=["<3", ".", ">3"], value=[1, 0, 3])
            .fillna(0)
        )

        # Move one column-header level into the row index.
        if sheet_name in outer_header_sheets:
            temp_df = temp_df.stack(0)
        else:
            temp_df = temp_df.stack()

        # Reorder the innermost index labels so they run from "before 1919"
        # to "2006 or later": the last label of the level is moved to the front.
        period_built_names = list(temp_df.index.levels[-1])
        period_built_names = [period_built_names[-1]] + period_built_names[:-1]
        temp_df = temp_df.reindex(level=-1, labels=period_built_names)

        # Name the two index levels and the column axis.
        temp_df.index = temp_df.index.set_names(["Small Area", "Period Built"])
        temp_df.columns = temp_df.columns.set_names(["Dwelling Type"])

        census_frames.append(temp_df)

Census_2011 = pd.concat(census_frames)

# ------------------------------------------------------
## Remove 'Dublin City', 'South Dublin' from DataFrame AS THEY AREN'T SAs!
Census_2011 = Census_2011.drop(
    labels=["Dublin City", "South Dublin"], level="Small Area", axis=0
)
# ------------------------------------------------------

Census_2011.head(10)

Unnamed: 0_level_0,Dwelling Type,Bed-sit,Detached house,Flat/apartment in a converted house or commercial building,Flat/apartment in a purpose-built block,Not stated,Semi-detached house,Terraced house
Small Area,Period Built,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
268038003,before 1919,0.0,0,0,0,0.0,0,0
268038003,1919 - 1945,0.0,0,0,0,0.0,0,0
268038003,1946 - 1960,0.0,0,0,0,0.0,0,0
268038003,1961 - 1970,0.0,0,0,0,0.0,0,0
268038003,1971 - 1980,0.0,0,0,0,0.0,0,0
268038003,1981 - 1990,0.0,0,0,0,0.0,0,0
268038003,1991 - 2000,0.0,0,0,3,0.0,0,0
268038003,2001 - 2005,0.0,0,1,113,1.0,1,1
268038003,2006 or later,0.0,0,0,0,0.0,0,0
268038003,Not stated,0.0,0,0,3,3.0,0,1


In [15]:
# Convert all values to type np.int16

def reduce_mem_usage(df):
    """Fill missing values with 0 and cast every column of ``df`` to ``np.int16``.

    The frame is modified in place, column by column, and the same object is
    returned. Memory usage before and after the conversion is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns can all be converted to integers. Missing values
        are replaced by 0 before the cast; fractional parts are truncated by
        the integer cast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with every column stored as ``np.int16``.

    Raises
    ------
    ValueError
        If a column holds a value outside the ``np.int16`` range (which a
        plain cast would silently wrap around), or a value that cannot be
        converted to an integer. Columns processed before the failing one
        have already been converted.
    """
    int16_limits = np.iinfo(np.int16)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    print(("Memory usage of dataframe is {:.2f} MB").format(start_mem))

    for col in df.columns:
        # fillna() returns a new object that is assigned back below; a chained
        # `df[col].fillna(0, inplace=True)` may act on a temporary copy only.
        # Casting through int64 first allows the range to be checked before
        # narrowing to int16.
        as_int64 = df[col].fillna(0).astype(np.int64)

        values = as_int64.values
        if values.size and (
            values.min() < int16_limits.min or values.max() > int16_limits.max
        ):
            raise ValueError(
                "Column {!r} has values outside the int16 range "
                "[{}, {}]".format(col, int16_limits.min, int16_limits.max)
            )

        df[col] = as_int64.astype(np.int16)

    # Measured once, after all columns are converted; this also keeps the
    # value defined when the frame has no columns.
    end_mem = df.memory_usage().sum() / 1024 ** 2

    print(("Memory usage after optimization is: {:.2f}" "MB").format(end_mem))

    # Guard against a zero-byte frame to avoid dividing by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(decrease))

    return df

# Cast every column of the census table to int16. reduce_mem_usage modifies
# the frame it is given and returns that same frame, so the name is simply
# rebound to the converted table.
Census_2011 = reduce_mem_usage(Census_2011)
# Print the per-column dtype / non-null summary to confirm the conversion.
Census_2011.info()

# Source: https://towardsdatascience.com/how-to-learn-from-bigdata-files-on-low-memory-incremental-learning-d377282d38ff

Memory usage of dataframe is 2.74 MB
Memory usage after optimization is: 0.82MB
Decreased by 70.2%
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 48060 entries, (268038003, before 1919) to (267065027/267065031, Not stated)
Data columns (total 7 columns):
Bed-sit                                                       48060 non-null int16
Detached house                                                48060 non-null int16
Flat/apartment in a converted house or commercial building    48060 non-null int16
Flat/apartment in a purpose-built block                       48060 non-null int16
Not stated                                                    48060 non-null int16
Semi-detached house                                           48060 non-null int16
Terraced house                                                48060 non-null int16
dtypes: int16(7)
memory usage: 835.6+ KB


In [16]:
# Persist Census_2011 with IPython's %store magic so that another notebook or
# kernel can restore it with `%store -r Census_2011`.
%store Census_2011

Stored 'Census_2011' (DataFrame)
