# Import Libraries

In [5]:
# Import Libaries
import pandas as pd
import numpy as np

# Data Cleaning for OneMap data (Planning Area)

## Planning area id

- Data contains names of all planning area and id in Singapore for the year 1998, 2008, 2014 and 2019.
- Data extracted is exported to csv file and pickle file with the columns.
	1. planning_area_id
	2. planning_area
	3. year

In [6]:
# Read Dataset
planning_area_id_df = pd.read_csv('../assets/onemap/planning_area_id.csv')

# Dataset Info
planning_area_id_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   planning_area_id  220 non-null    int64 
 1   planning_area     220 non-null    object
 2   year              220 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 5.3+ KB


- Observed that there are no missing values,
- Next: to compare the IDs among the years to create one set of "planning_area_id" and "planning_area".
    - concatenation of data will be decided at a later stage after further analysis of data.

In [7]:
# Check for rows with the planning_area and planning_area_id
duplicate_rows = planning_area_id_df[planning_area_id_df.duplicated(['planning_area', 'planning_area_id'], keep=False)]
duplicate_rows

Unnamed: 0,planning_area_id,planning_area,year
0,114,BEDOK,1998
1,120,BUKIT TIMAH,1998
2,117,BUKIT BATOK,1998
3,118,BUKIT MERAH,1998
4,121,CENTRAL WATER CATCHMENT,1998
...,...,...,...
215,125,CLEMENTI,2019
216,162,TOA PAYOH,2019
217,155,SINGAPORE RIVER,2019
218,158,SUNGEI KADUT,2019


- Observed that there after comparing duplicates for planning_area and planning_area_id, the number of count is the same at 220.
- This implies that the planning area id across the years are the same.
- Next:
    - to drop duplicated values and to drop the year column. 


In [68]:
# Create a Dictionary to check if all the planning_area have the same id
planning_area_dict = {}

# Loop throough data dictionary
for index, row in planning_area_id_df.iterrows():
    retrieve_planning_area_id = row["planning_area_id"]   
    retrieve_planning_area = row["planning_area"]
    
    # Check if planning area already exist as a key in dictionary    
    if planning_area_dict.get(retrieve_planning_area) is not None:
        dict_planning_area_id = planning_area_dict[retrieve_planning_area]["planning_area_id"]
        if(dict_planning_area_id != retrieve_planning_area_id):
            planning_area_dict[retrieve_planning_area]["Status"] = "not ok"
        
    # Add planning area to dictionary
    else:
        planning_area_dict[retrieve_planning_area] = {}
        planning_area_dict[retrieve_planning_area]["planning_area_id"] = retrieve_planning_area_id
        planning_area_dict[retrieve_planning_area]["Status"] = "ok"

In [69]:
# View dictionary to see if the status is ok. 
planning_area_dict

{'BEDOK': {'planning_area_id': 114, 'Status': 'ok'},
 'BUKIT TIMAH': {'planning_area_id': 120, 'Status': 'ok'},
 'BUKIT BATOK': {'planning_area_id': 117, 'Status': 'ok'},
 'BUKIT MERAH': {'planning_area_id': 118, 'Status': 'ok'},
 'CENTRAL WATER CATCHMENT': {'planning_area_id': 121, 'Status': 'ok'},
 'DOWNTOWN CORE': {'planning_area_id': 126, 'Status': 'ok'},
 'CHANGI': {'planning_area_id': 122, 'Status': 'ok'},
 'CHANGI BAY': {'planning_area_id': 123, 'Status': 'ok'},
 'LIM CHU KANG': {'planning_area_id': 132, 'Status': 'ok'},
 'BOON LAY': {'planning_area_id': 116, 'Status': 'ok'},
 'WESTERN WATER CATCHMENT': {'planning_area_id': 165, 'Status': 'ok'},
 'WOODLANDS': {'planning_area_id': 166, 'Status': 'ok'},
 'MARINE PARADE': {'planning_area_id': 136, 'Status': 'ok'},
 'NEWTON': {'planning_area_id': 138, 'Status': 'ok'},
 'NORTH-EASTERN ISLANDS': {'planning_area_id': 139, 'Status': 'ok'},
 'ORCHARD': {'planning_area_id': 141, 'Status': 'ok'},
 'PASIR RIS': {'planning_area_id': 143, 'St

- Observed that the dictionary status is ok, this implies that for each planning_area key, there is only one planning_area_id assign to it. 

- Next: to export the with columns data "planning_area_id" and "planning_area" into a cleaned dataframe

In [75]:
# Create an empty dataframe
planning_area_id_cleaned_df = pd.DataFrame(columns=['planning_area', 'planning_area_id'])

# Append value in dictionary as a new dataframe.

for planning_area in planning_area_dict:
    retrieved_planning_area = planning_area
    retrieved_planning_area_id = planning_area_dict[planning_area]["planning_area_id"]
    temp_row = {'planning_area': retrieve_planning_area
                , 'planning_area_id': retrieved_planning_area_id
               }
    temp_df = pd.DataFrame([temp_row])
    planning_area_id_cleaned_df = pd.concat([planning_area_id_cleaned_df, temp_df], axis=0, ignore_index=True)
    
    
# Get info on datafra,e
print(planning_area_id_cleaned_df.info())        

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   planning_area     55 non-null     object
 1   planning_area_id  55 non-null     object
dtypes: object(2)
memory usage: 1008.0+ bytes
None


- Observed  that that planning_area and planning_area_id has been successfully retrieved. 
- Next: export the dataset.

### Export Planning area id cleaned data

In [105]:
# export to csv
planning_area_id_cleaned_df.to_csv('../assets/data_clean/planning_area_id_cleaned.csv', encoding='utf-8', index=False)

# export to pickle file
planning_area_id_cleaned_df.to_pickle('../assets/data_clean/planning_area_id_cleaned.pkl')

- Observed that the files are successfully exported.

# Planning area polygons

- Data contains planning area polygons of Singapore for the year 1998, 2008, 2014 and 2019.
- Data extracted is exported to csv file and pickle file with the columns.
	1. planning_area e.g. Bedok.
	2. longitude
	3. latitude
	4. year

In [78]:
# Read Dataset
planning_area_polygon_df = pd.read_csv('../assets/onemap/planning_area_polygons.csv')

# Dataset Info
planning_area_polygon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126824 entries, 0 to 126823
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   planning_area  126824 non-null  object 
 1   longitude      126824 non-null  float64
 2   latitude       126824 non-null  float64
 3   year           126824 non-null  int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 3.9+ MB


- Observed that there are no missing values.
- CSV file contains information for planning_area and longitude and latitude information.
- Resale flat with price dataset already has "planning_query_town" mapped to based on longitude and latitude.
- Current dataset is not required and will not be exported as a cleaned dataset.

## Planning area polygons

- API retrieve planning area based on latitude and longitude.
- Data extracted is export to csv file with the columns:
	1. month
	2. town 
	3. longitude 
	4. latitude 
	5. planning_query_town

In [79]:
# Read Dataset
planning_area_query_df = pd.read_csv('../assets/onemap/planning_area_query.csv')

# Dataset Info
planning_area_query_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903393 entries, 0 to 903392
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                903393 non-null  object 
 1   town                 903393 non-null  object 
 2   longitude            903393 non-null  float64
 3   latitude             903393 non-null  float64
 4   planning_query_town  804485 non-null  object 
dtypes: float64(2), object(3)
memory usage: 34.5+ MB


- Observed that there are missing values.
- CSV file contains information for planning_query_town, town, longitude and latitude information.
- The following file is already cleaned up and used in 3a_DataCleaning_ResaleFlat file
- Current dataset is not required and will not be exported as a cleaned dataset.

# Data Cleaning for OneMap data (Population)

## Economic Status

- API retrieves data related to economic status for given planning area name, year and gender. 
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data is exported to csv file with the columns:
	1. planning_area
	2. employed
	3. unemployed
	4. inactive
	5. year
	6. gender

In [80]:
# Read Dataset
population_economic_status_df = pd.read_csv('../assets/onemap/population_economic_status.csv')

# Dataset Info
population_economic_status_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   planning_area  354 non-null    object
 1   employed       354 non-null    int64 
 2   unemployed     354 non-null    int64 
 3   inactive       354 non-null    int64 
 4   year           354 non-null    int64 
 5   gender         354 non-null    object
dtypes: int64(4), object(2)
memory usage: 16.7+ KB


- Observed that there are no missing values
- Next: 
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [107]:
# Group data by planning area and get the mean of the values.
population_economic_status_group_df = population_economic_status_df.groupby('planning_area')[['planning_area'
                                                                   ,'employed'
                                                                   , 'unemployed'
                                                                   ,'inactive']].mean().reset_index()

# view the information of the dataset.
population_economic_status_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   planning_area  55 non-null     object 
 1   employed       55 non-null     float64
 2   unemployed     55 non-null     float64
 3   inactive       55 non-null     float64
dtypes: float64(3), object(1)
memory usage: 1.8+ KB


- Observed that the dataset values has been averaged by planning_area
- Next: Export out the dataset as cleaned Data


### Export Population economic status cleaned data

In [108]:
# export  to csv
population_economic_status_group_df.to_csv('../assets/data_clean/population_economic_status_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_economic_status_group_df.to_pickle('../assets/data_clean/population_economic_status_cleaned.pkl')

- Observed that the files are successfully exported.

## Education Status

- API retrieves data related to economic status for given planning area name and year. 
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. pre_primary 
	3. primary 
	4. secondary 
	5. post_secondary
	6. polytechnic
	7. prof_qualification_diploma
	8. university
	9. year

In [109]:
# Read Dataset
population_education_status_df = pd.read_csv('../assets/onemap/population_education_status.csv')

# Dataset Info
population_education_status_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   planning_area               177 non-null    object
 1   pre_primary                 177 non-null    int64 
 2   primary                     177 non-null    int64 
 3   secondary                   177 non-null    int64 
 4   post_secondary              177 non-null    int64 
 5   polytechnic                 177 non-null    int64 
 6   prof_qualification_diploma  177 non-null    int64 
 7   university                  177 non-null    int64 
 8   year                        177 non-null    int64 
dtypes: int64(8), object(1)
memory usage: 12.6+ KB


- Observed that there are no missing values.
- Next: 
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [112]:
# Group data by planning area and get the mean of the values.
population_education_status_group_df = population_education_status_df.groupby('planning_area')[['pre_primary'
                                                                   ,'primary'
                                                                   , 'secondary'
                                                                   ,'post_secondary'
                                                                   , 'polytechnic'
                                                                   , 'prof_qualification_diploma'
                                                                   , 'university'                      
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_education_status_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   planning_area               55 non-null     object 
 1   pre_primary                 55 non-null     float64
 2   primary                     55 non-null     float64
 3   secondary                   55 non-null     float64
 4   post_secondary              55 non-null     float64
 5   polytechnic                 55 non-null     float64
 6   prof_qualification_diploma  55 non-null     float64
 7   university                  55 non-null     float64
dtypes: float64(7), object(1)
memory usage: 3.6+ KB


- Observed that the dataset values has been averaged by planning_area
- Next: Export out the dataset as cleaned Data

### Export Population education status cleaned data

In [113]:
# export  to csv
population_education_status_group_df.to_csv('../assets/data_clean/population_education_status_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_education_status_group_df.to_pickle('../assets/data_clean/population_education_status_cleaned.pkl')

- Observed that the files are successfully exported.

## Ethnic Status

- API retrieves data related to ethnic distribution for given planning area name and year for the specified gender.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. chinese
	3. malays
	4. indian
	5. others
	6. year
	7. gender

In [115]:
# Read Dataset
population_ethnic_status_df = pd.read_csv('../assets/onemap/population_ethnic_status.csv')

# Dataset Info
population_ethnic_status_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   planning_area  340 non-null    object
 1   chinese        340 non-null    int64 
 2   malays         340 non-null    int64 
 3   indian         340 non-null    int64 
 4   others         340 non-null    int64 
 5   year           340 non-null    int64 
 6   gender         340 non-null    object
dtypes: int64(5), object(2)
memory usage: 18.7+ KB


- Observed that there are no missing values.
- Next: 
    - Dataset will be averaged by planning_area.
    - Year and Gender column will be dropped from dataset and exported as cleaned dataframe.

In [117]:
# Group data by planning area and get the mean of the values.
population_ethnic_status_group_df = population_ethnic_status_df.groupby('planning_area')[['chinese'
                                                                   ,'malays'
                                                                   ,'indian'
                                                                   , 'others'                    
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_ethnic_status_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   planning_area  55 non-null     object 
 1   chinese        55 non-null     float64
 2   malays         55 non-null     float64
 3   indian         55 non-null     float64
 4   others         55 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.3+ KB


- Observed that the dataset values has been averaged by planning_area
- Next: Export out the dataset as cleaned Data

### Export Population ethnic status cleaned data

In [119]:
# export  to csv
population_ethnic_status_group_df.to_csv('../assets/data_clean/population_ethnic_status_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_ethnic_status_group_df.to_pickle('../assets/data_clean/population_ethnic_status_cleaned.pkl')

- Observed that the files are successfully exported.

## Work Income For Household (Monthly)

- API retrieves data related monthly household income for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. total
	3. below_sgd_1000
	4. no_working_person
	5. sgd_10000_over
	6. sgd_10000_to_10999
	7. sgd_11000_to_11999
	8. sgd_1000_to_1999
	9. sgd_12000_to_12999
	10. sgd_13000_to_13999
	11. sgd_14000_to_14999
	12. sgd_15000_to_17499
	13. sgd_17500_to_19999
	14. sgd_20000_over
	15. sgd_2000_to_2999
	16. sgd_3000_to_3999
	17. sgd_4000_to_4999
	18. sgd_5000_to_5999
	19. sgd_6000_to_6999
	20. sgd_7000_to_7999
	21. sgd_8000_over
	22. sgd_8000_to_8999
	23. sgd_9000_to_9999
	24. year

In [120]:
# Read Dataset
population_household_monthly_income_status_df = pd.read_csv('../assets/onemap/population_household_monthly_income_status.csv')

# Dataset Info
population_household_monthly_income_status_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   planning_area       149 non-null    object 
 1   total               149 non-null    int64  
 2   below_sgd_1000      149 non-null    int64  
 3   no_working_person   117 non-null    float64
 4   sgd_10000_over      89 non-null     float64
 5   sgd_10000_to_10999  82 non-null     float64
 6   sgd_11000_to_11999  82 non-null     float64
 7   sgd_1000_to_1999    149 non-null    int64  
 8   sgd_12000_to_12999  82 non-null     float64
 9   sgd_13000_to_13999  82 non-null     float64
 10  sgd_14000_to_14999  82 non-null     float64
 11  sgd_15000_to_17499  82 non-null     float64
 12  sgd_17500_to_19999  82 non-null     float64
 13  sgd_20000_over      82 non-null     float64
 14  sgd_2000_to_2999    149 non-null    int64  
 15  sgd_3000_to_3999    149 non-null    int64  
 16  sgd_4000

- Observed that there are null values.
- Next: fill null values with 0

In [123]:
# view sample dataset with NaN value.
population_household_monthly_income_status_df[population_household_monthly_income_status_df["sgd_10000_over"].isnull()].head()

Unnamed: 0,planning_area,total,below_sgd_1000,no_working_person,sgd_10000_over,sgd_10000_to_10999,sgd_11000_to_11999,sgd_1000_to_1999,sgd_12000_to_12999,sgd_13000_to_13999,...,sgd_2000_to_2999,sgd_3000_to_3999,sgd_4000_to_4999,sgd_5000_to_5999,sgd_6000_to_6999,sgd_7000_to_7999,sgd_8000_over,sgd_8000_to_8999,sgd_9000_to_9999,year
0,Bedok,74833,10217,,,,,9496,,,...,9733,8560,6923,5192,4436,3386,16891.0,,,2000
2,Bedok,92200,2200,10000.0,,2700.0,3000.0,7200,3300.0,2600.0,...,4100,5300,5300,6200,5200,3200,,4400.0,2900.0,2015
4,Bukit Timah,16812,1730,,,,,412,,,...,650,832,612,764,702,694,10416.0,,,2000
6,Bukit Timah,23900,300,2900.0,,900.0,400.0,700,700.0,400.0,...,400,400,200,300,300,700,,700.0,200.0,2015
8,Bukit Batok,36253,2987,,,,,4490,,,...,5282,4916,3749,3135,2518,1961,7216.0,,,2000


In [135]:
# fill dataframe with 0 where na
population_household_monthly_income_status_df = population_household_monthly_income_status_df.fillna(0)

# view dataframe info
print(population_household_monthly_income_status_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   planning_area       149 non-null    object 
 1   total               149 non-null    int64  
 2   below_sgd_1000      149 non-null    int64  
 3   no_working_person   149 non-null    float64
 4   sgd_10000_over      149 non-null    float64
 5   sgd_10000_to_10999  149 non-null    float64
 6   sgd_11000_to_11999  149 non-null    float64
 7   sgd_1000_to_1999    149 non-null    int64  
 8   sgd_12000_to_12999  149 non-null    float64
 9   sgd_13000_to_13999  149 non-null    float64
 10  sgd_14000_to_14999  149 non-null    float64
 11  sgd_15000_to_17499  149 non-null    float64
 12  sgd_17500_to_19999  149 non-null    float64
 13  sgd_20000_over      149 non-null    float64
 14  sgd_2000_to_2999    149 non-null    int64  
 15  sgd_3000_to_3999    149 non-null    int64  
 16  sgd_4000

- Observed that null values have been filled with 0.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [131]:
# Group data by planning area and get the mean of the values.
population_household_monthly_income_status_group_df = population_household_monthly_income_status_df.groupby('planning_area')[
                                                                  ['total'
                                                                   ,'below_sgd_1000'
                                                                   ,'no_working_person'
                                                                   , 'sgd_10000_over'
                                                                   , 'sgd_10000_to_10999'
                                                                   
                                                                   , 'sgd_11000_to_11999' 
                                                                   , 'sgd_1000_to_1999' 
                                                                   , 'sgd_12000_to_12999' 
                                                                   , 'sgd_13000_to_13999'                                                                   
                                                                   , 'sgd_14000_to_14999' 
                                                                   
                                                                   , 'sgd_15000_to_17499' 
                                                                   , 'sgd_17500_to_19999' 
                                                                   , 'sgd_20000_over' 
                                                                   , 'sgd_2000_to_2999' 
                                                                   , 'sgd_3000_to_3999'
                                                                   
                                                                   , 'sgd_4000_to_4999' 
                                                                   , 'sgd_5000_to_5999' 
                                                                   , 'sgd_6000_to_6999' 
                                                                   , 'sgd_7000_to_7999' 
                                                                   , 'sgd_8000_over' 
                                                                   
                                                                   , 'sgd_8000_to_8999'
                                                                   , 'sgd_9000_to_9999'
                                                                  
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_household_monthly_income_status_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   planning_area       54 non-null     object 
 1   total               54 non-null     float64
 2   below_sgd_1000      54 non-null     float64
 3   no_working_person   54 non-null     float64
 4   sgd_10000_over      54 non-null     float64
 5   sgd_10000_to_10999  54 non-null     float64
 6   sgd_11000_to_11999  54 non-null     float64
 7   sgd_1000_to_1999    54 non-null     float64
 8   sgd_12000_to_12999  54 non-null     float64
 9   sgd_13000_to_13999  54 non-null     float64
 10  sgd_14000_to_14999  54 non-null     float64
 11  sgd_15000_to_17499  54 non-null     float64
 12  sgd_17500_to_19999  54 non-null     float64
 13  sgd_20000_over      54 non-null     float64
 14  sgd_2000_to_2999    54 non-null     float64
 15  sgd_3000_to_3999    54 non-null     float64
 16  sgd_4000_t

- Observed that the dataset values has been averaged by planning_area
- Next: Export out the dataset as cleaned Data

### Export Population household monthly income status cleaned data

In [132]:
# export  to csv
population_household_monthly_income_status_group_df.to_csv('../assets/data_clean/population_household_monthly_income_status_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_household_monthly_income_status_group_df.to_pickle('../assets/data_clean/population_household_monthly_income_status_cleaned.pkl')

- Observed that the files are successfully exported.

## Population household structure data

- API retrieves data related to household structure for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. no_family_nucleus
	3. ofn_1_gen
	4. ofn_2_gen
	5. ofn_3_more_gen
	6. tfn_1to2_gen
	7. tfn_3_more_gen
	8. three_more_fam_nucleus
	9. year

In [138]:
# Read Dataset
population_household_structure_df = pd.read_csv('../assets/onemap/population_household_structure.csv')

# Dataset Info
population_household_structure_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   planning_area           177 non-null    object 
 1   no_family_nucleus       177 non-null    int64  
 2   ofn_1_gen               177 non-null    int64  
 3   ofn_2_gen               177 non-null    int64  
 4   ofn_3_more_gen          177 non-null    int64  
 5   tfn_1to2_gen            122 non-null    float64
 6   tfn_3_more_gen          122 non-null    float64
 7   three_more_fam_nucleus  122 non-null    float64
 8   year                    177 non-null    int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 12.6+ KB


In [139]:
# fill dataframe with 0 where na
population_household_structure_df = population_household_structure_df.fillna(0)

# view dataframe info
print(population_household_structure_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   planning_area           177 non-null    object 
 1   no_family_nucleus       177 non-null    int64  
 2   ofn_1_gen               177 non-null    int64  
 3   ofn_2_gen               177 non-null    int64  
 4   ofn_3_more_gen          177 non-null    int64  
 5   tfn_1to2_gen            177 non-null    float64
 6   tfn_3_more_gen          177 non-null    float64
 7   three_more_fam_nucleus  177 non-null    float64
 8   year                    177 non-null    int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 12.6+ KB
None


- Observed that null values have been filled with 0.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [140]:
# Group data by planning area and get the mean of the values.
population_household_structure_group_df = population_household_structure_df.groupby('planning_area')[
                                                                  ['no_family_nucleus'
                                                                   ,'ofn_1_gen'
                                                                   ,'ofn_2_gen'
                                                                   , 'ofn_3_more_gen'
                                                                   , 'tfn_1to2_gen'
                                                                   
                                                                   , 'tfn_3_more_gen' 
                                                                   , 'three_more_fam_nucleus'                                                                   
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_household_structure_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   planning_area           55 non-null     object 
 1   no_family_nucleus       55 non-null     float64
 2   ofn_1_gen               55 non-null     float64
 3   ofn_2_gen               55 non-null     float64
 4   ofn_3_more_gen          55 non-null     float64
 5   tfn_1to2_gen            55 non-null     float64
 6   tfn_3_more_gen          55 non-null     float64
 7   three_more_fam_nucleus  55 non-null     float64
dtypes: float64(7), object(1)
memory usage: 3.6+ KB


- Observed that the dataset values has been averaged by planning_area
- Next: Export out the dataset as cleaned Data

### Export Population household structure cleaned data

In [142]:
# export  to csv
population_household_structure_group_df.to_csv('../assets/data_clean/population_household_structure_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_household_structure_group_df.to_pickle('../assets/data_clean/population_household_structure_cleaned.pkl')

- Observed that the files are successfully exported.

## Income From Work Data

- API retrieves data related income from work data for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. total
	3. below_sgd_1000
	4. sgd_10000_to_10999
	5. sgd_11000_to_11999
	6. sgd_12000_over
	7. sgd_1000_to_1499
	8. sgd_1500_to_1999
	9. sgd_2000_to_2499
	10. sgd_2500_to_2999
	11. sgd_3000_to_3999
	12. sgd_4000_to_4999
	13. sgd_5000_to_5999
	14. sgd_6000_over
	15. sgd_6000_to_6999
	16. sgd_7000_to_7999
	17. sgd_8000_over
	18. sgd_8000_to_8999
	19. sgd_9000_to_9999
	20. sgd_1000_to_1999
	21. sgd_2000_to_2999
	22. sgd_12000_14999
	23. sgd_15000_over
	24. year

In [143]:
# Read Dataset
population_income_from_work_df = pd.read_csv('../assets/onemap/population_income_from_work.csv')

# Dataset Info
population_income_from_work_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   planning_area       150 non-null    object 
 1   total               150 non-null    int64  
 2   below_sgd_1000      150 non-null    int64  
 3   sgd_10000_to_10999  83 non-null     float64
 4   sgd_11000_to_11999  83 non-null     float64
 5   sgd_12000_over      83 non-null     float64
 6   sgd_1000_to_1499    150 non-null    int64  
 7   sgd_1500_to_1999    150 non-null    int64  
 8   sgd_2000_to_2499    150 non-null    int64  
 9   sgd_2500_to_2999    150 non-null    int64  
 10  sgd_3000_to_3999    150 non-null    int64  
 11  sgd_4000_to_4999    150 non-null    int64  
 12  sgd_5000_to_5999    150 non-null    int64  
 13  sgd_6000_over       87 non-null     float64
 14  sgd_6000_to_6999    118 non-null    float64
 15  sgd_7000_to_7999    118 non-null    float64
 16  sgd_8000

- Observed that there are null values.
- Next: fill null values with 0

In [145]:
# fill dataframe with 0 where na
population_income_from_work_df = population_income_from_work_df.fillna(0)

# view dataframe info
print(population_income_from_work_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   planning_area       150 non-null    object 
 1   total               150 non-null    int64  
 2   below_sgd_1000      150 non-null    int64  
 3   sgd_10000_to_10999  150 non-null    float64
 4   sgd_11000_to_11999  150 non-null    float64
 5   sgd_12000_over      150 non-null    float64
 6   sgd_1000_to_1499    150 non-null    int64  
 7   sgd_1500_to_1999    150 non-null    int64  
 8   sgd_2000_to_2499    150 non-null    int64  
 9   sgd_2500_to_2999    150 non-null    int64  
 10  sgd_3000_to_3999    150 non-null    int64  
 11  sgd_4000_to_4999    150 non-null    int64  
 12  sgd_5000_to_5999    150 non-null    int64  
 13  sgd_6000_over       150 non-null    float64
 14  sgd_6000_to_6999    150 non-null    float64
 15  sgd_7000_to_7999    150 non-null    float64
 16  sgd_8000

- Observed that null values have been filled with 0.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [150]:
# Group data by planning area and get the mean of the values.
population_income_from_work_group_df = population_income_from_work_df.groupby('planning_area')[
                                                                  ['total'
                                                                   ,'below_sgd_1000'
                                                                   ,'sgd_10000_to_10999'
                                                                   , 'sgd_11000_to_11999'
                                                                   , 'sgd_12000_over'
                                                                   
                                                                   , 'sgd_1000_to_1499' 
                                                                   , 'sgd_1000_to_1999' 
                                                                   , 'sgd_2000_to_2499' 
                                                                   , 'sgd_2500_to_2999'                                                                   
                                                                   , 'sgd_3000_to_3999' 
                                                                   
                                                                   , 'sgd_4000_to_4999' 
                                                                   , 'sgd_5000_to_5999' 
                                                                   , 'sgd_6000_over' 
                                                                   , 'sgd_6000_to_6999' 
                                                                   , 'sgd_7000_to_7999'
                                                                   
                                                                   , 'sgd_8000_over' 
                                                                   , 'sgd_8000_to_8999' 
                                                                   , 'sgd_9000_to_9999' 
                                                                   , 'sgd_1000_to_1999' 
                                                                   , 'sgd_2000_to_2999' 
                                                                   
                                                                   , 'sgd_12000_14999'
                                                                   , 'sgd_15000_over'
                                                                  
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_income_from_work_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   planning_area       55 non-null     object 
 1   total               55 non-null     float64
 2   below_sgd_1000      55 non-null     float64
 3   sgd_10000_to_10999  55 non-null     float64
 4   sgd_11000_to_11999  55 non-null     float64
 5   sgd_12000_over      55 non-null     float64
 6   sgd_1000_to_1499    55 non-null     float64
 7   sgd_1000_to_1999    55 non-null     float64
 8   sgd_2000_to_2499    55 non-null     float64
 9   sgd_2500_to_2999    55 non-null     float64
 10  sgd_3000_to_3999    55 non-null     float64
 11  sgd_4000_to_4999    55 non-null     float64
 12  sgd_5000_to_5999    55 non-null     float64
 13  sgd_6000_over       55 non-null     float64
 14  sgd_6000_to_6999    55 non-null     float64
 15  sgd_7000_to_7999    55 non-null     float64
 16  sgd_8000_o

### Export Income From Work Data cleaned data

In [151]:
# export  to csv
population_income_from_work_group_df.to_csv('../assets/data_clean/population_income_from_work_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_income_from_work_group_df.to_pickle('../assets/data_clean/population_income_from_work_cleaned.pkl')

- Observed that the files are successfully exported.

## Planning Area Industry

- API retrieves data related to industry of population for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. manufacturing
	3. construction
	4. wholesale_retail_trade
	5. transportation_storage
	6. accommodation_food_services
	7. information_communications
	8. financial_insurance_services
	9. real_estate_services
	10. professional_services
	11. admin_support_services
	12. public_admin_education
	13. health_social_services
	14. arts_entertainment_recreation
	15. other_comm_social_personal
	16. others
	17. hotels_restaurants
	18. transport_communications
	19. business_services
	20. other_services_industries
	21. year

In [153]:
# Read Dataset
population_planning_area_industry_df = pd.read_csv('../assets/onemap/population_planning_area_industry.csv')

# Dataset Info
population_planning_area_industry_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   planning_area                  177 non-null    object 
 1   manufacturing                  177 non-null    int64  
 2   construction                   177 non-null    int64  
 3   wholesale_retail_trade         177 non-null    int64  
 4   transportation_storage         177 non-null    int64  
 5   accommodation_food_services    177 non-null    int64  
 6   information_communications     177 non-null    int64  
 7   financial_insurance_services   177 non-null    int64  
 8   real_estate_services           177 non-null    int64  
 9   professional_services          177 non-null    int64  
 10  admin_support_services         177 non-null    int64  
 11  public_admin_education         177 non-null    int64  
 12  health_social_services         177 non-null    int

- Observed that there are null values.
- Next: fill null values with 0

In [154]:
# fill dataframe with 0 where na
population_planning_area_industry_df = population_planning_area_industry_df.fillna(0)

# view dataframe info
print(population_planning_area_industry_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   planning_area                  177 non-null    object 
 1   manufacturing                  177 non-null    int64  
 2   construction                   177 non-null    int64  
 3   wholesale_retail_trade         177 non-null    int64  
 4   transportation_storage         177 non-null    int64  
 5   accommodation_food_services    177 non-null    int64  
 6   information_communications     177 non-null    int64  
 7   financial_insurance_services   177 non-null    int64  
 8   real_estate_services           177 non-null    int64  
 9   professional_services          177 non-null    int64  
 10  admin_support_services         177 non-null    int64  
 11  public_admin_education         177 non-null    int64  
 12  health_social_services         177 non-null    int

- Observed that null values have been filled with 0.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [155]:
# Group data by planning area and get the mean of the values.
population_planning_area_industry_group_df = population_planning_area_industry_df.groupby('planning_area')[
                                                                  ['manufacturing'
                                                                   ,'construction'
                                                                   ,'wholesale_retail_trade'
                                                                   , 'transportation_storage'
                                                                   , 'accommodation_food_services'
                                                                   
                                                                   , 'information_communications' 
                                                                   , 'financial_insurance_services' 
                                                                   , 'real_estate_services' 
                                                                   , 'professional_services'                                                                   
                                                                   , 'admin_support_services' 
                                                                   
                                                                   , 'public_admin_education' 
                                                                   , 'health_social_services' 
                                                                   , 'arts_entertainment_recreation' 
                                                                   , 'other_comm_social_personal' 
                                                                   , 'others'
                                                                   
                                                                   , 'hotels_restaurants' 
                                                                   , 'transport_communications' 
                                                                   , 'business_services' 
                                                                   , 'other_services_industries' 
                                                                  
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_planning_area_industry_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   planning_area                  55 non-null     object 
 1   manufacturing                  55 non-null     float64
 2   construction                   55 non-null     float64
 3   wholesale_retail_trade         55 non-null     float64
 4   transportation_storage         55 non-null     float64
 5   accommodation_food_services    55 non-null     float64
 6   information_communications     55 non-null     float64
 7   financial_insurance_services   55 non-null     float64
 8   real_estate_services           55 non-null     float64
 9   professional_services          55 non-null     float64
 10  admin_support_services         55 non-null     float64
 11  public_admin_education         55 non-null     float64
 12  health_social_services         55 non-null     float

### Export Planning Area Industry cleaned data

In [156]:
# export  to csv
population_planning_area_industry_group_df.to_csv('../assets/data_clean/population_planning_area_industry_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_planning_area_industry_group_df.to_pickle('../assets/data_clean/population_planning_area_industry_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Language Literacy

- API retrieves data related to language literacy for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. no_literate
	3. l1_chi
	4. l1_eng
	5. l1_mal
	6. l1_tam
	7. l1_non_off
	8. l2_eng_chi
	9. l2_eng_mal
	10. l2_eng_tam
	11. l2_other_two
	12. l3_eng_chi_mal
	13. l3_eng_mal_tam
	14. l3_other_three
	15. year
	16. l2_eng_non_off


In [157]:
# Read Dataset
population_language_literacy_df = pd.read_csv('../assets/onemap/population_language_literacy.csv')

# Dataset Info
population_language_literacy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   planning_area   177 non-null    object 
 1   no_literate     177 non-null    int64  
 2   l1_chi          177 non-null    int64  
 3   l1_eng          177 non-null    int64  
 4   l1_mal          177 non-null    int64  
 5   l1_tam          177 non-null    int64  
 6   l1_non_off      177 non-null    int64  
 7   l2_eng_chi      177 non-null    int64  
 8   l2_eng_mal      177 non-null    int64  
 9   l2_eng_tam      177 non-null    int64  
 10  l2_other_two    177 non-null    int64  
 11  l3_eng_chi_mal  122 non-null    float64
 12  l3_eng_mal_tam  122 non-null    float64
 13  l3_other_three  67 non-null     float64
 14  year            177 non-null    int64  
 15  l2_eng_non_off  177 non-null    int64  
dtypes: float64(3), int64(12), object(1)
memory usage: 22.2+ KB


- Observed that there are null values.
-  Next: fill null values with 0

In [159]:
# fill dataframe with 0 where na
population_language_literacy_df = population_language_literacy_df.fillna(0)

# view dataframe info
print(population_language_literacy_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   planning_area   177 non-null    object 
 1   no_literate     177 non-null    int64  
 2   l1_chi          177 non-null    int64  
 3   l1_eng          177 non-null    int64  
 4   l1_mal          177 non-null    int64  
 5   l1_tam          177 non-null    int64  
 6   l1_non_off      177 non-null    int64  
 7   l2_eng_chi      177 non-null    int64  
 8   l2_eng_mal      177 non-null    int64  
 9   l2_eng_tam      177 non-null    int64  
 10  l2_other_two    177 non-null    int64  
 11  l3_eng_chi_mal  177 non-null    float64
 12  l3_eng_mal_tam  177 non-null    float64
 13  l3_other_three  177 non-null    float64
 14  year            177 non-null    int64  
 15  l2_eng_non_off  177 non-null    int64  
dtypes: float64(3), int64(12), object(1)
memory usage: 22.2+ KB
None


- Observed that null values have been filled with 0.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [160]:
# Group data by planning area and get the mean of the values.
population_language_literacy_group_df = population_language_literacy_df.groupby('planning_area')[
                                                                  ['no_literate'
                                                                   ,'l1_chi'
                                                                   ,'l1_eng'
                                                                   , 'l1_mal'
                                                                   , 'l1_tam'
                                                                   
                                                                   , 'l1_non_off' 
                                                                   , 'l2_eng_chi' 
                                                                   , 'l2_eng_mal' 
                                                                   , 'l2_eng_tam'                                                                   
                                                                   , 'l2_other_two' 
                                                                   
                                                                   , 'l3_eng_chi_mal' 
                                                                   , 'l3_eng_mal_tam' 
                                                                   , 'l3_other_three' 
                                                                   , 'l2_eng_non_off' 
                                                                 
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_language_literacy_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   planning_area   55 non-null     object 
 1   no_literate     55 non-null     float64
 2   l1_chi          55 non-null     float64
 3   l1_eng          55 non-null     float64
 4   l1_mal          55 non-null     float64
 5   l1_tam          55 non-null     float64
 6   l1_non_off      55 non-null     float64
 7   l2_eng_chi      55 non-null     float64
 8   l2_eng_mal      55 non-null     float64
 9   l2_eng_tam      55 non-null     float64
 10  l2_other_two    55 non-null     float64
 11  l3_eng_chi_mal  55 non-null     float64
 12  l3_eng_mal_tam  55 non-null     float64
 13  l3_other_three  55 non-null     float64
 14  l2_eng_non_off  55 non-null     float64
dtypes: float64(14), object(1)
memory usage: 6.6+ KB


- Observed that null values have been filled with 0.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

### Export Population Language Literacy cleaned data

In [161]:
# export  to csv
population_language_literacy_group_df.to_csv('../assets/data_clean/population_language_literacy_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_language_literacy_group_df.to_pickle('../assets/data_clean/population_language_literacy_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Martial Status
- API retrieves data related to marital status for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. single
	3. married
	4. widowed
	5. divorced
	6. year
	7. gender

In [162]:
# Read Dataset
population_maritial_status_df = pd.read_csv('../assets/onemap/population_martial_status.csv')

# Dataset Info
population_maritial_status_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   planning_area  354 non-null    object
 1   single         354 non-null    int64 
 2   married        354 non-null    int64 
 3   widowed        354 non-null    int64 
 4   divorced       354 non-null    int64 
 5   year           354 non-null    int64 
 6   gender         354 non-null    object
dtypes: int64(5), object(2)
memory usage: 19.5+ KB


- Observed that there are no missing values.
- Next:
    - Dataset will be averaged by planning_area.
    - Year and Gender column will be dropped from dataset and exported as cleaned dataframe.

In [163]:
# Group data by planning area and get the mean of the values.
population_maritial_status_group_df = population_maritial_status_df.groupby('planning_area')[
                                                                  ['single'
                                                                   ,'married'
                                                                   ,'widowed'
                                                                   , 'divorced'
                                                                 
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_maritial_status_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   planning_area  55 non-null     object 
 1   single         55 non-null     float64
 2   married        55 non-null     float64
 3   widowed        55 non-null     float64
 4   divorced       55 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.3+ KB


### Export Population Marital Status cleaned data

In [164]:
# export  to csv
population_language_literacy_group_df.to_csv('../assets/data_clean/population_martial_status_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_language_literacy_group_df.to_pickle('../assets/data_clean/population_martial_status_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Transport Mode To School

- API data related to mode of transport to school for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. bus
	3. mrt
	4. mrt_bus
	5. mrt_car
	6. mrt_other
	7. taxi
	8. car
	9. pvt_chartered_bus
	10. lorry_pickup
	11. motorcycle_scooter
	12. others
	13. no_transport_required
	14. other_combi_mrt_or_bus
	15. mrt_lrt_only
	16. mrt_lrt_and_bus
	17. other_combi_mrt_lrt_or_bus
	18. taxi_pvt_hire_car_only
	19. pvt_chartered_bus_van
	20. year

In [165]:
# Read Dataset
population_transport_mode_school_df = pd.read_csv('../assets/onemap/population_transport_mode_school.csv')

# Dataset Info
population_transport_mode_school_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   planning_area               177 non-null    object
 1   bus                         177 non-null    int64 
 2   mrt                         177 non-null    int64 
 3   mrt_bus                     177 non-null    int64 
 4   mrt_car                     177 non-null    int64 
 5   mrt_other                   177 non-null    int64 
 6   taxi                        177 non-null    int64 
 7   car                         177 non-null    int64 
 8   pvt_chartered_bus           177 non-null    int64 
 9   lorry_pickup                177 non-null    int64 
 10  motorcycle_scooter          177 non-null    int64 
 11  others                      177 non-null    int64 
 12  no_transport_required       177 non-null    int64 
 13  other_combi_mrt_or_bus      177 non-null    int64 

- Observed that there are no missing values.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [166]:
# Group data by planning area and get the mean of the values.
population_transport_mode_school_group_df = population_transport_mode_school_df.groupby('planning_area')[
                                                                  ['bus'
                                                                   ,'mrt'
                                                                   ,'mrt_bus'
                                                                   , 'mrt_car'
                                                                   , 'mrt_other'
                                                                   
                                                                   , 'taxi' 
                                                                   , 'car' 
                                                                   , 'pvt_chartered_bus' 
                                                                   , 'lorry_pickup'                                                                   
                                                                   , 'motorcycle_scooter' 
                                                                   
                                                                   , 'others' 
                                                                   , 'no_transport_required' 
                                                                   , 'other_combi_mrt_or_bus' 
                                                                   , 'mrt_lrt_only'
                                                                   , 'mrt_lrt_and_bus' 
                                                                   
                                                                   , 'other_combi_mrt_lrt_or_bus' 
                                                                   , 'taxi_pvt_hire_car_only' 
                                                                   , 'pvt_chartered_bus_van' 

                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_transport_mode_school_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   planning_area               55 non-null     object 
 1   bus                         55 non-null     float64
 2   mrt                         55 non-null     float64
 3   mrt_bus                     55 non-null     float64
 4   mrt_car                     55 non-null     float64
 5   mrt_other                   55 non-null     float64
 6   taxi                        55 non-null     float64
 7   car                         55 non-null     float64
 8   pvt_chartered_bus           55 non-null     float64
 9   lorry_pickup                55 non-null     float64
 10  motorcycle_scooter          55 non-null     float64
 11  others                      55 non-null     float64
 12  no_transport_required       55 non-null     float64
 13  other_combi_mrt_or_bus      55 non-nu

### Export Population Transport Mode To School cleaned data

In [167]:
# export  to csv
population_transport_mode_school_group_df.to_csv('../assets/data_clean/population_transport_mode_school_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_transport_mode_school_group_df.to_pickle('../assets/data_clean/population_transport_mode_school_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Transport Mode To Work
- API retrieve data related to mode of transport to work for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. bus
	3. mrt
	4. mrt_bus
	5. mrt_car
	6. mrt_other
	7. taxi
	8. car
	9. pvt_chartered_bus
	10. lorry_pickup
	11. motorcycle_scooter
	12. others
	13. no_transport_required
	14. other_combi_mrt_or_bus
	15. mrt_lrt_only
	16. mrt_lrt_and_bus
	17. other_combi_mrt_lrt_or_bus
	18. taxi_pvt_hire_car_only
	19. pvt_chartered_bus_van
	20. year

In [168]:
# Read Dataset
population_transport_mode_work_df = pd.read_csv('../assets/onemap/population_transport_mode_work.csv')

# Dataset Info
population_transport_mode_work_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   planning_area               177 non-null    object
 1   bus                         177 non-null    int64 
 2   mrt                         177 non-null    int64 
 3   mrt_bus                     177 non-null    int64 
 4   mrt_car                     177 non-null    int64 
 5   mrt_other                   177 non-null    int64 
 6   taxi                        177 non-null    int64 
 7   car                         177 non-null    int64 
 8   pvt_chartered_bus           177 non-null    int64 
 9   lorry_pickup                177 non-null    int64 
 10  motorcycle_scooter          177 non-null    int64 
 11  others                      177 non-null    int64 
 12  no_transport_required       177 non-null    int64 
 13  other_combi_mrt_or_bus      177 non-null    int64 

- Observed that there are no missing values.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [169]:
# Group data by planning area and get the mean of the values.
population_transport_mode_work_group_df = population_transport_mode_work_df.groupby('planning_area')[
                                                                  ['bus'
                                                                   ,'mrt'
                                                                   ,'mrt_bus'
                                                                   , 'mrt_car'
                                                                   , 'mrt_other'
                                                                   
                                                                   , 'taxi' 
                                                                   , 'car' 
                                                                   , 'pvt_chartered_bus' 
                                                                   , 'lorry_pickup'                                                                   
                                                                   , 'motorcycle_scooter' 
                                                                   
                                                                   , 'others' 
                                                                   , 'no_transport_required' 
                                                                   , 'other_combi_mrt_or_bus' 
                                                                   , 'mrt_lrt_only'
                                                                   , 'mrt_lrt_and_bus' 
                                                                   
                                                                   , 'other_combi_mrt_lrt_or_bus' 
                                                                   , 'taxi_pvt_hire_car_only' 
                                                                   , 'pvt_chartered_bus_van' 

                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_transport_mode_work_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   planning_area               55 non-null     object 
 1   bus                         55 non-null     float64
 2   mrt                         55 non-null     float64
 3   mrt_bus                     55 non-null     float64
 4   mrt_car                     55 non-null     float64
 5   mrt_other                   55 non-null     float64
 6   taxi                        55 non-null     float64
 7   car                         55 non-null     float64
 8   pvt_chartered_bus           55 non-null     float64
 9   lorry_pickup                55 non-null     float64
 10  motorcycle_scooter          55 non-null     float64
 11  others                      55 non-null     float64
 12  no_transport_required       55 non-null     float64
 13  other_combi_mrt_or_bus      55 non-nu

### Export Population Transport Mode To Work cleaned data

In [170]:
# export  to csv
population_transport_mode_work_group_df.to_csv('../assets/data_clean/population_transport_mode_work_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_transport_mode_work_group_df.to_pickle('../assets/data_clean/population_transport_mode_work_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Age Group
- API retrieves data related to age group for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. age_0_4
	3. age_5_9
	4. age_10_14
	5. age_15_19
	6. age_20_24
	7. age_25_29
	8. age_30_34
	9. age_35_39
	10. age_40_44
	11. age_45_49
	12. age_50_54
	13. age_55_59
	14. age_60_64
	15. age_65_69
	16. age_70_74
	17. age_75_79
	18. age_80_84
	19. age_85_over
	20. total
	21. gender
	22. year

In [175]:
# Read Dataset
population_age_group_df = pd.read_csv('../assets/onemap/population_age_group.csv')

# Dataset Info
population_age_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438 entries, 0 to 437
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   planning_area  438 non-null    object
 1   age_0_4        438 non-null    int64 
 2   age_5_9        438 non-null    int64 
 3   age_10_14      438 non-null    int64 
 4   age_15_19      438 non-null    int64 
 5   age_20_24      438 non-null    int64 
 6   age_25_29      438 non-null    int64 
 7   age_30_34      438 non-null    int64 
 8   age_35_39      438 non-null    int64 
 9   age_40_44      438 non-null    int64 
 10  age_45_49      438 non-null    int64 
 11  age_50_54      438 non-null    int64 
 12  age_55_59      438 non-null    int64 
 13  age_60_64      438 non-null    int64 
 14  age_65_69      438 non-null    int64 
 15  age_70_74      438 non-null    int64 
 16  age_75_79      438 non-null    int64 
 17  age_80_84      438 non-null    int64 
 18  age_85_over    438 non-null   

- Observed that there are no missing values.
- Next:
    - Dataset will be averaged by planning_area.
    - Year and Gender column will be dropped from dataset and exported as cleaned dataframe.

In [176]:
# Group data by planning area and get the mean of the values.
population_age_group_grouped_df = population_age_group_df.groupby('planning_area')[
                                                                  ['age_0_4'
                                                                   ,'age_5_9'
                                                                   ,'age_10_14'
                                                                   , 'age_15_19'
                                                                   , 'age_20_24'
                                                                   
                                                                   , 'age_25_29' 
                                                                   , 'age_30_34' 
                                                                   , 'age_35_39' 
                                                                   , 'age_40_44'                                                                   
                                                                   , 'age_45_49' 
                                                                   
                                                                   , 'age_50_54' 
                                                                   , 'age_55_59' 
                                                                   , 'age_60_64' 
                                                                   , 'age_65_69'
                                                                   , 'age_70_74' 
                                                                   
                                                                   , 'age_75_79' 
                                                                   , 'age_80_84' 
                                                                   , 'age_85_over' 

                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_age_group_grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   planning_area  55 non-null     object 
 1   age_0_4        55 non-null     float64
 2   age_5_9        55 non-null     float64
 3   age_10_14      55 non-null     float64
 4   age_15_19      55 non-null     float64
 5   age_20_24      55 non-null     float64
 6   age_25_29      55 non-null     float64
 7   age_30_34      55 non-null     float64
 8   age_35_39      55 non-null     float64
 9   age_40_44      55 non-null     float64
 10  age_45_49      55 non-null     float64
 11  age_50_54      55 non-null     float64
 12  age_55_59      55 non-null     float64
 13  age_60_64      55 non-null     float64
 14  age_65_69      55 non-null     float64
 15  age_70_74      55 non-null     float64
 16  age_75_79      55 non-null     float64
 17  age_80_84      55 non-null     float64
 18  age_85_over 

### Export Population Age Group cleaned data

In [177]:
# export  to csv
population_age_group_grouped_df.to_csv('../assets/data_clean/population_age_group_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_age_group_grouped_df.to_pickle('../assets/data_clean/population_age_group_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Religion

- API retrieves data related to religion for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. no_religion
	3. buddhism
	4. taoism
	5. islam
	6. hinduism
	7. sikhism
	8. catholic_christian
	9. other_christians
	10. other_religions
	11. year

In [174]:
# Read Dataset
population_religion_df = pd.read_csv('../assets/onemap/population_religion.csv')

# Dataset Info
population_religion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   planning_area       177 non-null    object
 1   no_religion         177 non-null    int64 
 2   buddhism            177 non-null    int64 
 3   taoism              177 non-null    int64 
 4   islam               177 non-null    int64 
 5   hinduism            177 non-null    int64 
 6   sikhism             177 non-null    int64 
 7   catholic_christian  177 non-null    int64 
 8   other_christians    177 non-null    int64 
 9   other_religions     177 non-null    int64 
 10  year                177 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 15.3+ KB


- Observed that there are no missing values.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [178]:
# Group data by planning area and get the mean of the values.
population_religion_group_df = population_religion_df.groupby('planning_area')[
                                                                  ['no_religion'
                                                                   ,'buddhism'
                                                                   ,'taoism'
                                                                   , 'islam'
                                                                   , 'hinduism'
                                                                   
                                                                   , 'sikhism' 
                                                                   , 'catholic_christian' 
                                                                   , 'other_christians' 
                                                                   , 'other_religions'                                                                  
 
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_religion_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   planning_area       55 non-null     object 
 1   no_religion         55 non-null     float64
 2   buddhism            55 non-null     float64
 3   taoism              55 non-null     float64
 4   islam               55 non-null     float64
 5   hinduism            55 non-null     float64
 6   sikhism             55 non-null     float64
 7   catholic_christian  55 non-null     float64
 8   other_christians    55 non-null     float64
 9   other_religions     55 non-null     float64
dtypes: float64(9), object(1)
memory usage: 4.4+ KB


### Export Population Religion cleaned data

In [179]:
# export  to csv
population_religion_group_df.to_csv('../assets/data_clean/population_religion_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_religion_group_df.to_pickle('../assets/data_clean/population_religion_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Spoken Language
- API retrieves data related to spoken language for given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. english
	3. mandarin
	4. chinese_dialects
	5. malay
	6. tamil
	7. other_indian_languages
	8. others
	9. eng_mand
	10. eng_chn_dlt
	11. eng_mly
	12. eng_oth_ind_lang
	13. eng_oth_lang
	14. mand_eng
	15. mand_chn_dlt
	16. mand_oth_lang
	17. chn_dlt_eng
	18. chn_dlt_mand
	19. chn_dlt_oth_lang
	20. mly_eng
	21. mly_oth_lang
	22. tml_eng
	23. tml_oth_lang
	24. oth_ind_lang_eng
	25. oth_ind_lang_oth_lang
	26. oth_lang_eng
	27. oth_lang_oth_non_eng_lang
	28. eng_tml
	29. year

In [180]:
# Read Dataset
population_spoken_language_df = pd.read_csv('../assets/onemap/population_spoken_language.csv')

# Dataset Info
population_spoken_language_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   planning_area              177 non-null    object
 1   english                    177 non-null    int64 
 2   mandarin                   177 non-null    int64 
 3   chinese_dialects           177 non-null    int64 
 4   malay                      177 non-null    int64 
 5   tamil                      177 non-null    int64 
 6   other_indian_languages     177 non-null    int64 
 7   others                     177 non-null    int64 
 8   eng_mand                   177 non-null    int64 
 9   eng_chn_dlt                177 non-null    int64 
 10  eng_mly                    177 non-null    int64 
 11  eng_oth_ind_lang           177 non-null    int64 
 12  eng_oth_lang               177 non-null    int64 
 13  mand_eng                   177 non-null    int64 
 14  mand_chn_d

- Observed that there are no missing values.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [182]:
# Group data by planning area and get the mean of the values.
population_spoken_language_group_df = population_spoken_language_df.groupby('planning_area')[
                                                                  ['english'
                                                                   ,'mandarin'
                                                                   ,'chinese_dialects'
                                                                   , 'malay'
                                                                   , 'tamil'
                                                                   
                                                                   , 'other_indian_languages' 
                                                                   , 'others' 
                                                                   , 'eng_mand' 
                                                                   , 'eng_chn_dlt'
                                                                   , 'eng_mly'
                                                                   
                                                                   , 'eng_oth_ind_lang' 
                                                                   , 'eng_oth_lang' 
                                                                   , 'mand_eng' 
                                                                   , 'mand_chn_dlt'
                                                                   , 'mand_oth_lang'
                                                                   
                                                                   , 'chn_dlt_eng' 
                                                                   , 'chn_dlt_mand' 
                                                                   , 'chn_dlt_oth_lang' 
                                                                   , 'mly_eng'
                                                                   , 'mly_oth_lang' 
                                                                   
                                                                   , 'tml_eng' 
                                                                   , 'tml_oth_lang' 
                                                                   , 'oth_ind_lang_eng' 
                                                                   , 'oth_ind_lang_oth_lang'
                                                                   , 'oth_lang_eng'                                                                     

                                                                                                                                      , 'oth_ind_lang_oth_lang'
                                                                   , 'oth_lang_oth_non_eng_lang'
                                                                   , 'eng_tml'
                                                            
                                                                   
 
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_spoken_language_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   planning_area              55 non-null     object 
 1   english                    55 non-null     float64
 2   mandarin                   55 non-null     float64
 3   chinese_dialects           55 non-null     float64
 4   malay                      55 non-null     float64
 5   tamil                      55 non-null     float64
 6   other_indian_languages     55 non-null     float64
 7   others                     55 non-null     float64
 8   eng_mand                   55 non-null     float64
 9   eng_chn_dlt                55 non-null     float64
 10  eng_mly                    55 non-null     float64
 11  eng_oth_ind_lang           55 non-null     float64
 12  eng_oth_lang               55 non-null     float64
 13  mand_eng                   55 non-null     float64
 

### Export Population Spoken Language cleaned data

In [183]:
# export  to csv
population_spoken_language_group_df.to_csv('../assets/data_clean/population_spoken_language_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_spoken_language_group_df.to_pickle('../assets/data_clean/population_spoken_language_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Tenancy

- API retrieves data related to tenancy for the given planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. owner
	3. tenant
	4. others
	5. year

In [184]:
# Read Dataset
population_tenancy_df = pd.read_csv('../assets/onemap/population_tenancy.csv')

# Dataset Info
population_tenancy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   planning_area  177 non-null    object
 1   owner          177 non-null    int64 
 2   tenant         177 non-null    int64 
 3   others         177 non-null    int64 
 4   year           177 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.0+ KB


- Observed that there are no missing values.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [185]:
# Group data by planning area and get the mean of the values.
population_tenancy_group_df = population_tenancy_df.groupby('planning_area')[
                                                                  ['owner'
                                                                   ,'tenant'
                                                                   ,'others'                                                                   
 
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_tenancy_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   planning_area  55 non-null     object 
 1   owner          55 non-null     float64
 2   tenant         55 non-null     float64
 3   others         55 non-null     float64
dtypes: float64(3), object(1)
memory usage: 1.8+ KB


### Export Population Tenancy cleaned data

In [186]:
# export  to csv
population_tenancy_group_df.to_csv('../assets/data_clean/population_tenancy_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_tenancy_group_df.to_pickle('../assets/data_clean/population_tenancy_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Dwelling  Type Household Data
- API retrieves data related to dwelling type household for given the planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. hdb_1_and_2_room_flats
	3. hdb_3_room_flats
	4. hdb_4_room_flats
	5. hdb_5_room_and_executive_flats
	6. condominiums_and_other_apartments
	7. landed_properties
	8. others
	9. year
	10. total_hdb

In [187]:
# Read Dataset
population_dwelling_type_household_df = pd.read_csv('../assets/onemap/population_dwelling_type_household.csv')

# Dataset Info
population_dwelling_type_household_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   planning_area                      150 non-null    object
 1   hdb_1_and_2_room_flats             150 non-null    int64 
 2   hdb_3_room_flats                   150 non-null    int64 
 3   hdb_4_room_flats                   150 non-null    int64 
 4   hdb_5_room_and_executive_flats     150 non-null    int64 
 5   condominiums_and_other_apartments  150 non-null    int64 
 6   landed_properties                  150 non-null    int64 
 7   others                             150 non-null    int64 
 8   year                               150 non-null    int64 
 9   total_hdb                          150 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 11.8+ KB


- Observed that there are no missing values.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [188]:
# Group data by planning area and get the mean of the values.
population_dwelling_type_household_group_df = population_dwelling_type_household_df.groupby('planning_area')[
                                                                  ['hdb_1_and_2_room_flats'
                                                                   ,'hdb_3_room_flats'
                                                                   ,'hdb_4_room_flats'
                                                                   , 'hdb_5_room_and_executive_flats'
                                                                   , 'condominiums_and_other_apartments'
                                                                   
                                                                   , 'landed_properties' 
                                                                   , 'others' 
                                                                   , 'total_hdb' 
                                                                   
 
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_dwelling_type_household_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 9 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   planning_area                      55 non-null     object 
 1   hdb_1_and_2_room_flats             55 non-null     float64
 2   hdb_3_room_flats                   55 non-null     float64
 3   hdb_4_room_flats                   55 non-null     float64
 4   hdb_5_room_and_executive_flats     55 non-null     float64
 5   condominiums_and_other_apartments  55 non-null     float64
 6   landed_properties                  55 non-null     float64
 7   others                             55 non-null     float64
 8   total_hdb                          55 non-null     float64
dtypes: float64(8), object(1)
memory usage: 4.0+ KB


### Export Population Dwelling Type Household cleaned data

In [189]:
# export  to csv
population_dwelling_type_household_group_df.to_csv('../assets/data_clean/population_dwelling_type_household_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_dwelling_type_household_group_df.to_pickle('../assets/data_clean/population_dwelling_type_household_cleaned.pkl')

- Observed that the files are successfully exported.

## Population Dwelling  Type Population data
- API retrieves data related to dwelling type population for given the planning area name and year.
- Data is available for the year 2000, 2010, 2015, and 2020.
- Data extracted is exported to csv file with the columns:
	1. planning_area
	2. year
	3. hdb_1_and_2_room_flats
	4. hdb_3_room_flats
	5 hdb_4_room_flats
	6. hdb_5_room_and_executive_flats
	7. condominiums_and_other_apartments
	8. landed_properties
	9. others
	10. total_hdb
	11. total

In [190]:
# Read Dataset
population_dwelling_type_population_df = pd.read_csv('../assets/onemap/population_dwelling_type_population.csv')

# Dataset Info
population_dwelling_type_population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   planning_area                      219 non-null    object
 1   year                               219 non-null    int64 
 2   hdb_1_and_2_room_flats             219 non-null    int64 
 3   hdb_3_room_flats                   219 non-null    int64 
 4   hdb_4_room_flats                   219 non-null    int64 
 5   hdb_5_room_and_executive_flats     219 non-null    int64 
 6   condominiums_and_other_apartments  219 non-null    int64 
 7   landed_properties                  219 non-null    int64 
 8   others                             219 non-null    int64 
 9   total_hdb                          219 non-null    int64 
 10  total                              219 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 18.9+ KB


- Observed that there are no missing values.
- Next:
    - Dataset will be averaged by planning_area.
    - Year column will be dropped from dataset and exported as cleaned dataframe.

In [191]:
# Group data by planning area and get the mean of the values.
population_dwelling_type_population_group_df = population_dwelling_type_population_df.groupby('planning_area')[
                                                                  ['hdb_1_and_2_room_flats'
                                                                   ,'hdb_3_room_flats'
                                                                   ,'hdb_4_room_flats'
                                                                   , 'hdb_5_room_and_executive_flats'
                                                                   , 'condominiums_and_other_apartments'
                                                                   
                                                                   , 'landed_properties' 
                                                                   , 'others' 
                                                                   , 'total_hdb'
                                                                   , 'total'
                                                                   
 
                                                                   ]].mean().reset_index()

# view the information of the dataset.
population_dwelling_type_population_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   planning_area                      55 non-null     object 
 1   hdb_1_and_2_room_flats             55 non-null     float64
 2   hdb_3_room_flats                   55 non-null     float64
 3   hdb_4_room_flats                   55 non-null     float64
 4   hdb_5_room_and_executive_flats     55 non-null     float64
 5   condominiums_and_other_apartments  55 non-null     float64
 6   landed_properties                  55 non-null     float64
 7   others                             55 non-null     float64
 8   total_hdb                          55 non-null     float64
 9   total                              55 non-null     float64
dtypes: float64(9), object(1)
memory usage: 4.4+ KB


### Export Population Dwelling Type Population cleaned data¶

In [192]:
# export  to csv
population_dwelling_type_population_group_df.to_csv('../assets/data_clean/population_dwelling_type_population_cleaned.csv', encoding='utf-8', index=False)

# export as pickle file
population_dwelling_type_population_group_df.to_pickle('../assets/data_clean/population_dwelling_type_population_cleaned.pkl')

- Observed that the files are successfully exported.