In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

import os
files_in_folder = os.listdir('Data/pub_describe')

In [2]:
files_in_folder

['.ipynb_checkpoints',
 'LFS_PUMF_EPA_FGMD_codebook.csv',
 'LFS_PUMF_EPA_FGMD_recordlayout.csv',
 'LFS_PUMF_FGMD_EPA_README_LISEZ MOI.txt']

In [3]:
# https://www150.statcan.gc.ca/n1/en/catalogue/71M0001X
# https://www.statcan.gc.ca/en/statistical-programs/instrument/3701_Q1_V9

df = pd.read_csv('Data/datasets_pub/pub1223.csv')

In [4]:
df.shape

(108024, 60)

In [8]:
df.sample()

Unnamed: 0,REC_NUM,SURVYEAR,SURVMNTH,LFSSTAT,PROV,CMA,AGE_12,AGE_6,SEX,MARSTAT,EDUC,MJH,EVERWORK,FTPTLAST,COWMAIN,IMMIG,NAICS_21,NOC_10,NOC_43,YABSENT,WKSAWAY,PAYAWAY,UHRSMAIN,AHRSMAIN,FTPTMAIN,UTOTHRS,ATOTHRS,HRSAWAY,YAWAY,PAIDOT,UNPAIDOT,XTRAHRS,WHYPT,TENURE,PREVTEN,HRLYEARN,UNION,PERMTEMP,ESTSIZE,FIRMSIZE,DURUNEMP,FLOWUNEM,UNEMFTPT,WHYLEFTO,WHYLEFTN,DURJLESS,AVAILABL,LKPUBAG,LKEMPLOY,LKRELS,LKATADS,LKANSADS,LKOTHERN,PRIORACT,YNOLOOK,TLOLOOK,SCHOOLN,EFAMTYPE,AGYOWNK,FINALWT
39742,39743,2023,12,3,47,0,9,,2,1,2,,2.0,,,2,,,,,,,,,,,,,,,,,,,,,,,,,99.0,4.0,1.0,,,42.0,2.0,,1.0,,1.0,1.0,,1.0,,,1.0,5,,115


In [6]:
df_info = pd.read_csv('Data/pub_describe/LFS_PUMF_EPA_FGMD_codebook.csv', encoding="latin")

In [7]:
df_info['Variable_Variable'].unique()

array(['rec_num', '1-9999999', 'survyear', '1976-', 'survmnth', '01',
       '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12',
       ' lfsstat    ', '1', '2', '3', '4', 'prov', '13', '24', '35', '46',
       '47', '48', '59', 'cma', '5', '6', '7', '8', '9', '0', 'age_12',
       'age_6    ', 'blank', 'sex', 'marstat', 'educ', 'mjh', 'everwork',
       'ftptlast', 'cowmain', 'immig', 'naics_21', '14', '15', '16', '17',
       '18', '19', '20', '21', 'noc_10    ', 'noc_43', '22', '23', '25',
       '26', '27', '28', '29', '30', '31', '32', '33', '34', '36', '37',
       '38', '39', '40', '41', '42', '43', 'yabsent', 'wksaway', 'Jan-99',
       'payaway', 'uhrsmain', '001-990', 'ahrsmain', '000-990',
       'ftptmain', 'utothrs', 'atothrs', 'hrsaway', 'yaway', 'paidot',
       'unpaidot', 'xtrahrs', 'whypt', 'tenure', '001-240', 'prevten',
       'hrlyearn', '000001-999999', 'union', 'permtemp', 'estsize',
       'firmsize', 'durunemp', '01-99', 'flowunem', 'unemftpt',
  

In [12]:
df_info.sample()

Unnamed: 0,Field_Champ,Position_Position,Length_Longueur,Variable_Variable,EnglishLabel_EtiquetteAnglais,FrenchLabel_EtiquetteFrancais,EnglishUniverse_UniversAnglais,FrenchUniverse_UniversFrancais,EnglishNote_NoteAnglais,FrenchNote_NoteFrancais
127,,,,18,"Information, culture and recreation","Information, culture et loisirs",,,"51, 71","51, 71"


In [9]:
notes = df_info[['EnglishNote_NoteAnglais']].dropna()

In [11]:
notes['EnglishNote_NoteAnglais'].unique()

array(['Codes based on NAICS 2017 labour variant',
       '111-112, 1100, 1151-1152', '113, 1153', '114', '21, 2100', '22',
       '23', '321, 327, 331-337', '311-316, 322-326, 339', '41', '44-45',
       '48-49', '52', '53', '54', '55-56', '61', '62', '51, 71', '72',
       '81', '91', 'Codes based on NOC 2021 labour variant',
       '00, 10, 20, 30, 40, 50, 60, 70, 80, 90', '14-Nov', '21-22',
       '31-33', '41-45', '51-55', '62-65', '72-75', '82-85', '92-95', '0',
       '10, 20, 30, 40, 50', '60', '70, 80, 90', '111', '112', '12', '13',
       '14', '211', '212', '213', '311', '312', '313', '32', '33', '411',
       '412', '413', '414', '421', '422', '43', '51', '54-55', '63', '64',
       '65', '73', '74', '75', '82-83', '84-85', '92-93', '94', '95',
       'Decimal implied',
       'Before taxes and other deductions. Includes tips, commission and bonuses. Two decimals implied',
       'Includes those with single location',
       'Not known for future starts', 'Links pre-and pos

In [14]:
wages_info = df_info[df_info.EnglishNote_NoteAnglais =='Before taxes and other deductions. Includes tips, commission and bonuses. Two decimals implied']
wages_info

Unnamed: 0,Field_Champ,Position_Position,Length_Longueur,Variable_Variable,EnglishLabel_EtiquetteAnglais,FrenchLabel_EtiquetteFrancais,EnglishUniverse_UniversAnglais,FrenchUniverse_UniversFrancais,EnglishNote_NoteAnglais,FrenchNote_NoteFrancais
253,36.0,83.0,6.0,hrlyearn,Usual hourly wages,Salaire horaire habituel,"Currently employed, employees","Occupés, employés",Before taxes and other deductions. Includes ti...,Avant impôt et autres déductions. Comprend les...


In [15]:
col_desc = list(zip(df_info['Variable_Variable'], df_info['EnglishLabel_EtiquetteAnglais']))
for pair in col_desc:
    print(pair)

('rec_num', 'Order of record in file')
('1-9999999', nan)
('survyear', 'Survey year')
('1976-', nan)
('survmnth', 'Survey month')
('01', 'January')
('02', 'February')
('03', 'March')
('04', 'April')
('05', 'May')
('06', 'June')
('07', 'July')
('08', 'August')
('09', 'September')
('10', 'October')
('11', 'November')
('12', 'December')
(' lfsstat    ', 'Labour force status')
('1', 'Employed, at work')
('2', 'Employed, absent from work')
('3', 'Unemployed')
('4', 'Not in labour force')
('prov', 'Province')
('10', 'Newfoundland and Labrador')
('11', 'Prince Edward Island')
('12', 'Nova Scotia')
('13', 'New Brunswick')
('24', 'Quebec')
('35', 'Ontario')
('46', 'Manitoba')
('47', 'Saskatchewan')
('48', 'Alberta')
('59', 'British Columbia')
('cma', 'Nine largest CMAs')
('1', 'Québec')
('2', 'Montréal')
('3', 'Ottawa\x96Gatineau (Ontario part)')
('4', 'Toronto')
('5', 'Hamilton')
('6', 'Winnipeg')
('7', 'Calgary')
('8', 'Edmonton')
('9', 'Vancouver')
('0', 'Other CMA or non-CMA')
('age_12', 'F

In [16]:
cols = [col.lower() for col in df.columns]

In [17]:
full_col_names = []
for pair in col_desc:
    for col in cols:
        if pair[0].strip() == col:
            print(col, '-', pair[1])
            full_col_names.append(pair[1])

rec_num - Order of record in file
survyear - Survey year
survmnth - Survey month
lfsstat - Labour force status
prov - Province
cma - Nine largest CMAs
age_12 - Five-year age group of respondent
age_6 - Age in 2 and 3 year groups, 15 to 29
sex - Sex of respondent
marstat - Marital status of respondent
educ - Highest educational attainment
mjh - Single or multiple jobholder
everwork - Identifies if a person has worked in the last year
ftptlast - Full- or part-time status of last job
cowmain - Class of worker, main job
immig - Immigrant status
naics_21 - Industry of main job
noc_10 - Occupation at main job
noc_43 - Occupation at main job
yabsent - Reason of absence, full week
wksaway - Number of weeks absent from work
payaway - Paid for time off, full-week absence only
uhrsmain - Usual hours worked per week at main job
ahrsmain - Actual hours worked per week at main job
ftptmain - Full- or part-time status at main or only job
utothrs - Usual hours worked per week at all jobs
atothrs - Act

In [18]:
print([col.lower().replace(' ', '_').replace('_of_respondent', '').replace('-', '_') for col in full_col_names])

['order_of_record_in_file', 'survey_year', 'survey_month', 'labour_force_status', 'province', 'nine_largest_cmas', 'five_year_age_group', 'age_in_2_and_3_year_groups,_15_to_29', 'sex', 'marital_status', 'highest_educational_attainment', 'single_or_multiple_jobholder', 'identifies_if_a_person_has_worked_in_the_last_year', 'full__or_part_time_status_of_last_job', 'class_of_worker,_main_job', 'immigrant_status', 'industry_of_main_job', 'occupation_at_main_job', 'occupation_at_main_job', 'reason_of_absence,_full_week', 'number_of_weeks_absent_from_work', 'paid_for_time_off,_full_week_absence_only', 'usual_hours_worked_per_week_at_main_job', 'actual_hours_worked_per_week_at_main_job', 'full__or_part_time_status_at_main_or_only_job', 'usual_hours_worked_per_week_at_all_jobs', 'actual_hours_worked_per_week_at_all_jobs', 'hours_away_from_work,_part_week_absence_only', 'reason_for_part_week_absence', 'paid_overtime_hours_in_reference_week', 'unpaid_overtime_hours_in_reference_week', 'number_of_

In [19]:
df.columns = ['record_order', 'survey_year', 'survey_month', 'labour_force_status', 'province', 'nine_largest_cmas', 
 'five_year_age_group', 'age_group_15_29', 'sex', 'marital_status', 'highest_educational_attainment', 
 'single_or_multiple_jobholder', 'person_has_worked_in_last_year', 'full_or_part_time_last_job', 'class_of_worker_main_job', 
 'immigrant_status', 'industry_of_main_job', 'occupation_at_main_job_col1', 'occupation_at_main_job_col2', 
 'reason_of_absence_full_week', 'number_of_weeks_absent_from_work', 'paid_for_time_off_full_week_absence', 
 'usual_hours_worked_per_week_at_main_job', 'actual_hours_worked_per_week_at_main_job', 
 'full_or_part_time_main_job', 'usual_hours_worked_per_week_all_jobs', 'actual_hours_worked_per_week_all_jobs', 
 'hours_away_from_work_part_week_absence', 'reason_for_part_week_absence', 'paid_overtime_hours_in_reference_week', 
 'unpaid_overtime_hours_in_reference_week', 'number_of_overtime_or_extra_hours_worked', 'reason_for_part_time_work', 
 'job_tenure_months_current_employer', 'job_tenure_previous_employer', 'usual_hourly_wages', 'union_status', 'job_permanency', 
 'establishment_size', 'firm_size', 'duration_of_unemployment_weeks', 'flows_into_unemployment', 
 'job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job', 
 'reason_for_leaving_job_previous_year_col1', 'reason_for_leaving_job_previous_year_col2', 'duration_of_joblessness_months', 
 'availability_during_reference_week', 'unemployed,_used_public_employment_agency', 
 'unemployed,_checked_with_employers_directly', 'unemployed,_checked_with_friends_or_relatives', 
 'unemployed,_looked_at_job_ads', 'unemployed,_placed_or_answered_ads', 'unemployed,_other_methods', 
 'main_activity_before_started_looking_for_work', 'reason_for_not_looking_for_work_during_the_reference_week', 
 'temporary_layoff,_looked_for_work_during_the_last_four_weeks', 'current_student_status', 'type_of_economic_family', 
 'age_of_youngest_child', 'standard_final_weight']

In [20]:
df['number_of_weeks_absent_from_work'].isnull().sum()/len(df), df['reason_of_absence_full_week'].isnull().sum()/len(df), df['hours_away_from_work_part_week_absence'].isnull().sum()/len(df)

(0.9558986891801822, 0.9558986891801822, 0.5250592460934607)

In [53]:
df.columns = [col.lower().replace(' ', '_').replace('_of_respondent', '') for col in full_col_names]

In [54]:
df.sample()

Unnamed: 0,order_of_record_in_file,survey_year,survey_month,labour_force_status,province,nine_largest_cmas,five-year_age_group,"age_in_2_and_3_year_groups,_15_to_29",sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,identifies_if_a_person_has_worked_in_the_last_year,full-_or_part-time_status_of_last_job,"class_of_worker,_main_job",immigrant_status,industry_of_main_job,occupation_at_main_job,occupation_at_main_job.1,"reason_of_absence,_full_week",number_of_weeks_absent_from_work,"paid_for_time_off,_full-week_absence_only",usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full-_or_part-time_status_at_main_or_only_job,usual_hours_worked_per_week_at_all_jobs,actual_hours_worked_per_week_at_all_jobs,"hours_away_from_work,_part-week_absence_only",reason_for_part-week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part-time_work,job_tenure_with_current_employer,job_tenure_with_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_during_previous_year_,reason_for_leaving_job_during_previous_year_.1,duration_of_joblessness,availability_during_the_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
25251,25252,2023,12,4,35,0,12,,1,1,0,,2.0,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,93.0,,,,,,,,,,,,8,,171


In [27]:
df_desc = df.copy()

In [30]:
df_desc

Unnamed: 0,record_order,survey_year,survey_month,labour_force_status,province,nine_largest_cmas,five_year_age_group,age_group_15_29,sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,person_has_worked_in_last_year,full_or_part_time_last_job,class_of_worker_main_job,immigrant_status,industry_of_main_job,occupation_at_main_job_col1,occupation_at_main_job_col2,reason_of_absence_full_week,number_of_weeks_absent_from_work,paid_for_time_off_full_week_absence,usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full_or_part_time_main_job,usual_hours_worked_per_week_all_jobs,actual_hours_worked_per_week_all_jobs,hours_away_from_work_part_week_absence,reason_for_part_week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part_time_work,job_tenure_months_current_employer,job_tenure_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment_weeks,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_previous_year_col1,reason_for_leaving_job_previous_year_col2,duration_of_joblessness_months,availability_during_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
0,1,2023,12,4,12,0,12,,1,5,3,,2.0,,,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,53.0,,,,,,,,,,,,8,,132
1,2,2023,12,4,12,0,3,5.0,2,6,6,,2.0,,,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,86.0,,,,,,,,,2.0,,1.0,1,,102
2,3,2023,12,4,24,1,11,,2,2,5,,3.0,,,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,,455
3,4,2023,12,1,10,0,2,3.0,1,6,2,1.0,,,2.0,3,10.0,7.0,33.0,,,,400.0,400.0,1.0,400.0,400.0,0.0,,0.0,0.0,0.0,,27.0,,2575.0,3.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,1.0,14,,38
4,5,2023,12,1,35,0,7,,2,6,4,1.0,,,1.0,2,16.0,5.0,20.0,,,,400.0,400.0,1.0,400.0,400.0,0.0,,0.0,0.0,0.0,,155.0,,5192.0,1.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,18,,160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108019,108020,2023,12,2,35,0,6,,2,1,5,1.0,,,1.0,3,16.0,5.0,20.0,2.0,74.0,2.0,400.0,0.0,1.0,400.0,0.0,,,,,,,170.0,,3400.0,1.0,1.0,1.0,4.0,,,,,,,,,,,,,,,,,1.0,3,1.0,192
108020,108021,2023,12,1,59,0,4,,1,6,5,1.0,,,6.0,3,17.0,5.0,26.0,,,,60.0,20.0,2.0,60.0,20.0,,,,,,3.0,52.0,,,,,,,,,,,,,,,,,,,,,,,3.0,18,,241
108021,108022,2023,12,1,11,0,7,,1,2,4,1.0,,,2.0,3,11.0,8.0,38.0,,,,400.0,500.0,1.0,400.0,500.0,0.0,,100.0,0.0,100.0,,137.0,,3300.0,1.0,1.0,1.0,4.0,,,,,,,,,,,,,,,,,1.0,5,,15
108022,108023,2023,12,1,46,0,3,6.0,2,6,2,1.0,,,2.0,3,6.0,8.0,36.0,,,,805.0,805.0,1.0,805.0,805.0,0.0,,365.0,0.0,365.0,,89.0,,3400.0,3.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,18,,106


In [None]:
cols_to_divide_by_ten = ['usual_hours_worked_per_week_at_main_job', 
                        'usual_hours_worked_per_week_at_all_jobs',
                        'actual_hours_worked_per_week_at_all_jobs',
                        'hours_away_from_work,_part-week_absence_only',
                        'number_of_overtime_or_extra_hours_worked',
                        'unpaid_overtime_hours_in_reference_week',
                        'paid_overtime_hours_in_reference_week']

numeric_cols = ['number_of_weeks_absent_from_work',  'usual_hours_worked_per_week_at_main_job',
 'actual_hours_worked_per_week_at_main_job', 'usual_hours_worked_per_week_all_jobs',
 'actual_hours_worked_per_week_all_jobs',
 'hours_away_from_work_part_week_absence',  'paid_overtime_hours_in_reference_week',
 'unpaid_overtime_hours_in_reference_week',
 'number_of_overtime_or_extra_hours_worked',
                'job_tenure_months_current_employer', 
 'job_tenure_previous_employer',
 'usual_hourly_wages', 
               'duration_of_unemployment_weeks',
                 'duration_of_joblessness_months',
                'standard_final_weight']

cols_1_or_nan = [ 'unemployed,_used_public_employment_agency',
 'unemployed,_checked_with_employers_directly',
 'unemployed,_checked_with_friends_or_relatives',
 'unemployed,_looked_at_job_ads', 
'unemployed,_placed_or_answered_ads',
 'unemployed,_other_methods']

In [21]:
col_map_dict = {'labour_force_status': {1:'Employed, at work', 2: 'Employed, absent from work', 
                         3: 'Unemployed', 4: 'Not in labour force'}, 
 'province':{10: 'Newfoundland and Labrador', 11: 'Prince Edward Island', 12: 'Nova Scotia', 13: 'New Brunswick', 
             24: 'Quebec', 35: 'Ontario', 46: 'Manitoba', 47: 'Saskatchewan', 48: 'Alberta', 59: 'British Columbia'}, 
 'nine_largest_cmas': {1: 'Québec', 2: 'Montréal', 3: 'Ottawa (Ontario part)', 4: 'Toronto', 5: 'Hamilton', 
                       6: 'Winnipeg', 7: 'Calgary', 8: 'Edmonton', 9: 'Vancouver', 0: 'Other CMA or non-CMA'},
 'five_year_age_group': {1: '15 to 19 years',2: '20 to 24 years',3: '25 to 29 years',
                    4: '30 to 34 years',5: '35 to 39 years',6: '40 to 44 years',7: '45 to 49 years', 8: '50 to 54 years', 
                    9: '55 to 59 years',10: '60 to 64 years',11: '65 to 69 years', 12: '70 and over'}, 
 'age_group_15_29': {1: '15 to 16 years', 2: '17 to 19 years',3: '20 to 21 years', 4: '22 to 24 years',
                     5: '25 to 26 years', 6: '27 to 29 years'}, 
 'sex': {1:'Male', 2:'Female'}, 
 'marital_status': {1: 'Married', 2: 'Living in common-law', 3: 'Widowed', 4: 'Separated', 5: 'Divorced', 
                    6: 'Single, never married'},
 'highest_educational_attainment': {0: '0 to 8 years', 1: 'Some high school', 2: 'High school graduate', 
    3: 'Some postsecondary', 4: 'Postsecondary certificate or diploma', 5: "Bachelor's degree", 6: "Above bachelor's degree"}, 
 'single_or_multiple_jobholder': {1.:'Single', 2.:'Multiple'},
 'person_has_worked_in_last_year': {1.: 'Yes, within last year', 2.: 'Yes, more than 1 year ago', 3.: 'No, never worked'}, 
 'full_or_part_time_last_job': {1.:'Full-time (30 hours or more)', 2.:'Part-time'},
 'class_of_worker_main_job': {1: 'Public sector employees', 2: 'Private sector employees', 
     3: 'Self-employed incorporated, with paid help', 4: 'Self-employed incorporated, no paid help',
     5: 'Self-employed unincorporated, with paid help', 6: 'Self-employed unincorporated, no paid help',
                              7: 'Unpaid family worker'},
 'immigrant_status': {1:'Immigrant, less than 10 years in Canada', 2:'Immigrant, more than 10 years in Canada', 
                      3:'Non-immigrant'}, 
 'industry_of_main_job': {1: 'Agriculture', 2: 'Forestry and logging and support activities for forestry',
        3: 'Fishing, hunting and trapping', 4: 'Mining, quarrying, and oil and gas extraction',5: 'Utilities', 
        6: 'Construction', 7: 'Manufacturing - durable goods',8: 'Manufacturing - non-durable goods',
        9: 'Wholesale trade', 10: 'Retail trade', 11: 'Transportation and warehousing', 12: 'Finance and insurance',
        13: 'Real estate and rental and leasing',14: 'Professional, scientific and technical services',
        15: 'Business, building and other support services', 16: 'Educational services',
        17: 'Health care and social assistance', 18: 'Information, culture and recreation',
        19: 'Accommodation and food services', 20: 'Other services (except public administration)', 
                          21: 'Public administration'},
 'occupation_at_main_job_col1': {1: 'Management occupations', 
        2: 'Business, finance and administration occupations, except management',
        3: 'Natural and applied sciences and related occupations, except management',
        4: 'Health occupations, except management',
        5: 'Occupations in education, law and social, community and government services, except management',
        6: 'Occupations in art, culture, recreation and sport, except management',
        7: 'Sales and service occupations, except management',
        8: 'Trades, transport and equipment operators and related occupations, except management',
        9: 'Natural resources, agriculture and related production occupations, except management',
       10: 'Occupations in manufacturing and utilities, except management'}, 
 'occupation_at_main_job_col2': {1: 'Legislative and senior management occupations',  
        2: 'Specialized middle management occupations', 
        3: 'Middle management occupations in retail and wholesale trade and customer services',
        4: 'Middle management occupations in trades, transportation, production and utilities',
        5: 'Professional occupations in finance', 6: 'Professional occupations in business',
        7: 'Administrative and financial supervisors and specialized administrative occupations',
        8: 'Administrative occupations and transportation logistics occupations',
        9: 'Administrative and financial support and supply chain logistics occupations',
        10: 'Professional occupations in natural sciences',
        11: 'Professional occupations in applied sciences (except engineering)',
        12: 'Professional occupations in engineering', 13: 'Technical occupations related to natural and applied sciences',
        14: 'Health treating and consultation services professionals',
        15: 'Therapy and assessment professionals', 16: 'Nursing and allied health professionals',
        17: 'Technical occupations in health', 18: 'Assisting occupations in support of health services',
        19: 'Professional occupations in law', 20: 'Professional occupations in education services',
        21: 'Professional occupations in social and community services', 22: 'Professional occupations in government services',
        23: 'Occupations in front-line public protection services',
        24: 'Paraprofessional occupations in legal, social, community and education services',
        25: 'Assisting occupations in education and in legal and public protection',
        26: 'Care providers and public protection support occupations and student monitors, crossing guards and related occupations',
        27: 'Professional occupations in art and culture', 28: 'Technical occupations in art, culture and sport',
        29: 'Occupations in art, culture and sport', 30: 'Support occupations in art, culture and sport',
        31: 'Retail sales and service supervisors and specialized occupations in sales and services',
        32: 'Occupations in sales and services', 
        33: 'Sales and service representatives and other customer and personal services occupations',
        34: 'Sales and service support occupations', 35: 'Technical trades and transportation officers and controllers',
        36: 'General trades', 
        37: 'Mail and message distribution, other transport equipment operators and related maintenance workers',
        38: 'Helpers and labourers and other transport drivers, operators and labourers',
        39: 'Supervisors and occupations in natural resources, agriculture and related production',
        40: 'Workers and labourers in natural resources, agriculture and related production',
        41: 'Supervisors, central control and process operators in processing, manufacturing and utilities and aircraft assemblers and inspectors',
        42: 'Machine operators, assemblers and inspectors in processing, manufacturing and printing',
        43: 'Labourers in processing, manufacturing and utilities'},
 'reason_of_absence_full_week': {0: 'Other reasons', 1: 'Own illness or disability', 
                                 2: 'Personal or family responsibilities', 3: 'Vacation'}, 
 'paid_for_time_off_full_week_absence': {1.:'Yes', 2.:'No'},
 'full_or_part_time_main_job': {1:'full_time', 2:'part_time'}, 
 'reason_for_part_week_absence': {0: 'Other reasons', 1: 'Own illness or disability', 
      2: 'Personal or family responsibilities', 3: 'Vacation or civic holiday', 4: 'Working short-time'},
 'reason_for_part_time_work': {0: 'Other reasons', 1: 'Own illness or disability', 2: 'Caring for children',
     3: 'Other personal or family responsibilities', 4: 'Going to school', 5: 'Personal preference',
     6: 'Business conditions or could not find full-time work, looked for full-time work in last month',
     7: 'Business conditions or could not find full-time work, did not look for full-time work in last month'},
 'union_status': {1: 'Union member', 2: 'Not a member but covered by a union contract or collective agreement',
         3: 'Non-unionized'}, 
 'job_permanency': {1: 'Permanent',2: 'Temporary, seasonal job',3: 'Temporary, term or contract job',
         4: 'Temporary, casual or other temporary jobs'},
 'establishment_size': {1: 'Less than 20 employees', 2: '20 to 99 employees', 3: '100 to 500 employees',
                 4: 'More than 500 employees'}, 
 'firm_size': {1: 'Less than 20 employees', 2: '20 to 99 employees', 3: '100 to 500 employees',
                 4: 'More than 500 employees'}, 
 'flows_into_unemployment': {1: 'Job losers, temporary layoff', 2: 'Job losers, permanent layoff', 3: 'Job leavers', 
        4: 'Job leavers/losers (status unknown), worked more than 1 year ago ', 5: 'New entrants', 
        6: 'Re-entrants, worked 1 year ago or less', 7: 'Re-entrants, worked more than 1 year ago', 8: 'Future starts'},
 'job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job': {1: 'Full-time', 
         2: 'Part-time', 3: 'Future starts'},
 'reason_for_leaving_job_previous_year_col1': {0: 'Job leavers, other reasons', 1: 'Job leavers, own illness or disability', 
        2: 'Job leavers, personal or family responsibilities', 3: 'Job leavers, going to school', 
        4: 'Job losers, laid off', 5: 'Job leavers, retired'},
 'reason_for_leaving_job_previous_year_col2': {0: 'Job leavers, other reasons', 1: 'Job leavers, own illness or disability', 
        2: 'Job leavers, caring for children', 3: 'Job leavers, pregnancy', 
        4: 'Job leavers, personal or family responsibilities', 5: 'Job leavers, going to school', 
        6: 'Job leavers, dissatisfied', 7: 'Job leavers, retired', 
        8: 'Job leavers, business sold or closed down (self-employed)', 9: 'Job losers, end of seasonal job (employee)', 
        10: 'Job losers, end of temporary or casual (employee)', 11: 'Job losers, company moved or out of business (employee)', 
        12: 'Job losers, business conditions (employee)', 13: 'Job losers, dismissal or other reasons'}, 
 'availability_during_reference_week': {1: 'Not available', 2: 'Yes, available'},
 'main_activity_before_started_looking_for_work': {0: 'Other', 1: 'Working', 2: 'Managing a home',  3: 'Going to school'},
 'reason_for_not_looking_for_work_during_the_reference_week': {0: 'Wanted work, reason - other',
     1: 'Wanted work, reason - own illness or disability', 2: 'Wanted work, reason - caring for children',
     3: 'Wanted work, reason - other personal or family responsibilities', 4: 'Wanted work, reason - school',
     5: 'Wanted work, reason - awaiting recall or reply', 6: 'Wanted work, reason - discouraged'},
 'temporary_layoff,_looked_for_work_during_the_last_four_weeks': {1:'Yes', 2:'No'},
 'current_student_status': {1: 'Non-student', 2: 'Full-time student', 3: 'Part-time student'}, 
 'type_of_economic_family': {1: 'Person not in an economic family', 2: 'Dual-earner couple, no children or none under 25',
        3: 'Dual-earner couple, youngest child 0 to 17', 4: 'Dual-earner couple, youngest child 18 to 24',
        5: 'Single-earner couple, male employed, no children or none under 25', 
        6: 'Single-earner couple, male employed, youngest child 0 to 17',
        7: 'Single-earner couple, male employed, youngest child 18 to 24',
        8: 'Single-earner couple, female employed, no children or none under 25',
        9: 'Single-earner couple, female employed, youngest child 0 to 17',
        10: 'Single-earner couple, female employed, youngest child 18 to 24',
        11: 'Non-earner couple, no children or none under 25', 12: 'Non-earner couple, youngest child 0 to 17',
        13: 'Non-earner couple, youngest child 18 to 24', 14: 'Lone-parent family, parent employed, youngest child 0 to 17',
        15: 'Lone-parent family, parent employed, youngest child 18 to 24', 
        16: 'Lone-parent family, parent not employed, youngest child 0 to 17',
        17: 'Lone-parent family, parent not employed, youngest child 18 to 24', 18: 'Other families'},
 'age_of_youngest_child': {1: 'Youngest child less than 6 years', 2: 'Youngest child 6 to 12 years',
             3: 'Youngest child 13 to 17 years', 4: 'Youngest child 18 to 24 years'}
}

In [23]:
cols_to_map = []
for pair in col_desc:
    cols_to_map.append(pair)

In [24]:
cols_to_map

[('rec_num', 'Order of record in file'),
 ('1-9999999', nan),
 ('survyear', 'Survey year'),
 ('1976-', nan),
 ('survmnth', 'Survey month'),
 ('01', 'January'),
 ('02', 'February'),
 ('03', 'March'),
 ('04', 'April'),
 ('05', 'May'),
 ('06', 'June'),
 ('07', 'July'),
 ('08', 'August'),
 ('09', 'September'),
 ('10', 'October'),
 ('11', 'November'),
 ('12', 'December'),
 (' lfsstat    ', 'Labour force status'),
 ('1', 'Employed, at work'),
 ('2', 'Employed, absent from work'),
 ('3', 'Unemployed'),
 ('4', 'Not in labour force'),
 ('prov', 'Province'),
 ('10', 'Newfoundland and Labrador'),
 ('11', 'Prince Edward Island'),
 ('12', 'Nova Scotia'),
 ('13', 'New Brunswick'),
 ('24', 'Quebec'),
 ('35', 'Ontario'),
 ('46', 'Manitoba'),
 ('47', 'Saskatchewan'),
 ('48', 'Alberta'),
 ('59', 'British Columbia'),
 ('cma', 'Nine largest CMAs'),
 ('1', 'Québec'),
 ('2', 'Montréal'),
 ('3', 'Ottawa\x96Gatineau (Ontario part)'),
 ('4', 'Toronto'),
 ('5', 'Hamilton'),
 ('6', 'Winnipeg'),
 ('7', 'Calgary')

In [33]:
# There is something wrong here. I doubt the average work hours per week is 400. Maybe we should divide by ten.
round(df_desc['usual_hours_worked_per_week_at_main_job'].mean()), df_desc['usual_hours_worked_per_week_at_main_job'].median(), df_desc['usual_hours_worked_per_week_at_main_job'].min(), df_desc['usual_hours_worked_per_week_at_main_job'].max()
df_desc['hours_away_from_work_part_week_absence'].replace(0, np.NaN).median(), df_desc['hours_away_from_work_part_week_absence'].replace(0, np.NaN).mean()
df_desc['hours_away_from_work_part_week_absence'].replace(0, np.NaN).median(), df_desc['hours_away_from_work_part_week_absence'].replace(0, np.NaN).mean()
round(df_desc['actual_hours_worked_per_week_at_all_jobs'].mean()), df_desc['actual_hours_worked_per_week_at_all_jobs'].median(), df_desc['actual_hours_worked_per_week_at_all_jobs'].max()
df_desc['hours_away_from_work_part_week_absence'].max(), df_desc['hours_away_from_work_part_week_absence'].mean(), df_desc['hours_away_from_work_part_week_absence'].median()

KeyError: 'actual_hours_worked_per_week_at_all_jobs'

In [34]:
# These hourly wages seem very high. Maybe they are weekly or monthly wages? Look into this. 
df_desc['usual_hourly_wages'].mean(), df_desc['usual_hourly_wages'].median(), df_desc['usual_hourly_wages'].max() 

(3393.8444295569357, 2900.0, 20337.0)

In [35]:
# Probably answer to this question: ABS_Q162 / EQ 40 – 
# How many weeks had you been continuously absent from work, up to and including [refer to collection period]?

df_desc['number_of_weeks_absent_from_work'].mean(), df_desc['number_of_weeks_absent_from_work'].median(), df_desc['number_of_weeks_absent_from_work'].max()

(15.537783375314861, 4.0, 99.0)

In [36]:
df_desc[df_desc['number_of_weeks_absent_from_work']==99.0]

Unnamed: 0,record_order,survey_year,survey_month,labour_force_status,province,nine_largest_cmas,five_year_age_group,age_group_15_29,sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,person_has_worked_in_last_year,full_or_part_time_last_job,class_of_worker_main_job,immigrant_status,industry_of_main_job,occupation_at_main_job_col1,occupation_at_main_job_col2,reason_of_absence_full_week,number_of_weeks_absent_from_work,paid_for_time_off_full_week_absence,usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full_or_part_time_main_job,usual_hours_worked_per_week_all_jobs,actual_hours_worked_per_week_all_jobs,hours_away_from_work_part_week_absence,reason_for_part_week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part_time_work,job_tenure_months_current_employer,job_tenure_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment_weeks,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_previous_year_col1,reason_for_leaving_job_previous_year_col2,duration_of_joblessness_months,availability_during_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
1028,1029,2023,12,2,47,0,11,,1,1,2,1.0,,,2.0,3,15.0,7.0,33.0,1.0,99.0,2.0,450.0,0.0,1.0,450.0,0.0,,,,,,,50.0,,1500.0,3.0,1.0,3.0,4.0,,,,,,,,,,,,,,,,,,5,,133
1062,1063,2023,12,2,35,0,8,,2,5,4,1.0,,,1.0,3,21.0,2.0,9.0,1.0,99.0,2.0,400.0,0.0,1.0,400.0,0.0,,,,,,,240.0,,4200.0,1.0,3.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,15,4.0,317
1694,1695,2023,12,2,46,0,9,,2,1,2,1.0,,,2.0,3,12.0,7.0,33.0,1.0,99.0,2.0,400.0,0.0,1.0,400.0,0.0,,,,,,,240.0,,1700.0,3.0,1.0,1.0,4.0,,,,,,,,,,,,,,,,,1.0,4,4.0,125
4211,4212,2023,12,2,10,0,9,,1,1,4,1.0,,,1.0,3,16.0,8.0,35.0,1.0,99.0,2.0,400.0,0.0,1.0,400.0,0.0,,,,,,,104.0,,3500.0,1.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,2,,83
7098,7099,2023,12,2,35,0,10,,1,1,4,1.0,,,2.0,3,7.0,3.0,13.0,1.0,99.0,1.0,400.0,0.0,1.0,400.0,0.0,,,,,,,240.0,,3846.0,3.0,1.0,3.0,3.0,,,,,,,,,,,,,,,,,1.0,5,,441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106305,106306,2023,12,2,48,8,8,,2,1,4,1.0,,,1.0,3,17.0,4.0,16.0,1.0,99.0,2.0,525.0,0.0,1.0,525.0,0.0,,,,,,,240.0,,4934.0,1.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,4,4.0,521
106503,106504,2023,12,2,24,0,10,,2,6,2,1.0,,,2.0,3,18.0,7.0,33.0,1.0,99.0,2.0,350.0,0.0,1.0,350.0,0.0,,,,,,,240.0,,2967.0,1.0,1.0,3.0,4.0,,,,,,,,,,,,,,,,,1.0,1,,208
106628,106629,2023,12,2,35,4,9,,2,1,4,1.0,,,2.0,3,12.0,2.0,7.0,1.0,99.0,2.0,120.0,0.0,2.0,120.0,0.0,,,,,,5.0,214.0,,6571.0,3.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,1.0,18,,554
107315,107316,2023,12,2,35,4,8,,2,5,4,1.0,,,2.0,3,11.0,7.0,33.0,1.0,99.0,2.0,400.0,0.0,1.0,400.0,0.0,,,,,,,240.0,,2800.0,1.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,14,3.0,195


In [37]:
col_map_dict

{'labour_force_status': {1: 'Employed, at work',
  2: 'Employed, absent from work',
  3: 'Unemployed',
  4: 'Not in labour force'},
 'province': {10: 'Newfoundland and Labrador',
  11: 'Prince Edward Island',
  12: 'Nova Scotia',
  13: 'New Brunswick',
  24: 'Quebec',
  35: 'Ontario',
  46: 'Manitoba',
  47: 'Saskatchewan',
  48: 'Alberta',
  59: 'British Columbia'},
 'nine_largest_cmas': {1: 'Québec',
  2: 'Montréal',
  3: 'Ottawa (Ontario part)',
  4: 'Toronto',
  5: 'Hamilton',
  6: 'Winnipeg',
  7: 'Calgary',
  8: 'Edmonton',
  9: 'Vancouver',
  0: 'Other CMA or non-CMA'},
 'five_year_age_group': {1: '15 to 19 years',
  2: '20 to 24 years',
  3: '25 to 29 years',
  4: '30 to 34 years',
  5: '35 to 39 years',
  6: '40 to 44 years',
  7: '45 to 49 years',
  8: '50 to 54 years',
  9: '55 to 59 years',
  10: '60 to 64 years',
  11: '65 to 69 years',
  12: '70 and over'},
 'age_group_15_29': {1: '15 to 16 years',
  2: '17 to 19 years',
  3: '20 to 21 years',
  4: '22 to 24 years',
  5: 

In [38]:
for col in df_desc:
    if col in col_map_dict.keys():
        df_desc[col] = df_desc[col].map(col_map_dict[col])

In [39]:
cols_to_divide_by_ten = ['usual_hours_worked_per_week_at_main_job',
'actual_hours_worked_per_week_at_main_job',
'usual_hours_worked_per_week_all_jobs',
'actual_hours_worked_per_week_all_jobs',
'hours_away_from_work_part_week_absence',
'paid_overtime_hours_in_reference_week',
'unpaid_overtime_hours_in_reference_week',
'number_of_overtime_or_extra_hours_worked']

for col in df_desc:
    if 'hours' in col:
        df_desc[col] = df_desc[col]/10

In [40]:
df_desc.sample()

Unnamed: 0,record_order,survey_year,survey_month,labour_force_status,province,nine_largest_cmas,five_year_age_group,age_group_15_29,sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,person_has_worked_in_last_year,full_or_part_time_last_job,class_of_worker_main_job,immigrant_status,industry_of_main_job,occupation_at_main_job_col1,occupation_at_main_job_col2,reason_of_absence_full_week,number_of_weeks_absent_from_work,paid_for_time_off_full_week_absence,usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full_or_part_time_main_job,usual_hours_worked_per_week_all_jobs,actual_hours_worked_per_week_all_jobs,hours_away_from_work_part_week_absence,reason_for_part_week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part_time_work,job_tenure_months_current_employer,job_tenure_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment_weeks,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_previous_year_col1,reason_for_leaving_job_previous_year_col2,duration_of_joblessness_months,availability_during_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
5912,5913,2023,12,Not in labour force,Saskatchewan,Other CMA or non-CMA,70 and over,,Female,Separated,Postsecondary certificate or diploma,,"Yes, more than 1 year ago",,,Non-immigrant,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,79.0,,,,,,,,,,,,Person not in an economic family,,150


In [63]:
df_desc.labour_force_status = df_desc.labour_force_status.map({1:'Employed, at work', 2: 'Employed, absent from work', 
                                 3: 'Unemployed', 4: 'Not in labour force'})

In [None]:
# maybe drop cols related to provinces in canada - it does not matter much for our analysis. 

In [73]:
df_desc['five-year_age_group'] = df_desc['five-year_age_group'].map({1: '15 to 19 years',2: '20 to 24 years',3: '25 to 29 years',
                                    4: '30 to 34 years',5: '35 to 39 years',6: '40 to 44 years',7: '45 to 49 years',
                                    8: '50 to 54 years', 9: '55 to 59 years',10: '60 to 64 years',11: '65 to 69 years',
                                    12: '70 and over'})

In [78]:
df_desc['age_in_2_and_3_year_groups,_15_to_29']=df_desc['age_in_2_and_3_year_groups,_15_to_29'].map({1: '15 to 16 years', 2: '17 to 19 years',3: '20 to 21 years',4: '22 to 24 years',5: '25 to 26 years',
 6: '27 to 29 years'})

In [80]:
df_desc['sex'] = df_desc['sex'].map({1:'Male', 2:'Female'})

In [82]:
df_desc.sample(3)

Unnamed: 0,labour_force_status,province,nine_largest_cmas,five-year_age_group,"age_in_2_and_3_year_groups,_15_to_29",sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,identifies_if_a_person_has_worked_in_the_last_year,full-_or_part-time_status_of_last_job,"class_of_worker,_main_job",immigrant_status,industry_of_main_job,occupation_at_main_job,occupation_at_main_job.1,"reason_of_absence,_full_week",number_of_weeks_absent_from_work,"paid_for_time_off,_full-week_absence_only",usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full-_or_part-time_status_at_main_or_only_job,usual_hours_worked_per_week_at_all_jobs,actual_hours_worked_per_week_at_all_jobs,"hours_away_from_work,_part-week_absence_only",reason_for_part-week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part-time_work,job_tenure_with_current_employer,job_tenure_with_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_during_previous_year_,reason_for_leaving_job_during_previous_year_.1,duration_of_joblessness,availability_during_the_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
58500,"Employed, at work",46,6,50 to 54 years,,Male,5,4,1.0,,,1.0,3,16.0,5.0,20.0,,,,362.0,362.0,1.0,362.0,362.0,0.0,,0.0,0.0,0.0,,77.0,,3800.0,1.0,1.0,1.0,3.0,,,,,,,,,,,,,,,,,1.0,1,,302
72206,"Employed, at work",48,8,15 to 19 years,17 to 19 years,Female,6,1,1.0,,,2.0,3,18.0,7.0,34.0,,,,70.0,70.0,2.0,70.0,70.0,0.0,,0.0,0.0,0.0,4.0,3.0,,1350.0,3.0,1.0,3.0,3.0,,,,,,,,,,,,,,,,,2.0,6,,495
22555,Not in labour force,35,4,60 to 64 years,,Male,1,2,,2.0,,,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,76.0,,,,,,,,,,,1.0,8,,1019


In [85]:
df_desc['marital_status'] = df_desc['marital_status'].map({1: 'Married',
 2: 'Living in common-law',
 3: 'Widowed',
 4: 'Separated',
 5: 'Divorced',
 6: 'Single, never married'})

In [88]:
df_desc['highest_educational_attainment'] = df_desc['highest_educational_attainment'].map({0: '0 to 8 years',
 1: 'Some high school',
 2: 'High school graduate',
 3: 'Some postsecondary',
 4: 'Postsecondary certificate or diploma',
 5: "Bachelor's degree",
 6: "Above bachelor's degree"})

In [91]:
df_desc['single_or_multiple_jobholder'] = df_desc['single_or_multiple_jobholder'].map({1.:'Single', 2.:'Multiple'})

In [97]:
df_desc['identifies_if_a_person_has_worked_in_the_last_year'] = df_desc['identifies_if_a_person_has_worked_in_the_last_year'].map({1.: 'Yes, within last year',2.: 'Yes, more than 1 year ago',3.: 'No, never worked'})

In [103]:
df_desc['full-_or_part-time_status_of_last_job'].isnull().sum()

100719

In [104]:
df_desc['full-_or_part-time_status_of_last_job'] = df_desc['full-_or_part-time_status_of_last_job'].map({1.:'Full-time (30 hours or more)', 2.:'Part-time'})

In [107]:
df_desc['class_of_worker,_main_job']=df_desc['class_of_worker,_main_job'].map({1: 'Public sector employees',
 2: 'Private sector employees',
 3: 'Self-employed incorporated, with paid help',
 4: 'Self-employed incorporated, no paid help',
 5: 'Self-employed unincorporated, with paid help',
 6: 'Self-employed unincorporated, no paid help',
 7: 'Unpaid family worker'})

In [108]:
df_desc.sample(3)

Unnamed: 0,labour_force_status,province,nine_largest_cmas,five-year_age_group,"age_in_2_and_3_year_groups,_15_to_29",sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,identifies_if_a_person_has_worked_in_the_last_year,full-_or_part-time_status_of_last_job,"class_of_worker,_main_job",immigrant_status,industry_of_main_job,occupation_at_main_job,occupation_at_main_job.1,"reason_of_absence,_full_week",number_of_weeks_absent_from_work,"paid_for_time_off,_full-week_absence_only",usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full-_or_part-time_status_at_main_or_only_job,usual_hours_worked_per_week_at_all_jobs,actual_hours_worked_per_week_at_all_jobs,"hours_away_from_work,_part-week_absence_only",reason_for_part-week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part-time_work,job_tenure_with_current_employer,job_tenure_with_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_during_previous_year_,reason_for_leaving_job_during_previous_year_.1,duration_of_joblessness,availability_during_the_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
84347,"Employed, at work",35,0,45 to 49 years,,Male,Separated,Postsecondary certificate or diploma,Single,,,Private sector employees,3,20.0,8.0,36.0,,,,400.0,400.0,1.0,400.0,400.0,0.0,,0.0,0.0,0.0,,8.0,,2400.0,3.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,1.0,14,2.0,130
43960,"Employed, at work",13,0,55 to 59 years,,Female,Married,Above bachelor's degree,Single,,,Public sector employees,3,17.0,4.0,14.0,,,,375.0,375.0,1.0,375.0,375.0,0.0,,0.0,0.0,0.0,,240.0,,4538.0,1.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,2,,168
83236,Not in labour force,24,2,20 to 24 years,22 to 24 years,Female,"Single, never married",Postsecondary certificate or diploma,,"Yes, within last year",Part-time,Private sector employees,2,10.0,7.0,31.0,,,,,,,,,,,,,,,,19.0,,,,,,,,,3.0,5.0,6.0,,,,,,,,,,,2.0,3,,371


In [111]:
df_desc['immigrant_status']=df_desc['immigrant_status'].map({1:'Immigrant, less than 10 years in Canada', 2:'Immigrant, more than 10 years in Canada', 
                                3:'Non-immigrant'})

In [115]:
df_desc['industry_of_main_job'] = df_desc['industry_of_main_job'].map({1: 'Agriculture',
 2: 'Forestry and logging and support activities for forestry',3: 'Fishing, hunting and trapping',
 4: 'Mining, quarrying, and oil and gas extraction',5: 'Utilities',6: 'Construction',
 7: 'Manufacturing - durable goods',8: 'Manufacturing - non-durable goods',9: 'Wholesale trade',
 10: 'Retail trade',11: 'Transportation and warehousing',12: 'Finance and insurance',
 13: 'Real estate and rental and leasing',14: 'Professional, scientific and technical services',
 15: 'Business, building and other support services',16: 'Educational services',
 17: 'Health care and social assistance',18: 'Information, culture and recreation',19: 'Accommodation and food services',
 20: 'Other services (except public administration)',21: 'Public administration'})

In [194]:
df_desc.columns = ['labour_force_status', 'province', 'nine_largest_cmas',
       'five-year_age_group', 'age_in_2_and_3_year_groups,_15_to_29', 'sex',
       'marital_status', 'highest_educational_attainment',
       'single_or_multiple_jobholder',
       'identifies_if_a_person_has_worked_in_the_last_year',
       'full-_or_part-time_status_of_last_job', 'class_of_worker,_main_job',
       'immigrant_status', 'industry_of_main_job', 'occupation_at_main_job_col1',
       'occupation_at_main_job_col2', 'reason_of_absence,_full_week',
       'number_of_weeks_absent_from_work',
       'paid_for_time_off,_full-week_absence_only',
       'usual_hours_worked_per_week_at_main_job',
       'actual_hours_worked_per_week_at_main_job',
       'full-_or_part-time_status_at_main_or_only_job',
       'usual_hours_worked_per_week_at_all_jobs',
       'actual_hours_worked_per_week_at_all_jobs',
       'hours_away_from_work,_part-week_absence_only',
       'reason_for_part-week_absence', 'paid_overtime_hours_in_reference_week',
       'unpaid_overtime_hours_in_reference_week',
       'number_of_overtime_or_extra_hours_worked', 'reason_for_part-time_work',
       'job_tenure_with_current_employer_months', 'job_tenure_with_previous_employer',
       'usual_hourly_wages', 'union_status', 'job_permanency',
       'establishment_size', 'firm_size', 'duration_of_unemployment',
       'flows_into_unemployment',
       'job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job',
       'reason_for_leaving_job_during_previous_year_',
       'reason_for_leaving_job_during_previous_year_',
       'duration_of_joblessness', 'availability_during_the_reference_week',
       'unemployed,_used_public_employment_agency',
       'unemployed,_checked_with_employers_directly',
       'unemployed,_checked_with_friends_or_relatives',
       'unemployed,_looked_at_job_ads', 'unemployed,_placed_or_answered_ads',
       'unemployed,_other_methods',
       'main_activity_before_started_looking_for_work',
       'reason_for_not_looking_for_work_during_the_reference_week',
       'temporary_layoff,_looked_for_work_during_the_last_four_weeks',
       'current_student_status', 'type_of_economic_family',
       'age_of_youngest_child', 'standard_final_weight']

In [119]:
df_desc.sample(3)

Unnamed: 0,labour_force_status,province,nine_largest_cmas,five-year_age_group,"age_in_2_and_3_year_groups,_15_to_29",sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,identifies_if_a_person_has_worked_in_the_last_year,full-_or_part-time_status_of_last_job,"class_of_worker,_main_job",immigrant_status,industry_of_main_job,occupation_at_main_job_col1,occupation_at_main_job_col2,"reason_of_absence,_full_week",number_of_weeks_absent_from_work,"paid_for_time_off,_full-week_absence_only",usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full-_or_part-time_status_at_main_or_only_job,usual_hours_worked_per_week_at_all_jobs,actual_hours_worked_per_week_at_all_jobs,"hours_away_from_work,_part-week_absence_only",reason_for_part-week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part-time_work,job_tenure_with_current_employer,job_tenure_with_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_during_previous_year_,reason_for_leaving_job_during_previous_year_.1,duration_of_joblessness,availability_during_the_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
43893,"Employed, at work",46,6,40 to 44 years,,Male,Married,Postsecondary certificate or diploma,Single,,,Private sector employees,"Immigrant, more than 10 years in Canada",Wholesale trade,3.0,13.0,,,,350.0,490.0,1.0,350.0,490.0,0.0,,140.0,0.0,140.0,,156.0,,3352.0,3.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,6,2.0,209
55136,Not in labour force,46,0,70 and over,,Male,"Single, never married",Bachelor's degree,,"Yes, more than 1 year ago",,,Non-immigrant,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,127.0,,,,,,,,,,,,1,,55
63548,Not in labour force,47,0,45 to 49 years,,Female,Married,Bachelor's degree,,"Yes, more than 1 year ago",,,Non-immigrant,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,188.0,,,,,,,,,,,1.0,6,2.0,113


In [123]:
df_desc['occupation_at_main_job_col1'] = df_desc['occupation_at_main_job_col1'].map({1: 'Management occupations',
 2: 'Business, finance and administration occupations, except management',
 3: 'Natural and applied sciences and related occupations, except management',
 4: 'Health occupations, except management',
 5: 'Occupations in education, law and social, community and government services, except management',
 6: 'Occupations in art, culture, recreation and sport, except management',
 7: 'Sales and service occupations, except management',
 8: 'Trades, transport and equipment operators and related occupations, except management',
 9: 'Natural resources, agriculture and related production occupations, except management',
 10: 'Occupations in manufacturing and utilities, except management'})

In [126]:
df_desc['occupation_at_main_job_col2']=df_desc['occupation_at_main_job_col2'].map({1: 'Legislative and senior management occupations',
 2: 'Specialized middle management occupations',
 3: 'Middle management occupations in retail and wholesale trade and customer services',
 4: 'Middle management occupations in trades, transportation, production and utilities',
 5: 'Professional occupations in finance',
 6: 'Professional occupations in business',
 7: 'Administrative and financial supervisors and specialized administrative occupations',
 8: 'Administrative occupations and transportation logistics occupations',
 9: 'Administrative and financial support and supply chain logistics occupations',
 10: 'Professional occupations in natural sciences',
 11: 'Professional occupations in applied sciences (except engineering)',
 12: 'Professional occupations in engineering',
 13: 'Technical occupations related to natural and applied sciences',
 14: 'Health treating and consultation services professionals',
 15: 'Therapy and assessment professionals',
 16: 'Nursing and allied health professionals',
 17: 'Technical occupations in health',
 18: 'Assisting occupations in support of health services',
 19: 'Professional occupations in law',
 20: 'Professional occupations in education services',
 21: 'Professional occupations in social and community services',
 22: 'Professional occupations in government services',
 23: 'Occupations in front-line public protection services',
 24: 'Paraprofessional occupations in legal, social, community and education services',
 25: 'Assisting occupations in education and in legal and public protection',
 26: 'Care providers and public protection support occupations and student monitors, crossing guards and related occupations',
 27: 'Professional occupations in art and culture',
 28: 'Technical occupations in art, culture and sport',
 29: 'Occupations in art, culture and sport',
 30: 'Support occupations in art, culture and sport',
 31: 'Retail sales and service supervisors and specialized occupations in sales and services',
 32: 'Occupations in sales and services',
 33: 'Sales and service representatives and other customer and personal services occupations',
 34: 'Sales and service support occupations',
 35: 'Technical trades and transportation officers and controllers',
 36: 'General trades',
 37: 'Mail and message distribution, other transport equipment operators and related maintenance workers',
 38: 'Helpers and labourers and other transport drivers, operators and labourers',
 39: 'Supervisors and occupations in natural resources, agriculture and related production',
 40: 'Workers and labourers in natural resources, agriculture and related production',
 41: 'Supervisors, central control and process operators in processing, manufacturing and utilities and aircraft assemblers and inspectors',
 42: 'Machine operators, assemblers and inspectors in processing, manufacturing and printing',
 43: 'Labourers in processing, manufacturing and utilities'})

In [131]:
df_desc['reason_of_absence,_full_week'] = df_desc['reason_of_absence,_full_week'].map({0: 'Other reasons',
 1: 'Own illness or disability',
 2: 'Personal or family responsibilities',
 3: 'Vacation'})

In [139]:
df_desc.sample(3)

Unnamed: 0,labour_force_status,province,nine_largest_cmas,five-year_age_group,"age_in_2_and_3_year_groups,_15_to_29",sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,identifies_if_a_person_has_worked_in_the_last_year,full-_or_part-time_status_of_last_job,"class_of_worker,_main_job",immigrant_status,industry_of_main_job,occupation_at_main_job_col1,occupation_at_main_job_col2,"reason_of_absence,_full_week",number_of_weeks_absent_from_work,"paid_for_time_off,_full-week_absence_only",usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full-_or_part-time_status_at_main_or_only_job,usual_hours_worked_per_week_at_all_jobs,actual_hours_worked_per_week_at_all_jobs,"hours_away_from_work,_part-week_absence_only",reason_for_part-week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part-time_work,job_tenure_with_current_employer,job_tenure_with_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_during_previous_year_,reason_for_leaving_job_during_previous_year_.1,duration_of_joblessness,availability_during_the_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
40734,"Employed, absent from work",46,0,50 to 54 years,,Male,Married,Bachelor's degree,Single,,,Public sector employees,Non-immigrant,Public administration,"Occupations in education, law and social, comm...",Professional occupations in government services,Own illness or disability,55.0,1.0,375.0,0.0,1.0,375.0,0.0,,,,,,,50.0,,3200.0,1.0,1.0,1.0,2.0,,,,,,,,,,,,,,,,,1.0,4,4.0,259
15382,"Employed, at work",24,0,35 to 39 years,,Male,Living in common-law,Postsecondary certificate or diploma,Single,,,Private sector employees,Non-immigrant,Transportation and warehousing,"Trades, transport and equipment operators and ...",General trades,,,,420.0,420.0,1.0,420.0,420.0,0.0,,0.0,0.0,0.0,,34.0,,2200.0,3.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,1.0,3,1.0,645
69846,Not in labour force,46,6,55 to 59 years,,Female,"Single, never married",High school graduate,,"Yes, more than 1 year ago",,,Non-immigrant,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,54.0,,,,,,,,,1.0,,1.0,17,4.0,365


In [141]:
df_desc['paid_for_time_off,_full-week_absence_only']=df_desc['paid_for_time_off,_full-week_absence_only'].map({1.:'Yes', 2.:'No'})

In [144]:
# There is something wrong here. I doubt the average work hours per week is 400. Maybe we should divide by ten.
round(df_desc['usual_hours_worked_per_week_at_main_job'].mean()), df_desc['usual_hours_worked_per_week_at_main_job'].median(), df_desc['usual_hours_worked_per_week_at_main_job'].min(), df_desc['usual_hours_worked_per_week_at_main_job'].max()

(356, 400.0, 1.0, 990.0)

In [156]:
df_desc['usual_hours_worked_per_week_at_main_job'] = df_desc['usual_hours_worked_per_week_at_main_job']/10

In [179]:
df_desc['usual_hours_worked_per_week_at_all_jobs'] = df_desc['usual_hours_worked_per_week_at_all_jobs']/10

In [159]:
round(df_desc['actual_hours_worked_per_week_at_all_jobs'].mean()), df_desc['actual_hours_worked_per_week_at_all_jobs'].median(), df_desc['actual_hours_worked_per_week_at_all_jobs'].max()

(334, 375.0, 990.0)

In [160]:
df_desc['actual_hours_worked_per_week_at_all_jobs'] = df_desc['actual_hours_worked_per_week_at_all_jobs']/10

In [166]:
df_desc['hours_away_from_work,_part-week_absence_only'].max(), df_desc['hours_away_from_work,_part-week_absence_only'].mean(), df_desc['hours_away_from_work,_part-week_absence_only'].median()

(720.0, 15.479212552382808, 0.0)

In [169]:
df_desc['hours_away_from_work,_part-week_absence_only'].replace(0, np.NaN).median(), df_desc['hours_away_from_work,_part-week_absence_only'].replace(0, np.NaN).mean()

(80.0, 106.29915673939232)

In [None]:
# divide these values by 10 again. 

In [170]:
df_desc['hours_away_from_work,_part-week_absence_only'] = df_desc['hours_away_from_work,_part-week_absence_only']/10

In [174]:
df_desc['reason_for_part-week_absence'] = df_desc['reason_for_part-week_absence'].map({0: 'Other reasons',
 1: 'Own illness or disability',
 2: 'Personal or family responsibilities',
 3: 'Vacation or civic holiday',
 4: 'Working short-time'})

In [183]:
df_desc.paid_overtime_hours_in_reference_week = df_desc.paid_overtime_hours_in_reference_week/10

In [184]:
df_desc.unpaid_overtime_hours_in_reference_week = df_desc.unpaid_overtime_hours_in_reference_week/10

In [186]:
df_desc['number_of_overtime_or_extra_hours_worked'] = df_desc['number_of_overtime_or_extra_hours_worked']/10 

In [190]:
df_desc['reason_for_part-time_work'] = df_desc['reason_for_part-time_work'].map({0: 'Other reasons',
 1: 'Own illness or disability',
 2: 'Caring for children',
 3: 'Other personal or family responsibilities',
 4: 'Going to school',
 5: 'Personal preference',
 6: 'Business conditions or could not find full-time work, looked for full-time work in last month',
 7: 'Business conditions or could not find full-time work, did not look for full-time work in last month'})

In [196]:
# These hourly wages seem very high. Maybe they are weekly or monthly wages? Look into this. 
df_desc['usual_hourly_wages'].mean(), df_desc['usual_hourly_wages'].median(), df_desc['usual_hourly_wages'].max() 

(3393.8444295569357, 2900.0, 20337.0)

In [200]:
df_desc.union_status=df_desc.union_status.map({1: 'Union member', 2: 'Not a member but covered by a union contract or collective agreement',
 3: 'Non-unionized'})

In [204]:
df_desc.job_permanency = df_desc.job_permanency.map({1: 'Permanent',2: 'Temporary, seasonal job',3: 'Temporary, term or contract job',
 4: 'Temporary, casual or other temporary jobs'})

In [207]:
df_desc.establishment_size = df_desc.establishment_size.map({1: 'Less than 20 employees',
 2: '20 to 99 employees',
 3: '100 to 500 employees',
 4: 'More than 500 employees'})

In [209]:
df_desc.firm_size=df_desc.firm_size.map({1: 'Less than 20 employees',
 2: '20 to 99 employees',
 3: '100 to 500 employees',
 4: 'More than 500 employees'})

In [230]:
df_desc.age_of_youngest_child=df_desc.age_of_youngest_child.map({1: 'Youngest child less than 6 years',
 2: 'Youngest child 6 to 12 years',
 3: 'Youngest child 13 to 17 years',
 4: 'Youngest child 18 to 24 years'})

In [231]:
df_desc.type_of_economic_family=df_desc.type_of_economic_family.map({1: 'Person not in an economic family',
 2: 'Dual-earner couple, no children or none under 25',
 3: 'Dual-earner couple, youngest child 0 to 17',
 4: 'Dual-earner couple, youngest child 18 to 24',
 5: 'Single-earner couple, male employed, no children or none under 25',
 6: 'Single-earner couple, male employed, youngest child 0 to 17',
 7: 'Single-earner couple, male employed, youngest child 18 to 24',
 8: 'Single-earner couple, female employed, no children or none under 25',
 9: 'Single-earner couple, female employed, youngest child 0 to 17',
 10: 'Single-earner couple, female employed, youngest child 18 to 24',
 11: 'Non-earner couple, no children or none under 25',
 12: 'Non-earner couple, youngest child 0 to 17',
 13: 'Non-earner couple, youngest child 18 to 24',
 14: 'Lone-parent family, parent employed, youngest child 0 to 17',
 15: 'Lone-parent family, parent employed, youngest child 18 to 24',
 16: 'Lone-parent family, parent not employed, youngest child 0 to 17',
 17: 'Lone-parent family, parent not employed, youngest child 18 to 24',
 18: 'Other families'})

In [226]:
df_desc.current_student_status=df_desc.current_student_status.map({1: 'Non-student', 2: 'Full-time student', 3: 'Part-time student'})

In [228]:
df_desc['temporary_layoff,_looked_for_work_during_the_last_four_weeks']=df_desc['temporary_layoff,_looked_for_work_during_the_last_four_weeks'].map({1:'Yes', 2:'No'})

In [235]:
df_desc['reason_for_not_looking_for_work_during_the_reference_week']=df_desc['reason_for_not_looking_for_work_during_the_reference_week'].map({0: 'Wanted work, reason - other',
 1: 'Wanted work, reason - own illness or disability',
 2: 'Wanted work, reason - caring for children',
 3: 'Wanted work, reason - other personal or family responsibilities',
 4: 'Wanted work, reason - school',
 5: 'Wanted work, reason - awaiting recall or reply',
 6: 'Wanted work, reason - discouraged'})

In [240]:
df_desc['main_activity_before_started_looking_for_work']=df_desc['main_activity_before_started_looking_for_work'].map({0: 'Other', 1: 'Working', 2: 'Managing a home', 
                                                              3: 'Going to school'})

In [242]:
#df['unemployed,_other_methods'].map({1:'Yes'}).fillna('No')
df['unemployed,_other_methods']=df['unemployed,_other_methods'].map({1:'Yes'})

In [244]:
df['unemployed,_other_methods']=df['unemployed,_other_methods'].map({'Yes':1})

In [245]:
df_desc.sample()

Unnamed: 0,labour_force_status,province,nine_largest_cmas,five-year_age_group,"age_in_2_and_3_year_groups,_15_to_29",sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,identifies_if_a_person_has_worked_in_the_last_year,full-_or_part-time_status_of_last_job,"class_of_worker,_main_job",immigrant_status,industry_of_main_job,occupation_at_main_job_col1,occupation_at_main_job_col2,"reason_of_absence,_full_week",number_of_weeks_absent_from_work,"paid_for_time_off,_full-week_absence_only",usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full-_or_part-time_status_at_main_or_only_job,usual_hours_worked_per_week_at_all_jobs,actual_hours_worked_per_week_at_all_jobs,"hours_away_from_work,_part-week_absence_only",reason_for_part-week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part-time_work,job_tenure_with_current_employer_months,job_tenure_with_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_during_previous_year_,reason_for_leaving_job_during_previous_year_.1,duration_of_joblessness,availability_during_the_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
96857,"Employed, at work",35,0,15 to 19 years,17 to 19 years,Female,"Single, never married",High school graduate,Single,,,Private sector employees,Non-immigrant,Accommodation and food services,"Sales and service occupations, except management",Retail sales and service supervisors and speci...,,,,14.0,140.0,2.0,14.0,14.0,0.0,,0.0,0.0,0.0,Going to school,27.0,,1600.0,Non-unionized,1.0,2.0,4.0,,,,,,,,,,,,,,,,,Full-time student,,,290


In [58]:
col_desc

[('rec_num', 'Order of record in file'),
 ('1-9999999', nan),
 ('survyear', 'Survey year'),
 ('1976-', nan),
 ('survmnth', 'Survey month'),
 ('01', 'January'),
 ('02', 'February'),
 ('03', 'March'),
 ('04', 'April'),
 ('05', 'May'),
 ('06', 'June'),
 ('07', 'July'),
 ('08', 'August'),
 ('09', 'September'),
 ('10', 'October'),
 ('11', 'November'),
 ('12', 'December'),
 (' lfsstat    ', 'Labour force status'),
 ('1', 'Employed, at work'),
 ('2', 'Employed, absent from work'),
 ('3', 'Unemployed'),
 ('4', 'Not in labour force'),
 ('prov', 'Province'),
 ('10', 'Newfoundland and Labrador'),
 ('11', 'Prince Edward Island'),
 ('12', 'Nova Scotia'),
 ('13', 'New Brunswick'),
 ('24', 'Quebec'),
 ('35', 'Ontario'),
 ('46', 'Manitoba'),
 ('47', 'Saskatchewan'),
 ('48', 'Alberta'),
 ('59', 'British Columbia'),
 ('cma', 'Nine largest CMAs'),
 ('1', 'Québec'),
 ('2', 'Montréal'),
 ('3', 'Ottawa\x96Gatineau (Ontario part)'),
 ('4', 'Toronto'),
 ('5', 'Hamilton'),
 ('6', 'Winnipeg'),
 ('7', 'Calgary')

In [236]:
dict1 = {}
pairs = [  ('0', 'Other'),
 ('1', 'Working'),
 ('2', 'Managing a home'),
 ('3', 'Going to school')]
for pair in pairs:
    dict1[int(pair[0])] = pair[1]
dict1

{0: 'Other', 1: 'Working', 2: 'Managing a home', 3: 'Going to school'}

In [135]:
# Probably answer to this question: ABS_Q162 / EQ 40 – 
# How many weeks had you been continuously absent from work, up to and including [refer to collection period]?

df_desc['number_of_weeks_absent_from_work'].mean(), df_desc['number_of_weeks_absent_from_work'].median(), df_desc['number_of_weeks_absent_from_work'].max()

(15.537783375314861, 4.0, 99.0)

In [137]:
df_desc[df_desc['number_of_weeks_absent_from_work']==99.0]

Unnamed: 0,labour_force_status,province,nine_largest_cmas,five-year_age_group,"age_in_2_and_3_year_groups,_15_to_29",sex,marital_status,highest_educational_attainment,single_or_multiple_jobholder,identifies_if_a_person_has_worked_in_the_last_year,full-_or_part-time_status_of_last_job,"class_of_worker,_main_job",immigrant_status,industry_of_main_job,occupation_at_main_job_col1,occupation_at_main_job_col2,"reason_of_absence,_full_week",number_of_weeks_absent_from_work,"paid_for_time_off,_full-week_absence_only",usual_hours_worked_per_week_at_main_job,actual_hours_worked_per_week_at_main_job,full-_or_part-time_status_at_main_or_only_job,usual_hours_worked_per_week_at_all_jobs,actual_hours_worked_per_week_at_all_jobs,"hours_away_from_work,_part-week_absence_only",reason_for_part-week_absence,paid_overtime_hours_in_reference_week,unpaid_overtime_hours_in_reference_week,number_of_overtime_or_extra_hours_worked,reason_for_part-time_work,job_tenure_with_current_employer,job_tenure_with_previous_employer,usual_hourly_wages,union_status,job_permanency,establishment_size,firm_size,duration_of_unemployment,flows_into_unemployment,job_seekers_by_type_of_work_sought_and_temporary_layoffs_by_work_status_of_last_job,reason_for_leaving_job_during_previous_year_,reason_for_leaving_job_during_previous_year_.1,duration_of_joblessness,availability_during_the_reference_week,"unemployed,_used_public_employment_agency","unemployed,_checked_with_employers_directly","unemployed,_checked_with_friends_or_relatives","unemployed,_looked_at_job_ads","unemployed,_placed_or_answered_ads","unemployed,_other_methods",main_activity_before_started_looking_for_work,reason_for_not_looking_for_work_during_the_reference_week,"temporary_layoff,_looked_for_work_during_the_last_four_weeks",current_student_status,type_of_economic_family,age_of_youngest_child,standard_final_weight
1028,"Employed, absent from work",47,0,65 to 69 years,,Male,Married,High school graduate,Single,,,Private sector employees,Non-immigrant,"Business, building and other support services","Sales and service occupations, except management",Sales and service representatives and other cu...,Own illness or disability,99.0,2.0,450.0,0.0,1.0,450.0,0.0,,,,,,,50.0,,1500.0,3.0,1.0,3.0,4.0,,,,,,,,,,,,,,,,,,5,,133
1062,"Employed, absent from work",35,0,50 to 54 years,,Female,Divorced,Postsecondary certificate or diploma,Single,,,Public sector employees,Non-immigrant,Public administration,"Business, finance and administration occupatio...",Administrative and financial support and suppl...,Own illness or disability,99.0,2.0,400.0,0.0,1.0,400.0,0.0,,,,,,,240.0,,4200.0,1.0,3.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,15,4.0,317
1694,"Employed, absent from work",46,0,55 to 59 years,,Female,Married,High school graduate,Single,,,Private sector employees,Non-immigrant,Finance and insurance,"Sales and service occupations, except management",Sales and service representatives and other cu...,Own illness or disability,99.0,2.0,400.0,0.0,1.0,400.0,0.0,,,,,,,240.0,,1700.0,3.0,1.0,1.0,4.0,,,,,,,,,,,,,,,,,1.0,4,4.0,125
4211,"Employed, absent from work",10,0,55 to 59 years,,Male,Married,Postsecondary certificate or diploma,Single,,,Public sector employees,Non-immigrant,Educational services,"Trades, transport and equipment operators and ...",Technical trades and transportation officers a...,Own illness or disability,99.0,2.0,400.0,0.0,1.0,400.0,0.0,,,,,,,104.0,,3500.0,1.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,2,,83
7098,"Employed, absent from work",35,0,60 to 64 years,,Male,Married,Postsecondary certificate or diploma,Single,,,Private sector employees,Non-immigrant,Manufacturing - durable goods,Natural and applied sciences and related occup...,Technical occupations related to natural and a...,Own illness or disability,99.0,1.0,400.0,0.0,1.0,400.0,0.0,,,,,,,240.0,,3846.0,3.0,1.0,3.0,3.0,,,,,,,,,,,,,,,,,1.0,5,,441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106305,"Employed, absent from work",48,8,50 to 54 years,,Female,Married,Postsecondary certificate or diploma,Single,,,Public sector employees,Non-immigrant,Health care and social assistance,"Health occupations, except management",Nursing and allied health professionals,Own illness or disability,99.0,2.0,525.0,0.0,1.0,525.0,0.0,,,,,,,240.0,,4934.0,1.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,4,4.0,521
106503,"Employed, absent from work",24,0,60 to 64 years,,Female,"Single, never married",High school graduate,Single,,,Private sector employees,Non-immigrant,"Information, culture and recreation","Sales and service occupations, except management",Sales and service representatives and other cu...,Own illness or disability,99.0,2.0,350.0,0.0,1.0,350.0,0.0,,,,,,,240.0,,2967.0,1.0,1.0,3.0,4.0,,,,,,,,,,,,,,,,,1.0,1,,208
106628,"Employed, absent from work",35,4,55 to 59 years,,Female,Married,Postsecondary certificate or diploma,Single,,,Private sector employees,Non-immigrant,Finance and insurance,"Business, finance and administration occupatio...",Administrative and financial supervisors and s...,Own illness or disability,99.0,2.0,120.0,0.0,2.0,120.0,0.0,,,,,,5.0,214.0,,6571.0,3.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,1.0,18,,554
107315,"Employed, absent from work",35,4,50 to 54 years,,Female,Divorced,Postsecondary certificate or diploma,Single,,,Private sector employees,Non-immigrant,Transportation and warehousing,"Sales and service occupations, except management",Sales and service representatives and other cu...,Own illness or disability,99.0,2.0,400.0,0.0,1.0,400.0,0.0,,,,,,,240.0,,2800.0,1.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,1.0,14,3.0,195
