In [7]:
import pandas as pd
import datetime as dt
import re

# Pattern: optional word characters, two consecutive digits, optional word
# characters. Presumably meant for the free-text `hours_shift` column -- TODO confirm.
work_hours = re.compile(r'\w*\d\d\w*')

# Wall-clock time of this run, rendered as a month/day/year string.
today = dt.datetime.today().strftime("%m/%d/%Y %H:%M:%S")

# Fetch the JSON resource and key the rows by agency name in one chained step.
NYC = pd.read_json('https://data.cityofnewyork.us/resource/swhp-yxa4.json').set_index('agency')

In [8]:
# Print the per-column summary (dtype and non-null count) for the raw frame.
NYC.info()
# Show the first three rows as this cell's displayed value.
NYC.head(n=3)

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, DEPARTMENT OF INVESTIGATION to ADMIN FOR CHILDREN'S SVCS
Data columns (total 24 columns):
__of_positions               1000 non-null int64
additional_information       1000 non-null object
business_title               1000 non-null object
civil_service_title          1000 non-null object
division_work_unit           1000 non-null object
hours_shift                  1000 non-null object
job_description              1000 non-null object
job_id                       1000 non-null int64
level                        1000 non-null object
minimum_qual_requirements    991 non-null object
post_until                   295 non-null object
posting_date                 1000 non-null object
posting_type                 1000 non-null object
posting_updated              1000 non-null object
preferred_skills             1000 non-null object
process_date                 1000 non-null object
residency_requirement        1000 non-null object
salar

Unnamed: 0_level_0,__of_positions,additional_information,business_title,civil_service_title,division_work_unit,hours_shift,job_description,job_id,level,minimum_qual_requirements,...,preferred_skills,process_date,residency_requirement,salary_frequency,salary_range_from,salary_range_to,title_code_no,to_apply,work_location,work_location_1
agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DEPARTMENT OF INVESTIGATION,1,Appointments are subject to Office of Manageme...,INSPECTOR GENERAL,INSPECTOR GENERAL,Default,,The New York City Department of Investigation ...,277531,M4,1. A baccalaureate degree from an accredited c...,...,"1.\tJuris Doctor, Certified Public Accountant ...",04/11/2017 00:00:00,There's no residency requirement for this posi...,Annual,115000,160000,31145,All current City Employees may apply by going ...,188 West 230 Street Bronx Ny,Bronx
DEPT OF ENVIRONMENT PROTECTION,1,DEP is an equal opportunity employer with a st...,Water Resources Analyst,CITY RESEARCH SCIENTIST,BEPA/MS4-Muni Sepa Strm Sew Sy,,The NYC Department of Environmental Protection...,282497,02,"1. For Assignment Level I (only physical, bio...",...,The ideal candidate will have demonstrated exp...,04/11/2017 00:00:00,New York City residency is generally required ...,Annual,70286,88213,21744,To apply click ''Apply Now'',59-17 Junction Blvd Corona Ny,
DEPARTMENT OF BUSINESS SERV.,1,,"PROGRAM COORDINATOR, VENDOR SERVICES/PTAC",BUSINESS PROMOTION COORDINATOR,Defo Vendor Services,,Vendor Services / Procurement Technical Assist...,283007,01,1. A masters degree from an accredited college...,...,"â€¢Advanced proficiency using MS Word, MS Exce...",04/11/2017 00:00:00,New York City residency is generally required ...,Annual,39399,50000,60860,Please email your resume and cover letter incl...,110 William St. N Y,


# Cleaning the Data.

In [9]:
# Taking care of nulls.
# The filled column is assigned back instead of calling
# `NYC[col].fillna(..., inplace=True)`: the in-place form operates on an
# intermediate Series, which warns on current pandas and leaves `NYC`
# untouched when Copy-on-Write is enabled.
NYC['minimum_qual_requirements'] = NYC['minimum_qual_requirements'].fillna('Not Applicable')

# `post_until` is parsed first and only then are its gaps filled with a real
# Timestamp built from `today`. This way the parser never sees a column that
# mixes the source's date strings with the locally formatted `today` string.
today_ts = pd.to_datetime(today, format="%m/%d/%Y %H:%M:%S")
NYC['post_until'] = pd.to_datetime(NYC['post_until']).fillna(today_ts)

# The remaining date columns have no missing values to fill; just parse them.
for date_col in ('posting_date', 'process_date'):
    NYC[date_col] = pd.to_datetime(NYC[date_col])


# Some columns still contain "empty" values that are not NULL: the cell holds
# a single space (' '), so pandas counts it as non-null. The third row of
# `additional_information` is one example (inspect it by position with
# `NYC['additional_information'].iloc[2]`; a bare `[2]` on this string-labelled
# index relies on a deprecated positional fallback).
#
# One boolean mask per affected column, built BEFORE anything is replaced.
# Counts noted by the original author: additional_information 432,
# hours_shift 684, preferred_skills 118, work_location_1 520.
add_info_mask = NYC['additional_information'] == ' '
hours_shift_mask = NYC['hours_shift'] == ' '
pref_skill_mask = NYC['preferred_skills'] == ' '
work_loc_1_mask = NYC['work_location_1'] == ' '
mask = [add_info_mask, hours_shift_mask, pref_skill_mask, work_loc_1_mask]

# Replace the blanks that a viewer reads as missing but pandas does not.
# Only cells that are exactly ' ' are rewritten; every other value is kept.
for blank_col in ('additional_information', 'hours_shift',
                  'preferred_skills', 'work_location_1'):
    NYC[blank_col] = NYC[blank_col].replace(to_replace=' ', value='Not Available.')

# Survey how many distinct values each column holds, to spot low-cardinality
# columns worth storing as `category` dtype.
unique = {column: NYC[column].nunique() for column in NYC.columns}
#print(unique)

# Convert the columns picked from that survey. 'agency' is not in the list:
# it was turned into the index when the data was loaded, so it is no longer
# a column.
for category_col in ('level', 'posting_type', 'salary_frequency'):
    NYC[category_col] = NYC[category_col].astype('category')

# Order the rows by their index labels.
NYC = NYC.sort_index()

# Author's measurement after the dtype changes:
# memory usage shrunk from 203.1 KB -> 175 KB, 14%
NYC.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, ADMIN FOR CHILDREN'S SVCS to TEACHERS RETIREMENT SYSTEM
Data columns (total 24 columns):
__of_positions               1000 non-null int64
additional_information       1000 non-null object
business_title               1000 non-null object
civil_service_title          1000 non-null object
division_work_unit           1000 non-null object
hours_shift                  1000 non-null object
job_description              1000 non-null object
job_id                       1000 non-null int64
level                        1000 non-null category
minimum_qual_requirements    1000 non-null object
post_until                   1000 non-null datetime64[ns]
posting_date                 1000 non-null datetime64[ns]
posting_type                 1000 non-null category
posting_updated              1000 non-null object
preferred_skills             1000 non-null object
process_date                 1000 non-null datetime64[ns]
residency_requirement     

# Data Analysis

In [4]:
# Preview the first five rows of the cleaned, index-sorted frame.
NYC.head(n=5)

Unnamed: 0_level_0,__of_positions,additional_information,business_title,civil_service_title,division_work_unit,hours_shift,job_description,job_id,level,minimum_qual_requirements,...,preferred_skills,process_date,residency_requirement,salary_frequency,salary_range_from,salary_range_to,title_code_no,to_apply,work_location,work_location_1
agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ADMIN FOR CHILDREN'S SVCS,1,Section 424-A of the New York Social Services ...,Agency Medical Director,CITY MEDICAL DIRECTOR,Office Of Chld & Fmly Hlth-Fss,Not Available.,The Office of Child and Family Health is seeki...,269622,M7,1. Possession of a valid license to practice m...,...,The preferred candidate should possess the fol...,2017-04-11,New York City Residency is not required for th...,Annual,99353,200000,53047,Click the ''Apply Now'' Button.,"150 William Street, New York N",Not Available.
ADMIN FOR CHILDREN'S SVCS,4,Section 424-A of the New York Social Services ...,Case Analyst,CHILD WELFARE SPECIALIST SUPER,Quality Improvement (Qa),Not Available.,The 4 Case Analysts will be responsible for ev...,272617,02,1. A baccalaureate degree from an accredited c...,...,The preferred candidate should possess the fol...,2017-04-11,New York City Residency is not required for th...,Annual,75520,76674,52370,Click on the ''Apply Now'' Button.,"150 William Street, New York N",Not Available.
ADMIN FOR CHILDREN'S SVCS,5,Section 424-A of the New York Social Services ...,Operations Manager,DEPUTY SUPERINTENDENT (JUVENIL,Horizon Juvenile Center,Not Available.,The Operations Manager is responsible for the ...,261759,M1,Not Applicable,...,The preferred candidate should possess the fol...,2017-04-11,New York City residency is generally required ...,Annual,73555,73555,51587,Click on the ''Apply Now'' button.,560 Brook Avenue Bronx New Yor,"560 Brook Avenue, Bronx 17 Bristol Street, Bro..."
ADMIN FOR CHILDREN'S SVCS,1,Section 424-A of the New York Social Services ...,Audit Nurse,SUPERVISOR OF NURSES,Office Of Chld & Fmly Hlth-Fss,Not Available.,The Administration for Childrenâ€™s Services (...,239558,01,A valid New York State License and current reg...,...,A valid New York State License and current reg...,2017-04-11,New York City residency is generally required ...,Annual,76597,76597,50960,Click on the ''Apply Now'' Button.,"492 First Avenue, New York, Ny",Not Available.
ADMIN FOR CHILDREN'S SVCS,1,Section 424-A of the New York Social Services ...,Pre-Placement Staff Nurse,STAFF NURSE,Office Of Chld & Fmly Hlth-Fss,Not Available.,The Administration for Childrenâ€™s Services (...,265912,00,1. A valid New York State Registered Nurse Lic...,...,The preferred candidate should possess the fol...,2017-04-11,New York City residency is generally required ...,Annual,71669,71669,50910,Click on the ''Apply Now'' Button,"492 First Avenue, New York, Ny",Not Available.
