## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Let pandas print up to 100 rows before truncating a displayed frame/series.
pd.options.display.max_rows = 100

## Read data

In [3]:
# Load the job-postings CSV (path is relative to the working directory)
# and preview the first five rows.
data = pd.read_csv('monster_com-job_sample.csv')
data.head()

Unnamed: 0,country,country_code,date_added,has_expired,job_board,job_description,job_title,job_type,location,organization,page_url,salary,sector,uniq_id
0,United States of America,US,,No,jobs.monster.com,TeamSoft is seeing an IT Support Specialist to...,IT Support Technician Job in Madison,Full Time Employee,"Madison, WI 53702",,http://jobview.monster.com/it-support-technici...,,IT/Software Development,11d599f229a80023d2f40e7c52cd941e
1,United States of America,US,,No,jobs.monster.com,The Wisconsin State Journal is seeking a flexi...,Business Reporter/Editor Job in Madison,Full Time,"Madison, WI 53708",Printing and Publishing,http://jobview.monster.com/business-reporter-e...,,,e4cbb126dabf22159aff90223243ff2a
2,United States of America,US,,No,jobs.monster.com,Report this job About the Job DePuy Synthes Co...,Johnson & Johnson Family of Companies Job Appl...,"Full Time, Employee",DePuy Synthes Companies is a member of Johnson...,Personal and Household Services,http://jobview.monster.com/senior-training-lea...,,,839106b353877fa3d896ffb9c1fe01c0
3,United States of America,US,,No,jobs.monster.com,Why Join Altec? If you’re considering a career...,Engineer - Quality Job in Dixon,Full Time,"Dixon, CA",Altec Industries,http://jobview.monster.com/engineer-quality-jo...,,Experienced (Non-Manager),58435fcab804439efdcaa7ecca0fd783
4,United States of America,US,,No,jobs.monster.com,Position ID# 76162 # Positions 1 State CT C...,Shift Supervisor - Part-Time Job in Camphill,Full Time Employee,"Camphill, PA",Retail,http://jobview.monster.com/shift-supervisor-pa...,,Project/Program Management,64d0272dc8496abfd9523a8df63c184c


In [4]:
# Work on an independent (deep) copy so the loaded frame `data` stays untouched.
df = data.copy(deep=True)

In [5]:
# Print per-column non-null counts, dtypes and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   country          22000 non-null  object
 1   country_code     22000 non-null  object
 2   date_added       122 non-null    object
 3   has_expired      22000 non-null  object
 4   job_board        22000 non-null  object
 5   job_description  22000 non-null  object
 6   job_title        22000 non-null  object
 7   job_type         20372 non-null  object
 8   location         22000 non-null  object
 9   organization     15133 non-null  object
 10  page_url         22000 non-null  object
 11  salary           3446 non-null   object
 12  sector           16806 non-null  object
 13  uniq_id          22000 non-null  object
dtypes: object(14)
memory usage: 2.3+ MB


In [6]:
# (number of rows, number of columns) of the working frame.
df.shape

(22000, 14)

## Check for null values 

In [7]:
# Count missing values in every column.
df.isna().sum()

country                0
country_code           0
date_added         21878
has_expired            0
job_board              0
job_description        0
job_title              0
job_type            1628
location               0
organization        6867
page_url               0
salary             18554
sector              5194
uniq_id                0
dtype: int64

## Drop columns that are not important

In [8]:
# Discard the columns judged unimportant for this analysis.
df = df.drop(columns=['has_expired', 'uniq_id', 'job_board'])

## country column

In [9]:
# List the distinct values stored in `country`.
df['country'].unique()

array(['United States of America'], dtype=object)

In [10]:
# List the distinct values stored in `country_code`.
df['country_code'].unique()

array(['US'], dtype=object)

In [11]:
# `country` holds a single constant value, so it carries no information.
df = df.drop(columns='country')

In [12]:
# `country_code` holds a single constant value, so it carries no information.
df = df.drop(columns='country_code')

## date_added column

In [13]:
# Number of postings with no `date_added` value.
df['date_added'].isna().sum()

21878

In [14]:
# Show only the postings that do carry a `date_added` value.
df.dropna(subset=['date_added'])

Unnamed: 0,date_added,job_description,job_title,job_type,location,organization,page_url,salary,sector
133,5/10/2016,"#TrackingJobBody table, #TrackingJobBody a {<b...",Multibed Technician Job in Deer Park,Full Time Employee,"Deer Park, TX",Other/Not Classified,http://jobview.monster.com/Multibed-Technician...,,Other
140,5/13/2016,Equal Opportunity Employer: Minority/Female/Di...,Principal Cyber Security Engineer Job in Houston,Full Time Employee,"Houston, TX",Computer SoftwareComputer/IT Services,http://jobview.monster.com/Principal-Cyber-Sec...,,IT/Software Development
251,5/9/2016,"#TrackingJobBody table, #TrackingJobBody a {<b...",Field Supervisor IS Job in Deer Park,Full Time Employee,"Deer Park, TX",Other/Not Classified,http://jobview.monster.com/Field-Supervisor-IS...,,Other
279,6/10/2016,"At American Family Insurance, we're firmly com...",Insurance Sales - Customer Service Job in Eden...,Full Time Employee,"Eden Prairie, MN 55344",Insurance,http://jobview.monster.com/insurance-sales-cus...,15.00 - 21.00 $ /hour,Accounting/Finance/Insurance
366,1/2/2017,Description The Opportunity The Vehicle Mainte...,Vehicle Maintenance Mechanic - Las Vegas,Full Time Employee,"Las Vegas, NV",Energy and Utilities,http://jobview.monster.com/vehicle-maintenance...,,Installation/Maintenance/Repair
...,...,...,...,...,...,...,...,...,...
20760,9/27/2016,"#TrackingJobBody table, #TrackingJobBody a {<b...",Central Maintenance Planner Job in Norwell,Full Time Employee,"Norwell, MA",Other/Not Classified,http://jobview.monster.com/central-maintenance...,,Administrative/Clerical
21342,3/30/2016,"#TrackingJobBody table, #TrackingJobBody a {<b...",Branch Manager Job in Cincinnati,Full Time Employee,"Cincinnati, OH",Other/Not Classified,http://jobview.monster.com/Branch-Manager-Job-...,,Other
21391,3/24/2016,"#TrackingJobBody table, #TrackingJobBody a {<b...",Field Service Driver Job in Cincinnati,Full Time Employee,"Cincinnati, OH",Other/Not Classified,http://jobview.monster.com/Field-Service-Drive...,,Logistics/Transportation
21631,4/4/2016,"#TrackingJobBody table, #TrackingJobBody a {<b...",Field Project Manager Job in Cincinnati,Full Time Employee,"Cincinnati, OH",Other/Not Classified,http://jobview.monster.com/Field-Project-Manag...,,Other


In [15]:
# `date_added` is missing for almost every row, so remove the column.
df = df.drop(columns='date_added')

## job_type column

In [16]:
# List the distinct raw `job_type` strings to see how inconsistently they are written.
df['job_type'].unique()

array(['Full Time Employee', 'Full Time', 'Full Time, Employee',
       'Part Time Employee', nan, 'Full Time Temporary/Contract/Project',
       'Full Time , Employee', 'Full Time, Temporary/Contract/Project',
       'Employee', 'Part Time', 'Part Time, Employee', 'Full Time Intern',
       'Temporary/Contract/Project', 'Full Time / Employee',
       'Full Time , Temporary/Contract/Project',
       'Part Time, Temporary/Contract/Project', 'Full Time/ Employee',
       'Per Diem, Employee', 'Job Type Full Time Employee', 'Per Diem',
       'Full Time\xa0', 'Part Time Intern', 'Per Diem Employee',
       'Part Time/ Temporary/Contract/Project',
       'Part Time Temporary/Contract/Project', 'Exempt',
       'Part Time , Temporary/Contract/Project', 'Full Time\xa0 Employee',
       'Part Time Seasonal', 'Part Time , Employee', 'Job Type Employee',
       'Job Type Full Time Temporary/Contract/Project',
       'Full Time / > Employee', 'Part Time\xa0',
       'Per Diem, Temporary/Contract

In [17]:
# Number of postings with no `job_type` value.
df['job_type'].isna().sum()

1628

In [18]:
# Keep the postings whose `job_type` is missing in their own frame and display it.
job_type_missing = df['job_type'].isna()
df_null_job_type = df.loc[job_type_missing]
df_null_job_type

Unnamed: 0,job_description,job_title,job_type,location,organization,page_url,salary,sector
9,"Insituform Technologies, LLC, an Aegion compan...",Video Data Management /Transportation Technici...,,"Chesterfield, MO",,http://jobview.monster.com/video-data-manageme...,,
16,Airfields. Roads. Dams. Buildings. Name the pr...,Horizontal Construction Engineers Job in Wades...,,Wadesboro 28170,,http://jobview.monster.com/Horizontal-Construc...,,Civil & Structural EngineeringGeneral/Other: E...
22,We're looking for team players to provide cons...,Combat Engineer - Construction and Engineering...,,La Porte 46350,,http://jobview.monster.com/Combat-Engineer-Con...,,General/Other: Engineering
26,Our business sales professionals are elite sel...,AT&T Business Sales Leadership Development Pro...,,"Denver, CO",,http://jobview.monster.com/AT-T-Business-Sales...,,
44,We're looking for team players to provide cons...,Combat Engineer - Construction and Engineering...,,"Bradford, Vt 05033",,http://jobview.monster.com/Combat-Engineer-Con...,,General/Other: Engineering
...,...,...,...,...,...,...,...,...
21832,The Powered by Zip Team at Coldwell Banker Wes...,Real Estate Sales - Licensed - Leads Provided ...,,"Cincinnati, OH 45202",Real Estate/Property Management,http://jobview.monster.com/Real-Estate-Sales-L...,,Sales/Retail/Business Development
21835,You're Invited! AT&T Retail Hiring Event Event...,Hiring Event Retail Sales April th Cincinnati ...,,"Cincinnati, OH",,http://jobview.monster.com/Hiring-Event-Retail...,,
21859,"Medical Diagnostic Laboratories, LLC is a CLIA...",Part Time Field Specimen Technician Job in Cin...,,"Cincinnati, OH",Biotechnology/Pharmaceuticals,http://jobview.monster.com/Part-Time-Field-Spe...,,Logistics/Transportation
21870,One-Time Bonus You will receive a one-time bon...,Retail Sales Consultant Cincinnati OH (Cincinn...,,"Cincinnati, OH",,http://jobview.monster.com/Retail-Sales-Consul...,,


### extract shift from job_type (full time, part time, per diem)

In [19]:
# Pull the work-schedule phrase (full time / part time / per diem) out of the
# free-text `job_type`. Case-insensitivity is passed via `flags=` rather than
# inline `(?i)` groups in the middle of the pattern: Python 3.11+ rejects global
# inline flags that are not at the very start of the expression.
# `expand=False` returns a Series (one capture group) instead of a one-column frame.
df['job_type_shift'] = df['job_type'].str.extract(
    r'(full[ -]time|part[ -]time|per[ -]diem)',
    flags=re.IGNORECASE,
    expand=False,
)

In [20]:
# Rename the "per diem" schedule to the plainer "By the day".
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `df['job_type_shift']` is chained assignment, which warns on recent pandas
# and leaves `df` unchanged when Copy-on-Write is enabled.
df['job_type_shift'] = df['job_type_shift'].replace(r'(?i)per[ -]diem', 'By the day', regex=True)

In [21]:
# Mark rows where no schedule phrase was found with the literal string 'NA'.
# Assign back instead of `inplace=True` on an attribute-accessed column, which is
# chained assignment and does not update `df` under pandas Copy-on-Write.
df['job_type_shift'] = df['job_type_shift'].fillna('NA')

In [22]:
# Spot-check ten random rows: raw `job_type` next to the extracted schedule.
df[['job_type', 'job_type_shift']].sample(10)

Unnamed: 0,job_type,job_type_shift
13010,Full Time,Full Time
11363,Full Time,Full Time
2512,Full Time Employee,Full Time
18709,"Full Time, Employee",Full Time
13304,"Full Time, Employee",Full Time
6922,Full Time Temporary/Contract/Project,Full Time
19321,Full Time Employee,Full Time
20803,"Full Time, Employee",Full Time
8123,"Per Diem, Employee",By the day
1324,"Full Time, Temporary/Contract/Project",Full Time


### Extract type from job_type (employee, intern, seasonal, temporary, exempt)

In [23]:
# Pull the employment-type keyword out of the free-text `job_type`.
# Case-insensitivity is passed via `flags=` because inline `(?i)` groups in the
# middle of a pattern raise `re.error` on Python 3.11+.
# `expand=False` returns a Series (one capture group) instead of a one-column frame.
df['job_type_part'] = df['job_type'].str.extract(
    r'(employee|intern|seasonal|temporary|exempt)',
    flags=re.IGNORECASE,
    expand=False,
)

In [24]:
# Mark rows where no employment-type keyword was found with the literal string 'NA'.
# Assign back instead of `inplace=True` on an attribute-accessed column, which is
# chained assignment and does not update `df` under pandas Copy-on-Write.
df['job_type_part'] = df['job_type_part'].fillna('NA')

In [25]:
# Spot-check ten random rows: raw `job_type` next to the extracted employment type.
df[['job_type', 'job_type_part']].sample(10)

Unnamed: 0,job_type,job_type_part
3003,,
10322,Full Time,
6441,Full Time Employee,Employee
7943,Full Time,
11865,Full Time Employee,Employee
19850,Full Time,
21432,Full Time,
6284,Full Time Employee,Employee
2050,Full Time,
21433,Full Time,


In [26]:
# Spot-check ten random rows with the raw value and both derived columns side by side.
df[['job_type', 'job_type_part', 'job_type_shift']].sample(10)

Unnamed: 0,job_type,job_type_part,job_type_shift
14716,Full Time Employee,Employee,Full Time
19809,"Full Time, Employee",Employee,Full Time
15757,,,
76,Full Time,,Full Time
12095,,,
10379,Full Time Employee,Employee,Full Time
5611,Full Time,,Full Time
372,"Full Time , Employee",Employee,Full Time
5572,Full Time,,Full Time
12814,Full Time Employee,Employee,Full Time


## job_title column

In [27]:
# Strip boilerplate from the title:
#   * everything from "Job in ..." to the end of the string, or
#   * everything up to and including "Job Application for".
# `regex=True` is explicit because pandas 2.0 changed the default of
# `Series.str.replace` to literal matching; the case-insensitive flag is passed via
# `flags=` because a `(?i)` group in the middle of a pattern raises on Python 3.11+.
# The redundant lazy capture groups of the earlier pattern are dropped; the matched
# text is the same.
df['job_title_name'] = df['job_title'].str.replace(
    r'Job in [\w\W]*|.*?Job Application for',
    '',
    regex=True,
    flags=re.IGNORECASE,
)

  df['job_title_name'] = df.job_title.str.replace(r'(?i)Job in (.*?)[\w\W]*|(.*?)(?i)Job Application for', "")


In [28]:
# Spot-check ten random cleaned titles.
df['job_title_name'].sample(10)

7285                                    Forklift Operator 
21630                Restaurant Manager / Kitchen Manager 
7364     Field Service Engineer - PLC/CAD/Electro-mecha...
6611     B Combat Engineer - Construction and Engineeri...
16545                                      Infant Teacher 
16108                                     Data Specialist 
2959             Repair and Maintenance Technician / Lead 
3170                               Health Care Specialist 
2306                                  Accounts Receivable 
1204                                      Product Manager 
Name: job_title_name, dtype: object

## location column

In [29]:
# Look at ten random raw `location` values to see which formats occur.
df.location.sample(10)

195                                      Rhinelander 54501
13451    When you choose Toll Brothers, you enjoy all t...
1529                                   San Diego, CA 92126
11653    91L Construction Vehicle Repairer Job ID: 7880...
16535                                    Coppell, TX 75019
1089                                      Denver, CO 80204
17667                                     Dallas, TX 75201
12988                                        Davenport, IA
21971                                     West Chester, OH
18723                                  Pawtucket, RI 02861
Name: location, dtype: object

In [30]:
# Recover the "<city>-<state>" slug from the posting URL: the text between
# "-job-" and "-us-<digit>". Rows whose URL does not match get NaN.
# Case-insensitivity is passed via `flags=` because inline `(?i)` groups that are
# not at the start of the pattern raise `re.error` on Python 3.11+.
# `expand=False` returns a Series (one capture group) instead of a one-column frame.
df['loc_'] = df['page_url'].str.extract(
    r'-job-(.*?)-us-\d',
    flags=re.IGNORECASE,
    expand=False,
)

In [31]:
# Rebuild "<city>, <ST>" from the URL slug: the last two characters are taken as the
# state code and everything before the preceding separator character as the city.
slug = df['loc_']
city_part = slug.str.slice(stop=-3)
state_part = slug.str.slice(start=-2)
df['location_extract'] = city_part + ', ' + state_part

In [32]:
# Spot-check ten random rebuilt locations (NaN where the URL pattern did not match).
df['location_extract'].sample(10)

20482        dallas, tx
9004         Belmar, NJ
2691                NaN
21335    Cincinnati, OH
2106        houston, tx
8538        houston, tx
6869      chantilly, va
17987        Dallas, TX
2924        Spokane, WA
17862        Dallas, TX
Name: location_extract, dtype: object

## organization column

In [33]:
# Inspect the row labelled 29; in the displayed output its `organization` field holds
# a "City, ST" string, which motivates the clean-up of that column below.
df.loc[29]

job_description     Experis is working with a Pharmaceutical start...
job_title                                        Sr. Process Engineer
job_type                                           Full Time Employee
location                          Sr. Process Engineer, Manufacturing
organization                                              Chicago, IL
page_url            http://jobview.monster.com/Sr-Process-Engineer...
salary                                 70,000.00 - 100,000.00 $ /year
sector                                                    Engineering
job_type_shift                                              Full Time
job_type_part                                                Employee
job_title_name                                   Sr. Process Engineer
loc_                                                       Chicago-IL
location_extract                                          Chicago, IL
Name: 29, dtype: object

In [34]:
# Number of distinct `organization` values, counting missing as one value.
df['organization'].nunique(dropna=False)

739

In [35]:
# List the distinct `organization` values.
df['organization'].unique()

array([nan, 'Printing and Publishing', 'Personal and Household Services',
       'Altec Industries', 'Retail', 'Computer/IT Services',
       'Computer Software',
       'Hotels and Lodging Personal and Household Services', 'Insurance',
       'Business Services - Other', 'Education',
       'Construction - Industrial Facilities and InfrastructureConstruction - Residential & Commercial/Office',
       'Accounting and Auditing Services', 'Legal Services',
       'Construction - Residential & Commercial/Office',
       'Engineering Services', 'AllComputer SoftwareComputer/IT Services',
       'Healthcare Services', 'Chicago, IL', 'Manufacturing - Other',
       'Oklahoma City, OK', 'Aerospace and Defense', 'San Francisco, CA',
       'Advertising and PR ServicesManagement Consulting ServicesBusiness Services - Other',
       'Other/Not Classified',
       'RetailAdvertising and PR ServicesBusiness Services - Other',
       'All', 'Electronics, Components, and Semiconductor Mfg',
       '

In [36]:
# Collect the index labels of rows whose `organization` contains a "City, ST[zip]"
# style string. Missing organizations are treated as non-matching.
location_like = r'[\w\/ ]*\w*, \w{2}[\d]*$|[\w\/ ]*\w*, \w{2}[\d]*[^\w]'
looks_like_location = df['organization'].str.contains(location_like, na=False)
ind = list(df.index[looks_like_location])

In [37]:
# How many rows were flagged.
len(ind)

300

In [38]:
# Spot-check ten random flagged `organization` values.
df.loc[ind, 'organization'].sample(10)

1095           Richardson, TX 75082
10410          Northbrook, IL 60062
13266                   Redmond, WA
12156                   Redmond, WA
10743                    Albany, NY
14104            Carlsbad, CA 92008
8767                     Austin, TX
5015             Columbus, OH 43085
8935     City Of Industry, CA 91748
16856           Boys Town, NE 68154
Name: organization, dtype: object

In [39]:
# Overwrite the flagged location-like organization values with the placeholder 'Other'.
df.loc[ind, 'organization'] = 'Other'

In [40]:
# Index labels of rows whose `organization` starts with "Other/Not Classified"
# (case-insensitive). Missing organizations are treated as non-matching.
is_not_classified = df['organization'].str.match(r'(?i)other\/not classified', na=False)
ind = list(df.index[is_not_classified])

In [41]:
# Fold the rows selected in `ind` into the same 'Other' placeholder.
df.loc[ind, 'organization'] = 'Other'

In [42]:
# Replace missing organizations with the literal string 'NA'.
# Assign back instead of `inplace=True` on an attribute-accessed column, which is
# chained assignment and does not update `df` under pandas Copy-on-Write.
df['organization'] = df['organization'].fillna('NA')

In [43]:
# Number of distinct `organization` values after the clean-up.
df['organization'].nunique(dropna=False)

542

In [44]:
# Spot-check ten random cleaned `organization` values.
df.organization.sample(10)

19451                                                   NA
19634    Restaurant/Food ServicesFood and Beverage Prod...
13505                              Government and Military
12233                    Transport and Storage - Materials
21174                                 Engineering Services
12101                                                   NA
11184                                  Healthcare Services
20685                                                   NA
11601                      Real Estate/Property Management
12948                                                   NA
Name: organization, dtype: object

## sector column

In [45]:
# List the distinct `sector` values.
df['sector'].unique()

array(['IT/Software Development', nan, 'Experienced (Non-Manager)',
       'Project/Program Management', 'Customer Support/Client Care',
       'Entry Level', 'Building Construction/Skilled Trades',
       'Civil & Structural EngineeringGeneral/Other: Engineering',
       'Installation/Maintenance/Repair', 'Business/Strategic Management',
       'Accounting/Finance/Insurance', 'General/Other: Engineering',
       'Engineering', 'Editorial/Writing', 'Medical/Health',
       'Marketing/Product', 'Manager (Manager/Supervisor of Staff)',
       'Administrative/Clerical', 'Student (Undergraduate/Graduate)',
       'Biotech/R&D/Science', 'Logistics/Transportation',
       'General/Other: Customer Support/Client Care',
       'Sales/Retail/Business Development', 'Education/Training', 'Other',
       'General/Other: Installation/Maintenance/RepairVehicle Repair and Maintenance',
       'General/Other: IT/Software Development',
       'Brand/Product MarketingGeneral/Other: Marketing/ProductProd

In [46]:
# Number of postings with no `sector` value.
df['sector'].isna().sum()

5194

In [47]:
# Number of distinct `sector` values, counting missing as one value.
df['sector'].nunique(dropna=False)

164

In [48]:
# Index labels of rows whose `sector` text is longer than 100 characters.
# Values are stringified first, so a missing sector counts as the 3-character 'nan'.
sector_length = df['sector'].astype(str).str.len()
ind = list(df.index[sector_length > 100])

In [49]:
# Collapse the over-long sector strings selected in `ind` into 'Other'.
df.loc[ind, 'sector'] = 'Other'

## Cleaned Data

In [50]:
# Show ten random rows of the cleaned frame, including all derived columns.
df.sample(10)

Unnamed: 0,job_description,job_title,job_type,location,organization,page_url,salary,sector,job_type_shift,job_type_part,job_title_name,loc_,location_extract
4206,UPS is hiring individuals to work as Full-Time...,UPS Automotive Mechanic Job in Lufkin,,"Lufkin, TX 75901",Transport and Storage - Materials,http://jobview.monster.com/ups-automotive-mech...,,Installation/Maintenance/Repair,,,UPS Automotive Mechanic,lufkin-tx,"lufkin, tx"
12038,Summary Provide analytic support to align comp...,Senior Compensation Analyst Job in Detroit,Full Time Employee,"Detroit, MI 48226",Insurance,http://jobview.monster.com/Senior-Compensation...,,Human Resources,Full Time,Employee,Senior Compensation Analyst,Detroit-MI,"Detroit, MI"
17385,Position BenefitsCompetitive compensation pack...,Sales Account Management - Full Time - Immedia...,Full Time Employee,"Dallas, TX 75207",RetailAdvertising and PR ServicesOther/Not Cla...,http://jobview.monster.com/Sales-Account-Manag...,,Sales/Retail/Business Development,Full Time,Employee,Sales Account Management - Full Time - Immedia...,Dallas-TX,"Dallas, TX"
5900,".ParalegalReal Estate LawHuntington, WV Adecco...",Paralegal Job in Huntington,Employee,"Huntington, WV 25726",Legal Services,http://jobview.monster.com/Paralegal-Job-Hunti...,,Experienced (Non-Manager),,Employee,Paralegal,Huntington-WV,"Huntington, WV"
21475,We are looking to recruit a dedicated individu...,Medical Insurance Verification Specialist Job ...,Full Time Temporary/Contract/Project,"Chicago, IL 60603",,http://jobview.monster.com/Medical-Insurance-V...,,Customer Support/Client Care,Full Time,Temporary,Medical Insurance Verification Specialist,Chicago-IL,"Chicago, IL"
12508,"Automated Packaging Systems, Inc., headquarter...",AirPouch Field Service Technician Job in Cinci...,Full Time Employee,"Cincinnati, OH 45205",Manufacturing - Other,http://jobview.monster.com/airpouch-field-serv...,,Installation/Maintenance/Repair,Full Time,Employee,AirPouch Field Service Technician,cincinnati-oh,"cincinnati, oh"
3815,Sr Research Engineer-POS_70021777-0001JCDescri...,Sr Research Engineer Job in Framingham,,"Framingham, MA",Biotechnology/Pharmaceuticals,http://jobview.monster.com/Sr-Research-Enginee...,,Medical/Health,,,Sr Research Engineer,Framingham-MA,"Framingham, MA"
21932,Accounts Receivable Reconciliation Specialist ...,Accounts Receivable Reconciliation Specialist ...,Full Time Temporary/Contract/Project,"Cincinnati, OH 45251",,http://jobview.monster.com/Accounts-Receivable...,,Accounting/Finance/Insurance,Full Time,Temporary,Accounts Receivable Reconciliation Specialist ...,Job-Cincinnati-OH,"Job-Cincinnati, OH"
19818,"MMC Central BuildingSioux City, IACardiovascul...",ARNP/PA Cardiothoracic/Vascular Surgery Job in...,Full Time Employee,"Sioux City, IA",Healthcare Services,http://jobview.monster.com/ARNP-PA-Cardiothora...,,Medical/Health,Full Time,Employee,ARNP/PA Cardiothoracic/Vascular Surgery,Sioux-City-IA,"Sioux-City, IA"
7470,"Like all the vehicles on the road, the Army Na...",Petroleum Supply Specialist Job in Madison bo...,,Madison 53704,,http://jobview.monster.com/Petroleum-Supply-Sp...,,Oil Rig & Pipeline Install/Maintain/Repair,,,Petroleum Supply Specialist,,
