This repository has been archived by the owner on Feb 22, 2021. It is now read-only.
/
careerbuilder_example.py
342 lines (303 loc) · 14.3 KB
/
careerbuilder_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
########################################################
#################### IMPORT LIBRARY ####################
########################################################
import bs4
import numpy
import pandas
import re
import requests
import datetime
import stop_words
###################################################
#################### ARGUMENTS ####################
###################################################
# --- What to search for ---------------------------------------------------
# Job title / keywords typed into the search box.
input_job = "Data Scientist"
# When True, quotation marks ("") are added around input_job.
input_quote = False

# --- Where to search ------------------------------------------------------
# City name; leave as an empty string when no city is specified.
input_city = "Durham"
input_state = "NC"

# --- How the search URL is assembled --------------------------------------
# Root of every Careerbuilder URL built by this script.
BASE_URL_careerbuilder = "http://www.careerbuilder.com"
# Character substituted for spaces in the search terms.
sign = "-"
#####################################################
##### Function to transform the search keywords #####
#####################################################
# The default "quote = False"
def transform(input, sign, quote=False):
    """Turn a free-text search phrase into a fragment usable in a URL path.

    Every space in the phrase is replaced by ``sign``. When ``quote`` is
    truthy the result is additionally wrapped in ``%2522`` on both sides,
    which is a double-percent-encoded double quote (``%25`` encodes ``%``
    and ``%22`` encodes ``"``).

    Args:
        input: The phrase to convert, e.g. ``"Data Scientist"``. The
            parameter name shadows the builtin but is kept so keyword
            callers keep working.
        sign: Replacement for each space character, e.g. ``"-"``.
        quote: Wrap the converted phrase in encoded quotation marks.

    Returns:
        The converted phrase as a string.
    """
    syntax = input.replace(" ", sign)
    if quote:
        syntax = '%2522' + syntax + '%2522'
    return syntax
######################################
########## Generate the URL ##########
######################################
# Pieces shared by both URL shapes: <base>/jobs-<job>-in-
url_careerbuilder_list = [
    BASE_URL_careerbuilder,
    '/jobs-',
    transform(input_job, sign, input_quote),
    '-in-',
]
# With a city the location part is "<city>,<state>"; without one it is just "<state>".
if input_city:
    url_careerbuilder_list += [transform(input_city, sign), ',']
url_careerbuilder_list.append(input_state)

url_careerbuilder = ''.join(url_careerbuilder_list)
print(url_careerbuilder)
# Download the first results page. The timeout stops a stalled connection from
# hanging the script, and raise_for_status() turns an HTTP error into an
# immediate, explicit failure instead of parsing an error page.
rawcode_careerbuilder = requests.get(url_careerbuilder, timeout=30)
rawcode_careerbuilder.raise_for_status()
# Choose "lxml" as parser
soup_careerbuilder = bs4.BeautifulSoup(rawcode_careerbuilder.text, "lxml")

# The total number of results is read from the first child of <div class="count">.
count_div = soup_careerbuilder.find('div', {'class': 'count'})
if count_div is None or not count_div.contents:
    # find() returns None when the element is absent; fail with a message that
    # names the real problem rather than an AttributeError on None.
    raise RuntimeError('No <div class="count"> with content found at ' + url_careerbuilder)
num_total_careerbuilder = count_div.contents[0]
print(num_total_careerbuilder)
# Strip brackets (and thousands separators, so "1,234" parses) and keep the
# first whitespace-separated token as the count.
num_total_careerbuilder = int(
    re.sub(r'[(){}<>,]', '', str(num_total_careerbuilder)).split()[0])
print(num_total_careerbuilder)

# total number of pages, at 25 results per page (rounded up)
num_pages_careerbuilder = int(numpy.ceil(num_total_careerbuilder / 25.0))
print(num_pages_careerbuilder)
# Take the timestamp once so both renderings below describe the same moment.
now = datetime.datetime.now()
# Compact MMDDYYYY form (used in the output file name).
now_str_name = now.strftime('%m%d%Y')
# MM/DD/YYYY form (stored in the 'date' column of every row).
now_str = now.strftime("%m/%d/%Y")

# Empty frame that the scraped job postings are collected into.
job_df_careerbuilder = pandas.DataFrame()
########################################
##### Loop for all the total pages #####
########################################
# Walk every results page (pages are numbered from 1) and collect one plain
# dict per posting. The DataFrame is built once after the loop:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.0.
job_rows = []
for i in range(1, num_pages_careerbuilder + 1):
    # generate the URL
    url = ''.join([url_careerbuilder, '?page_number=', str(i)])
    print(url)
    # get the HTML code from the URL; the timeout keeps a stalled connection
    # from hanging the whole run
    rawcode = requests.get(url, timeout=30)
    soup = bs4.BeautifulSoup(rawcode.text, "lxml")
    # pick out every <div> whose class list contains "job-row"
    job_divs = soup.find_all('div', class_='job-row')
    # loop for each div chunk
    for job in job_divs:
        try:
            title_tag = job.find('h2', {'class': 'job-title'})
            # job id (named job_id so the builtin id() is not shadowed)
            job_id = title_tag.find('a').attrs['data-job-did']
            # job link related to job id
            link = BASE_URL_careerbuilder + '/job/' + job_id
            # job title
            title = title_tag.text.strip()
            # job company
            company = job.find(
                'div', {'class': 'columns large-2 medium-3 small-12'}).find(
                'h4', {'class': 'job-text'}).text.strip()
            # job location
            location = job.find(
                'div', {'class': 'columns end large-2 medium-3 small-12'}).find(
                'h4', {'class': 'job-text'}).text.strip()
        except (AttributeError, KeyError):
            # find() returns None for a missing tag (AttributeError on the next
            # call) and a missing attribute raises KeyError: skip rows that do
            # not have the expected layout, but let any other error surface.
            continue
        job_rows.append({'job_title': title,
                         'job_id': job_id,
                         'job_company': company,
                         'date': now_str,
                         'from': 'Careerbuilder',
                         'job_location': location,
                         'job_link': link})

# Naming the columns explicitly keeps them present even when nothing was
# scraped, so later column selection does not fail on an empty frame.
job_df_careerbuilder = pandas.DataFrame(
    job_rows,
    columns=['from', 'date', 'job_id', 'job_title', 'job_company',
             'job_location', 'job_link'])
# Fixed column order for the listing table.
cols = [
    'from',
    'date',
    'job_id',
    'job_title',
    'job_company',
    'job_location',
    'job_link',
]
# Reorder the columns, then drop rows that share a job link, keeping the
# first occurrence of each.
job_df_careerbuilder = job_df_careerbuilder[cols].drop_duplicates(
    subset=['job_link'], keep='first')
# print the dimensions of the dataframe
print(job_df_careerbuilder.shape)

# Output file: <output folder>/job_careerbuilder_<MMDDYYYY>.csv
path = ''.join([
    '/Users/chou/Google Drive/websites/github/webscraping_example/output/',
    'job_careerbuilder_',
    now_str_name,
    '.csv',
])
# save the dataframe as a csv file
job_df_careerbuilder.to_csv(path)
########################################################
#################### IMPORT LIBRARY ####################
########################################################
# import bs4
# import numpy
# import pandas
# import re
# import requests
# import datetime
# import stop_words
# define the stop_words for future use
# List of English stop words, used later to filter the words of each posting.
# NOTE: this rebinds the name `stop_words` from the imported module to the list
# it returns, so the module itself is unreachable after this line. The name is
# kept because the code further down reads `stop_words` as the word list.
stop_words = stop_words.get_stop_words('english')
# print(stop_words)

# Read the listing CSV written earlier in this script.
# To analyse a different file, point `path` at it first, e.g.:
# path = '/path/output/' + 'job_careerbuilder_' + now_str_name + '.csv'
# pandas.DataFrame.from_csv was removed in pandas 1.0; read_csv with
# index_col=0 restores the row index that to_csv wrote as the first column.
job_df_careerbuilder = pandas.read_csv(path, index_col=0)
####################################################
##### DEFINE THE TERMS THAT I AM INTERESTED IN #####
####################################################
def _term_table(terms, aliases=None):
    """Build the lookup structures for one category of search terms.

    Args:
        terms: Spellings to look for, in their display capitalisation.
        aliases: Optional ``{spelling: canonical label}`` for spellings that
            should be reported under another label. Spellings that are not
            listed are reported as themselves.

    Returns:
        A tuple ``(lowered, table, lookup)``: the lowercase spellings in the
        order given, a DataFrame with the canonical label in column ``raw``
        and the lowercase spelling in column ``lower``, and a dict mapping
        each lowercase spelling to its canonical label.
    """
    aliases = aliases or {}
    lowered = [term.lower() for term in terms]
    canonical = [aliases.get(term, term) for term in terms]
    table = pandas.DataFrame({'raw': canonical, 'lower': lowered})
    return lowered, table, dict(zip(lowered, canonical))


##### Job types #####
# Named job_types rather than `type` so the builtin type() is not shadowed.
job_types = ['Full-Time', 'Full Time', 'Part-Time', 'Part Time', 'Contract', 'Contractor']
type_lower, type_map, type_dic = _term_table(job_types, {
    'Full Time': 'Full-Time',
    'Part Time': 'Part-Time',
    'Contractor': 'Contract',
})
# print(type_dic)

##### Skills #####
# Terms must not carry leading/trailing whitespace: they are matched as whole
# whitespace-delimited tokens, so a padded spelling could never be found.
skills = ['R', 'Shiny', 'RStudio', 'Markdown', 'Latex', 'SparkR', 'D3', 'D3.js',
          'Unix', 'Linux', 'MySQL', 'Microsoft SQL server', 'SQL',
          'Python', 'SPSS', 'SAS', 'C++', 'C', 'C#', 'Matlab', 'Java',
          'JavaScript', 'HTML', 'HTML5', 'CSS', 'CSS3', 'PHP', 'Excel', 'Tableau',
          'AWS', 'Amazon Web Services', 'Google Cloud Platform', 'GCP',
          'Microsoft Azure', 'Azure', 'Hadoop', 'Pig', 'Spark', 'ZooKeeper',
          'MapReduce', 'Map Reduce', 'Shark', 'Hive', 'Oozie', 'Flume', 'HBase', 'Cassandra',
          'NoSQL', 'MongoDB', 'GIS', 'Haskell', 'Scala', 'Ruby', 'Perl',
          'Mahout', 'Stata']
skills_lower, skills_map, skills_dic = _term_table(skills, {
    'D3.js': 'D3',
    'HTML5': 'HTML',
    'CSS3': 'CSS',
    'Amazon Web Services': 'AWS',
    'Google Cloud Platform': 'GCP',
    'Microsoft Azure': 'Azure',
    'Map Reduce': 'MapReduce',
})
# print(skills_dic)

##### Education #####
edu = ['Bachelor', "Bachelor's", 'BS', 'B.S', 'B.S.', 'Master', "Master's", 'Masters', 'M.S.', 'M.S', 'MS',
       'PhD', 'Ph.D.', "PhD's", 'MBA']
edu_lower, edu_map, edu_dic = _term_table(edu, {
    'Bachelor': 'BS', "Bachelor's": 'BS', 'B.S': 'BS', 'B.S.': 'BS',
    'Master': 'MS', "Master's": 'MS', 'Masters': 'MS', 'M.S.': 'MS', 'M.S': 'MS',
    'Ph.D.': 'PhD', "PhD's": 'PhD',
})
# print(edu_dic)

##### Major #####
major = ['Computer Science', 'Statistics', 'Mathematics', 'Math', 'Physics',
         'Machine Learning', 'Economics', 'Software Engineering', 'Engineering',
         'Information System', 'Quantitative Finance', 'Artificial Intelligence',
         'Biostatistics', 'Bioinformatics', 'Quantitative']
major_lower, major_map, major_dic = _term_table(major, {'Mathematics': 'Math'})
# print(major_dic)

##### Key Words ######
keywords = ['Web Analytics', 'Regression', 'Classification', 'User Experience', 'Big Data',
            'Streaming Data', 'Real-Time', 'Real Time', 'Time Series']
keywords_lower, keywords_map, keywords_dic = _term_table(keywords, {'Real-Time': 'Real Time'})
# print(keywords_dic)
##############################################
##### FOR LOOP FOR SCRAPING EACH JOB URL #####
##############################################
# Value stored in every detail column when a posting cannot be downloaded.
_UNAVAILABLE = 'Forbidden'


def _compile_terms(terms_lower, lookup):
    """Pair a compiled whole-token pattern for each term with its canonical label.

    A term only matches when it is delimited by whitespace or by the start/end
    of the text, so 'r' is not found inside 'for'.
    """
    compiled = []
    for term in terms_lower:
        # re.escape keeps terms such as 'c++', 'c#' and 'd3.js' literal. A term
        # with stray surrounding whitespace could never satisfy the boundary
        # lookarounds, so it is stripped before the pattern is built.
        pattern = re.compile(
            r'(?:^|(?<=\s))' + re.escape(term.strip()) + r'(?=\s|$)')
        compiled.append((pattern, lookup[term]))
    return compiled


def _find_terms(compiled_terms, text):
    """Return the canonical label of every term whose pattern occurs in text.

    Labels come back in term-list order; two spellings that share a label
    each contribute one entry.
    """
    return [label for pattern, label in compiled_terms if pattern.search(text)]


def _clean_page_text(page_text):
    """Lowercase the page text and strip punctuation that hides term boundaries."""
    text = page_text.lower()
    text = re.sub(r'[\n\r\t]', ' ', text)   # remove "\n", "\r", "\t"
    text = re.sub(r'[,/()]', ' ', text)     # remove ",", "/", "(" and ")"
    text = re.sub(' +', ' ', text)          # collapse runs of spaces
    text = re.sub(r'r\s&\sd', ' ', text)    # avoid picking up the 'r' of 'r & d'
    text = re.sub(r'r&d', ' ', text)        # avoid picking up the 'r' of 'r&d'
    text = re.sub(r'\.\s+', ' ', text)      # remove "." at the end of sentences
    return text


def _scrape_job_details(job_link):
    """Download one posting and extract its details.

    Returns:
        A tuple ``(types, skills, education, majors, keywords, words)`` of
        lists, in the same order as the result lists filled by the loop below.

    Raises:
        requests.RequestException: The page could not be fetched, or the
            server answered with an HTTP error status (e.g. 403 Forbidden).
    """
    job_page = requests.get(job_link, timeout=30)
    # requests does not raise on 4xx/5xx by itself; without this check an error
    # page would be analysed as if it were the posting.
    job_page.raise_for_status()
    # Choose "lxml" as parser
    soup = bs4.BeautifulSoup(job_page.text, "lxml")
    # drop the chunks of 'script','style','head','title','[document]'
    for elem in soup.find_all(['script', 'style', 'head', 'title', '[document]']):
        elem.extract()
    text = _clean_page_text(soup.get_text(separator=' '))
    # All words of the posting, minus the stop words.
    words = set(text.split(' ')) - _stop_word_set
    return (_find_terms(_type_terms, text),
            _find_terms(_skill_terms, text),
            _find_terms(_edu_terms, text),
            _find_terms(_major_terms, text),
            _find_terms(_keyword_terms, text),
            list(words))


# Patterns and the stop-word set do not depend on the posting: build them once.
_type_terms = _compile_terms(type_lower, type_dic)
_skill_terms = _compile_terms(skills_lower, skills_dic)
_edu_terms = _compile_terms(edu_lower, edu_dic)
_major_terms = _compile_terms(major_lower, major_dic)
_keyword_terms = _compile_terms(keywords_lower, keywords_dic)
_stop_word_set = set(stop_words)

# One entry per job in each list, in the row order of job_df_careerbuilder.
list_type = []
list_skill = []
list_text = []
list_edu = []
list_major = []
list_keywords = []
_result_lists = (list_type, list_skill, list_edu, list_major, list_keywords, list_text)

for i, job_link in enumerate(job_df_careerbuilder['job_link']):
    try:
        details = _scrape_job_details(job_link)
    except requests.RequestException as err:
        # Unreachable or forbidden page: record the placeholder in every column.
        print('Could not fetch %s: %s' % (job_link, err))
        details = (_UNAVAILABLE,) * len(_result_lists)
    # Append only after the whole posting has been processed, so a failure can
    # never leave the lists with different lengths.
    for results, value in zip(_result_lists, details):
        results.append(value)
    print(i)
# Attach the scraped details, one new column per result list.
detail_columns = [
    ('job_type', list_type),
    ('job_skills', list_skill),
    ('job_edu', list_edu),
    ('job_major', list_major),
    ('job_keywords', list_keywords),
    ('job_text', list_text),
]
for column_name, column_values in detail_columns:
    job_df_careerbuilder[column_name] = column_values

# Listing columns first, detail columns after them.
cols = [
    'from', 'date', 'job_id', 'job_title', 'job_company', 'job_location', 'job_link',
    'job_type', 'job_skills', 'job_edu', 'job_major', 'job_keywords', 'job_text',
]
job_df_careerbuilder = job_df_careerbuilder[cols]
# print the dimensions of the dataframe
print(job_df_careerbuilder.shape)
# Write the enriched table back to the same csv file.
job_df_careerbuilder.to_csv(path)