In [1]:
from collections import Counter
import re
import requests
import pickle
import config

USAjobs <br/>
https://developer.usajobs.gov/Tutorials/Search-Jobs

In [2]:
#Parameters:

USA_API_Key = config.api_key

wa_pattern = re.compile(', Washington')

multiple_locs = ['Multiple Locations',\
                 'Location Negotiable After Selection',\
                'IRS Nationwide Locations',\
                'May be filled in various FAA duty locations']

keywords = ['investigator','investigation','information extraction',\
           'entity linking','record linkage','mandarin','french','thai',\
           'data analyst','program analyst','data scientist','data visualization',\
           'corruption','legal data analyst','natural language processing','NLP']

host = 'data.usajobs.gov'
user_agent = 'elyebliss@gmail.com'
auth_key = USA_API_Key

headers = {
    'Host': host,
    'User-Agent': user_agent,
    'Authorization-Key': auth_key
}

In [3]:
def define_USAjobs_url(keyword):
    
    url = 'https://data.usajobs.gov/api/search?' #JobCategoryCode=2210

    url+= '&Keyword='+keyword
    #url+= '&LocationName='+location
        
    return url

In [4]:
url = 'https://data.usajobs.gov/api/search?Keyword=Data Analyst:Data Scientist:Natural Language Processing:NLP:Chinese:Analytics:Analyst' #JobCategoryCode=2210&


response = requests.get(url, headers=headers)
data = response.json()

**Questions** <br/>
- How to map the json response to its main keys
- Which parameters do I set to customize my response?
- How best should I search through results?
 
**Overall plan** <br/>
- pre-filter request with as many parameters as possible
- filter results to remove:
    - not-qualified
    - not-interested
    - already-saved
    - already-seen
- make as many requests as needed if you can't use multiple keywords, locations
- combine and sort results using descriptions and other fields

**Notes**
- "All parameters support multiple search values which must be separated by a colon."
    - It looks like this isnt' actually true. I probably need to create distinct calls per keyword, and then just combine results
- It seems like you can't really make the API request using "Remote" as a location. However, you can cast a wider net and then filter results to 'PositionLocationDisplay': 'Anywhere in the U.S. (remote job)'

**Pre-step:** import list of jobs already screened

In [5]:
with open('USA_screened_jobs', 'rb') as infile:
    prescreened = pickle.load(infile)

**Step 1:** Create giant list from all keywords

In [7]:
wide_net = []
for word in keywords:
    
    response = requests.get(define_USAjobs_url(word.capitalize()), headers=headers)
    data = response.json()
    if 'SearchResultItems' in data['SearchResult']:
        wide_net.extend(data['SearchResult']['SearchResultItems'])
len(wide_net)

322

In [8]:
#remove pre-screened:
wide_net = [item for item in wide_net if (item['MatchedObjectDescriptor']['PositionID'] not in prescreened)]
len(wide_net)

297

**Step 2:** Apply deal-breaker filters <br/>
- not in WA or remote-available
    - 'Anywhere in the U.S. (remote job)'
    - ', Washington'
- not open to public or career federal employees
- manually reviewed already
- GS >13

In [9]:
wide_net1 = [item for item in wide_net if (('fed-transition'in item['MatchedObjectDescriptor']['UserArea']['Details']['HiringPath']) or ('public' in item['MatchedObjectDescriptor']['UserArea']['Details']['HiringPath']))]
len(wide_net1) #remaining jobs

240

Investigate further: <br/>
- 'Multiple Locations'
- 'Location Negotiable After Selection'
- 'IRS Nationwide Locations'
- 'May be filled in various FAA duty locations'

In [10]:
wide_net2 = []

for job in wide_net1:
    
    if job['MatchedObjectDescriptor']['PositionLocationDisplay']=='Anywhere in the U.S. (remote job)':
        wide_net2.append(job)
        
    elif wa_pattern.findall(job['MatchedObjectDescriptor']['PositionLocationDisplay']):
        wide_net2.append(job)
        
    elif job['MatchedObjectDescriptor']['PositionLocationDisplay'] in multiple_locs:
        
        keep = False
        for location in job['MatchedObjectDescriptor']['PositionLocation']:
            if wa_pattern.findall(location['LocationName']):
                keep=True
        if keep:
            wide_net2.append(job)
len(wide_net2)  

56

In [11]:
wide_net3 = []
grades = []
for job in wide_net2:
    
    try:
    
        if int(job['MatchedObjectDescriptor']['UserArea']['Details']['HighGrade']) <14:
            wide_net3.append(job) 
    
    except:
       wide_net3.append(job) 
len(wide_net3)

20

**Step 3**: prioritize results <br/>
- first batch should be anything that contains extra cool keywords in title or job description:
    - Data Scientist
    - NLP
    - Data Analyst
- next batch should be fully-remote
- then all remainders

In [12]:
wide_net4 = []
batch2 = []
batch3 = []

extra_keywords = ['data scientist','nlp','data analyst','information extraction']

for job in wide_net3:
    key_text = job['MatchedObjectDescriptor']['PositionTitle'].lower()+\
    job['MatchedObjectDescriptor']['UserArea']['Details']['MajorDuties'][0].lower()+\
    job['MatchedObjectDescriptor']['UserArea']['Details']['JobSummary'].lower()
    
    keep = False
    for keyword in extra_keywords:
        if re.findall(keyword,key_text):
            keep=True
    if keep:
        wide_net4.append(job)
    
    elif job['MatchedObjectDescriptor']['PositionLocationDisplay']=='Anywhere in the U.S. (remote job)':
        batch2.append(job)
    else:
        batch3.append(job)

**Step 4**: display and deal with results manually-reviewed already

In [13]:
wide_net4.extend(batch2)
wide_net4.extend(batch3)
len(wide_net4)

20

**Display**: <br/>
- ['MatchedObjectDescriptor']['PositionID']
- ['MatchedObjectDescriptor']['PositionTitle']
- ['MatchedObjectDescriptor']['PositionLocationDisplay']
- ['MatchedObjectDescriptor']['OrganizationName']
- ['MatchedObjectDescriptor']['DepartmentName']
- ['MatchedObjectDescriptor']['UserArea']['Details']
- ['MatchedObjectDescriptor']['UserArea']['MajorDuties']
- ['MatchedObjectDescriptor']['PositionURI']

In [14]:
add_to_seen = []

for job in wide_net4:
    #print('PositionID: '+job['MatchedObjectDescriptor']['PositionID'])
    if job['MatchedObjectDescriptor']['PositionID'] not in add_to_seen:
        print('PositionTitle: '+job['MatchedObjectDescriptor']['PositionTitle'])
        print('OrganizationName: '+job['MatchedObjectDescriptor']['OrganizationName'])
        print('DepartmentName: '+job['MatchedObjectDescriptor']['DepartmentName'])
        print('location: '+job['MatchedObjectDescriptor']['PositionLocationDisplay'])
        print()
        print('JobSummary: '+job['MatchedObjectDescriptor']['UserArea']['Details']['JobSummary'])
        print()
        #print('MajorDuties: '+job['MatchedObjectDescriptor']['UserArea']['Details']['MajorDuties'][0])
        #print()
        print('PositionURI: '+job['MatchedObjectDescriptor']['PositionURI'])

        decision = str(input("\nMark as seen =s"))
        if decision =='s':
            add_to_seen.append(job['MatchedObjectDescriptor']['PositionID'])


PositionTitle: Health System Specialist (Data Analyst)
OrganizationName: Veterans Health Administration
DepartmentName: Department of Veterans Affairs
location: Anywhere in the U.S. (remote job)

JobSummary: This position is in the National Mailed Fecal Immunochemical Testing Implementation Team (MFIT), within the National Colorectal Cancer Screening Program (NCSP), within the National Gastroenterology and Hepatology Program (NGHP). The NGHP is within the Specialty Care Program Office (SCPO). The position serves as a Data Analyst, reporting to the Supervisory Program Manager of the MFIT, supporting the SCPO.

PositionURI: https://www.usajobs.gov:443/GetJob/ViewDetails/728045600

Mark as seen =ss
PositionTitle: Data Analyst - Customer Insights (All Sources)
OrganizationName: Federal Student Aid
DepartmentName: Department of Education
location: Location Negotiable After Selection

JobSummary: Federal Student Aid (FSA) continually seeks data-driven ways to improve the customer experience.


Mark as seen =ss
PositionTitle: Criminal Investigator
OrganizationName: Office of Inspector General
DepartmentName: General Services Administration
location: Tacoma, Washington

JobSummary: As a Special Agent in the Office of Investigations, you will be responsible for conducting and reporting upon investigations of suspected criminal violations affecting the programs, operations, and employees of the GSA consistent with the Inspector General Act of 1978. Location of Position: Office of Investigations, Western Division Northwest/Arctic Region Investigations Office Tacoma, WA (JIF-10) We are currently filling one vacancy, but additional vacancies may be filled as needed.

PositionURI: https://www.usajobs.gov:443/GetJob/ViewDetails/727905500

Mark as seen =ss
PositionTitle: Supervisory Emergency Management Specialist
OrganizationName: Federal Emergency Management Agency
DepartmentName: Department of Homeland Security
location: Location Negotiable After Selection

JobSummary: This positi

In [15]:
len(add_to_seen)

16

In [16]:
prescreened = list(set(prescreened))
len(prescreened)

53

In [17]:
prescreened.extend(add_to_seen)
len(prescreened)

69

In [18]:
%cd ~/Desktop/WorkApps

with open('USA_screened_jobs', 'wb') as handle:
    pickle.dump(prescreened, handle, protocol=pickle.HIGHEST_PROTOCOL)

/Users/elyebliss/Desktop/WorkApps


**DONE**