In [1]:
%reload_ext autoreload
%autoreload 2
import os
import sys
sys.path.append('../')
from skills_ml.job_postings.common_schema import JobPostingCollectionSample
from skills_ml.job_postings.filtering import JobPostingFilterer, soc_major_group_filter
import random
import json
from skills_ml.job_postings.corpora import Doc2VecGensimCorpusCreator, CorpusCreator
from collections import Counter
import numpy as np
from skills_ml.job_postings.sample import JobSampler
import logging

# Streaming Data to Create Corpus

## Simple Corpus

In [10]:
full_filename = os.path.realpath('./50_sample.json.gz')
job_postings_generator = JobPostingCollectionSample(full_filename=full_filename)
corpus = CorpusCreator(job_postings_generator)

TypeError: super(type, obj): obj must be an instance or subtype of type

In [4]:
corpus = list(corpus)

In [5]:
len(corpus)

50

In [6]:
corpus[0]

{'@context': 'http://schema.org',
 '@type': 'JobPosting',
 'title': 'Media Consultant',
 'description': "Media consultant. Email. tweet. Location:Vienna, Va.. Date posted: 03-11-2016. Apply now. Job Summary: the Public Sector Group, a division of 1105 media, is seeking an experienced media consultant sales professional who enjoys a fast-paced leading edge environment and a variety of assignments with an innovative media company. The Media Consultant & Sales Representative will support a myriad of online and print products providing a genuine relational and consultative approach with customers to foster repeat business. The portfolio includes print publications, websites, e-newsletters, web seminars, custom media, research and other Online & print products. This position involves business development and account prospecting, face-to-face and phone selling, networking at industry events, putting together and making presentations, preparing proposals, closing sales, managing print and onl

## Creating Corpus with Criteria

#### One can define their own funciton and logic of filtering based on the common schema.

In [31]:
def major_group_filter_func(document):
    if document['onet_soc_code']:
        if document['onet_soc_code'][:2] in ['11', '13']:
            return True

def full_soc_code_filter_func(document):
    if document['onet_soc_code']:
        if document['onet_soc_code'] in ['13-1041.01']:
            return True

def wage_filter_func(document):
    if document['baseSalary']['medianValue']:
        if float(document['baseSalary']['medianValue']) >= 60000.0:
            return True

### Filtered by Major Groups

In [8]:
job_postings_generator = JobPostingCollectionSample()
filtered_job_postings = JobPostingFilterer(job_postings_generator, filter_funcs=[major_group_filter_func])
corpus = CorpusCreator(filtered_job_postings)

In [9]:
corpus = list(corpus)

In [10]:
corpus[0]

{'@context': 'http://schema.org',
 '@type': 'JobPosting',
 'title': 'State Inspector- Chesapeake',
 'description': "Primary Job Functions: Perform va state inspections. Perform oil changes and lubrication services to automobiles. Perform tire repairs and installation services to automobiles. Perform any other basic repairs to automobiles as assigned by a lead tech or manager/service manager/service writer according to his/her abilities. This could include but not be limited to brakes, suspension, alignments, parts changing, exhaust, ac/heating, maintenance inspections, and tune-ups. Gain on the job experience in all areas of automotive repair and strive to obtain ASE certifications. Clean shop and grounds during down time. Assist with transporting customers/parts and running errands. Keep his/her work area clean and safe. Various miscellaneous duties as assigned by lead tech or management. Assists other level technicians as requested. Performs other duties as assigned. Automotive repai

In [11]:
major_group = list(map(lambda c: c['onet_soc_code'][:2], corpus))

In [12]:
Counter(major_group)

Counter({'13': 3, '11': 8})

### Filtered by Full O*NET SOC Code

In [18]:
job_postings_generator = JobPostingCollectionSample()
filtered_job_postings = JobPostingFilterer(job_postings_generator, filter_funcs=[full_soc_code_filter_func])
corpus = CorpusCreator(filtered_job_postings)

In [19]:
corpus = list(corpus)

In [20]:
soc = list(map(lambda c: c['onet_soc_code'], corpus))

In [21]:
Counter(soc)

Counter({'13-1041.01': 3})

Filtered by Median Wage >= 60000

In [32]:
job_postings_generator = JobPostingCollectionSample()
filtered_job_postings = JobPostingFilterer(job_postings_generator, filter_funcs=[wage_filter_func])
corpus = CorpusCreator(filtered_job_postings)

In [53]:
corpus = list(corpus)
corpus[0]

((({'@context': 'http://schema.org',
    '@type': 'JobPosting',
    'title': 'Automotive Service Manager-Chesapeake Car Care Center',
    'description': "Job Responsibilities. Primary Job Functions: manages all customer relations and ensures that employees are providing above average customer service. Investigates and resolves all customer disputes, including warranty issues. Manages the profit and loss of the center, including expense control, pricing structure, and mark up of parts. Oversees inventory control, including managing vendor relations and approving all purchases and stocking activities. Oversees facility maintenance. Assist with oversight of the fleet drivers based out of the center. Supervises technicians and other employees at the center. This includes employee relations, hiring, firing, disciplining, and scheduling. Tracks employee productivity using management software. Maximizes sales and profitability by maintaining and scheduling appropriate work loads according to 

In [36]:
median_wage = list(map(lambda c: c['baseSalary']['medianValue'], corpus))

In [38]:
Counter(median_wage)

Counter({90000.0: 1, 68000.0: 1, 84000.0: 1, 94000.0: 3, 102000.0: 1})

# Sampling from Corpus

## Sampling from simple corpus

In [57]:
job_postings_generator = JobPostingCollectionSample()

In [58]:
corpus = CorpusCreator(job_postings_generator)

In [59]:
from skills_ml.job_postings.sample import JobSampler
job_sampler = JobSampler(corpus, random_state=42)
corpus = job_sampler.sample(10)

In [60]:
corpus[0]

({'@context': 'http://schema.org',
  '@type': 'JobPosting',
  'title': 'Sr. Network Engineer',
  'description': "Sr.. Network engineer. Job Title: Sr.. Network engineer. Job Type: Full-Time. Location: Herndon, Va.. Post date: 06/11/2012. Job Description: Position Summary: This is an exempt position that manages large complex function-LRB-s-RRB- and provides direct supervision to groups involved in network integration and implementation. Responsible for the planning and implementation of programs and projects that adhere to approved plans, budgets and schedules. Provides specialized technical and administrative and business development expertise. Coordinates and facilitates with internal and external resources to effect the timely completion of projects. Coordinates closely with customer representatives to ensure all customer needs are met. Will work at customer sites most of the time but will support all of ESP needs including business development. The primary purpose, product and/or s

In [62]:
industry = list(map(lambda c: c[0]['industry'], corpus))

In [63]:
Counter(industry)

Counter({'51': 1, '': 6, '33': 1, '54': 2})

## Sampling from filtered corpus

### Reservoir Sampling

In [64]:
job_postings_generator = JobPostingCollectionSample()
corpus = CorpusCreator(job_postings_generator, filter_func=major_group_filter_func)

In [65]:
job_sampler = JobSampler(corpus, random_state=42)
corpus = job_sampler.sample(10)

In [66]:
corpus

[({'@context': 'http://schema.org',
   '@type': 'JobPosting',
   'title': 'Sr. Network Engineer',
   'description': "Sr.. Network engineer. Job Title: Sr.. Network engineer. Job Type: Full-Time. Location: Herndon, Va.. Post date: 06/11/2012. Job Description: Position Summary: This is an exempt position that manages large complex function-LRB-s-RRB- and provides direct supervision to groups involved in network integration and implementation. Responsible for the planning and implementation of programs and projects that adhere to approved plans, budgets and schedules. Provides specialized technical and administrative and business development expertise. Coordinates and facilitates with internal and external resources to effect the timely completion of projects. Coordinates closely with customer representatives to ensure all customer needs are met. Will work at customer sites most of the time but will support all of ESP needs including business development. The primary purpose, product and/

In [67]:
onet_soc_code = list(map(lambda c: c[0]['onet_soc_code'][:2], corpus))

In [68]:
Counter(onet_soc_code)

Counter({'15': 2, '41': 1, '53': 3, '': 1, '17': 2, '11': 1})

### Weighted Reservoir Sampling

In [69]:
job_postings_generator = JobPostingCollectionSample()
filtered_job_postings = JobPostingFilterer(job_postings_generator, filter_funcs=[major_group_filter_func])
corpus = CorpusCreator(filtered_job_postings)

In [71]:
job_sampler = JobSampler(corpus, major_group=True,weights={'11': 1, '13': 3.5})
sampled_corpus = job_sampler.sample(10)

In [72]:
major_group = list(map(lambda c: c[1][:2], sampled_corpus))

In [73]:
Counter(major_group)

Counter({'11': 7, '13': 3})

In [74]:
sampled_corpus[0]

({'@context': 'http://schema.org',
  '@type': 'JobPosting',
  'title': 'Site Support Lead',
  'description': "Summary: Oversee site and act as the primary interface with customer on issues pertaining to NAS Oceana. Directs and coordinates site activities designed to ensure effective and economical support. Essential duties and responsibilities: Reports in writing and orally to AAR DSL management. Responding to on-site supply requirements generated by the squadron in support of C-40A organizational maintenance. Provides technical direction for specific projects assigned. Serves as a technical authority for various functional areas. Ensures tasks are completed within estimated time frames and budget constraints. Site responsibility for Fod program in totality. Inspects government provided on-site support under AAR custodial responsibility. Manager overall operations of the on-site storeroom. Selects most efficient means of transportation and shipping. Coordinate CFT -LRB-contract field t