In [1]:
import logging

# Emit INFO-level records with a timestamp and level name so the progress
# messages from the libraries used below appear under each cell.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
from gensim import corpora, models, similarities

# Load the previously saved dictionary and corpus from disk.
# Both files must already exist in the working directory; this notebook does not create them.
dictionary = corpora.Dictionary.load('onet.dict')
corpus = corpora.MmCorpus('onet_corpus.mm') # corpus file read from disk; its summary is printed below
print(corpus)

2017-10-15 21:52:13,416 : INFO : 'pattern' package not found; tag filters are not available for English
2017-10-15 21:52:13,422 : INFO : loading Dictionary object from onet.dict
2017-10-15 21:52:13,435 : INFO : loaded onet.dict
2017-10-15 21:52:13,439 : INFO : loaded corpus index from onet_corpus.mm.index
2017-10-15 21:52:13,440 : INFO : initializing corpus reader from onet_corpus.mm
2017-10-15 21:52:13,442 : INFO : accepted corpus with 974 documents, 25184 features, 451904 non-zero entries


MmCorpus(974 documents, 25184 features, 451904 non-zero entries)


In [35]:
# Create an LSI model with 30 topics (set by num_topics below)
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=30)

2017-10-15 21:55:55,203 : INFO : using serial LSI version on this node
2017-10-15 21:55:55,211 : INFO : updating model with new documents
2017-10-15 21:55:56,466 : INFO : preparing a new chunk of documents
2017-10-15 21:55:56,555 : INFO : using 100 extra samples and 2 power iterations
2017-10-15 21:55:56,558 : INFO : 1st phase: constructing (25184, 130) action matrix
2017-10-15 21:55:56,624 : INFO : orthonormalizing (25184, 130) action matrix
2017-10-15 21:55:57,400 : INFO : 2nd phase: running dense svd on (130, 974) matrix
2017-10-15 21:55:57,518 : INFO : computing the final decomposition
2017-10-15 21:55:57,519 : INFO : keeping 30 factors (discarding 12.946% of energy spectrum)
2017-10-15 21:55:57,547 : INFO : processed documents up to #974
2017-10-15 21:55:57,552 : INFO : topic #0(1011.372): 0.344*"time" + 0.311*"equipment" + 0.274*"exposed" + 0.274*"spend" + 0.225*"software" + 0.186*"safety" + 0.156*"microsoft" + 0.154*"work" + 0.146*"systems" + 0.123*"others"
2017-10-15 21:55:57,5

In [36]:
# Create an index
# Every document of `corpus` is passed through the LSI model and the result is
# indexed for similarity queries; the index is then saved to 'onet.index'.
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it
index.save('onet.index')

2017-10-15 21:56:09,642 : INFO : creating matrix with 974 documents and 30 features
2017-10-15 21:56:10,996 : INFO : saving MatrixSimilarity object under onet.index, separately None
2017-10-15 21:56:10,998 : INFO : saved onet.index


In [37]:
import pickle

# Load the job lookup table. Later cells index it by document position and
# read each entry's 'title' (see lookupJob).
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load a 'lookuptable' file that you produced yourself.
# The `with` block closes the file handle as soon as loading finishes.
with open('lookuptable', 'rb') as lookup_file:
    jobs = pickle.load(lookup_file)

In [49]:
def lookupJob(doc, topn=10):
    """Print the jobs most similar to the free-text query ``doc``.

    The query is lower-cased, stripped of commas, periods, semicolons and
    backslashes, and split on whitespace. The tokens are converted to a
    bag-of-words with the module-level ``dictionary``, projected through
    ``lsi`` and scored against ``index``. Each match is printed as
    ``title (position, score)``; ``position`` is the match's position in the
    similarity result and is also the key used to look the job up in ``jobs``.

    Parameters
    ----------
    doc : str
        Query text, e.g. a job description or a few keywords.
    topn : int, optional
        Maximum number of matches to print. Defaults to 10, the value that
        was previously hard-coded.

    Returns
    -------
    None
        The normalised query and the matches are printed, not returned.
    """
    # Newlines are turned into spaces (not deleted) so that the last word of
    # one line is not fused with the first word of the next line.
    cleanup = str.maketrans({",": None, ".": None, ";": None, "\\": None, "\n": " "})
    doc = doc.lower().translate(cleanup)
    print("Using string: {0}\n".format(doc))

    vec_bow = dictionary.doc2bow(doc.split())
    if not vec_bow:
        # An empty bag-of-words means no query token matched the dictionary,
        # so there is nothing meaningful to rank.
        print("No words from the query were found in the dictionary.")
        return

    vec_lsi = lsi[vec_bow]  # convert the query to LSI space
    sims = index[vec_lsi]  # perform a similarity query against the corpus

    # Highest score first; ties keep their original (position) order.
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)

    # Print out the best `topn` jobs
    for idx, score in ranked[:topn]:
        print(jobs[idx]['title'], (idx, score))

In [50]:
# Test example: query with a multi-sentence occupation description.
# lookupJob lower-cases the text and strips punctuation before matching.
doc = "Apply principles of psychology to human resources, administration, management, sales, and marketing problems. Activities may include policy planning; employee testing and selection, training and development; and organizational development and analysis. May work with management to organize the work setting to improve worker productivity"
lookupJob(doc)

Using string: apply principles of psychology to human resources administration management sales and marketing problems activities may include policy planning employee testing and selection training and development and organizational development and analysis may work with management to organize the work setting to improve worker productivity

Wind Energy Operations Managers (56, 0.70923334)
Geothermal Technicians (808, 0.66917473)
Electrical and Electronic Equipment Assemblers (812, 0.63415909)
Mechatronics Engineers (178, 0.61916822)
Legislators (3, 0.6114952)
Aircraft Structure, Surfaces, Rigging, and Systems Assemblers (810, 0.6059534)
Nanosystems Engineers (182, 0.59332711)
Microsystems Engineers (179, 0.59007519)
Wind Energy Project Managers (57, 0.58043796)
Licensed Practical and Licensed Vocational Nurses (472, 0.57668948)


In [51]:
# Query with a single task-description sentence.
lookupJob("Identify relationships and trends in data, as well as any factors that could affect the results of research.")

Using string: identify relationships and trends in data as well as any factors that could affect the results of research

Forging Machine Setters, Operators, and Tenders, Metal and Plastic (829, 0.58304089)
Metal-Refining Furnace Operators and Tenders (837, 0.56364113)
Neurodiagnostic Technologists (477, 0.55993015)
Locomotive Firers (938, 0.53493756)
Nuclear Power Reactor Operators (875, 0.52498174)
Cutting, Punching, and Press Machine Setters, Operators, and Tenders, Metal and Plastic (831, 0.5237698)
Genetic Counselors (484, 0.52286202)
Rail Yard Engineers, Dinkey Operators, and Hostlers (939, 0.52191412)
Technical Writers (389, 0.4916237)
Bus Drivers, Transit and Intercity (931, 0.48261553)


In [52]:
# Single-keyword query.
lookupJob("psychology")

Using string: psychology

Loading Machine Operators, Underground Mining (961, 0.47219452)
Cutters and Trimmers, Hand (892, 0.45078647)
Commercial Pilots (926, 0.43757814)
Dental Laboratory Technicians (900, 0.43139383)
Tax Examiners and Collectors, and Revenue Agents (104, 0.42868987)
Pilots, Ship (946, 0.42638388)
Environmental Science Teachers, Postsecondary (306, 0.41572848)
Sustainability Specialists (89, 0.41425595)
Power Distributors and Dispatchers (876, 0.41378814)
Transportation Attendants, Except Flight Attendants (956, 0.40769976)


In [53]:
# Comma-separated keywords; lookupJob removes the commas and splits on whitespace.
lookupJob("Equilibrium, dance, music")

Using string: equilibrium dance music

Loading Machine Operators, Underground Mining (961, 0.46580502)
Excavating and Loading Machine and Dragline Operators (960, 0.38809878)
First-Line Supervisors of Agricultural Crop and Horticultural Workers (679, 0.33631477)
Tax Examiners and Collectors, and Revenue Agents (104, 0.31844962)
Medical Transcriptionists (498, 0.29677024)
Neurologists (418, 0.28606406)
Cutting and Slicing Machine Setters, Operators, and Tenders (893, 0.27920157)
Aviation Inspectors (953, 0.26303357)
Prosthodontists (404, 0.26291731)
Chemistry Teachers, Postsecondary (305, 0.25712022)


In [54]:
# Three comma-separated keywords.
lookupJob("oracle, database, statistics")

Using string: oracle database statistics

Cutting, Punching, and Press Machine Setters, Operators, and Tenders, Metal and Plastic (831, 0.83310312)
Rolling Machine Setters, Operators, and Tenders, Metal and Plastic (830, 0.70815575)
Drilling and Boring Machine Tool Setters, Operators, and Tenders, Metal and Plastic (832, 0.66223824)
Cost Estimators (72, 0.64837426)
Dancers (377, 0.59044701)
Loss Prevention Managers (55, 0.58863389)
Coroners (70, 0.58417118)
Grinding, Lapping, Polishing, and Buffing Machine Tool Setters, Operators, and Tenders, Metal and Plastic (833, 0.54550165)
Electronic Equipment Installers and Repairers, Motor Vehicles (765, 0.53507018)
Electronic Drafters (187, 0.52483523)


In [55]:
# Single-keyword query.
lookupJob("metal")

Using string: metal

Food Servers, Nonrestaurant (546, 0.61314029)
Cooks, Private Household (537, 0.50175118)
Cytogenetic Technologists (454, 0.45860389)
Dining Room and Cafeteria Attendants and Bartender Helpers (547, 0.44428283)
Payroll and Timekeeping Clerks (622, 0.44101113)
Patient Representatives (633, 0.43136948)
Control and Valve Installers and Repairers, Except Mechanical Door (784, 0.42560786)
Heating and Air Conditioning Mechanics and Installers (785, 0.39219716)
Industrial Safety and Health Engineers (160, 0.39155632)
Healthcare Social Workers (280, 0.38998443)


In [56]:
%timeit
lookupJob("Sales")

Using string: sales

Computer-Controlled Machine Tool Operators, Metal and Plastic (826, 0.53602034)
Musical Instrument Repairers and Tuners (796, 0.52137071)
Cashiers (592, 0.48826355)
Insurance Policy Processing Clerks (670, 0.37978566)
Architectural Drafters (185, 0.36035258)
Clinical Psychologists (246, 0.3482472)
Low Vision Therapists, Orientation and Mobility Specialists, and Vision Rehabilitation Therapists (431, 0.3471368)
Aviation Inspectors (953, 0.33949438)
Immigration and Customs Inspectors (518, 0.33857062)
Farmworkers and Laborers, Crop (686, 0.33669969)


In [57]:
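# Sample job posting (free text) used as a long, multi-paragraph query.
# NOTE: the variable name `random` would shadow the standard-library module of the same name if it were imported in this notebook.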

random = """About Us 
Amazon Aurora is an exciting new area of innovation for AWS, and the PostgreSQL-compatible edition of Amazon Aurora is the newest part of it. The PostgreSQL-compatible version of Amazon Aurora is a new relational database which offers enterprise-class performance, availability and durability - all at open source prices - to our customers, along with the management benefits of RDS. We’re a relatively new team in Database Services, one of the fastest growing businesses within Amazon Web Services. We are spread across Seattle, the Bay Area, and Boston. Not only do we have deep database and systems programming problems to solve for our customers, but we also minimize the effort required to maintain SQL databases by automating administrative tasks like backup / restore, scaling CPU / RAM / storage and replication / failover for high availability, allowing our customers to spend their valuable time focused on building their businesses. 

We have challenging problems to solve in distributed systems, concurrency, database internals, languages, and performance engineering. We intend to be the world's best and fastest database - come join us on the journey! 

For more information about RDS, please visit http://aws.amazon.com/rds . For more information about Aurora, please visit http://aws.amazon.com/rds/aurora . 

The Role 
This position is on an exciting new team building a highly available, very scalable, high performance database offering. We have the need for low-level systems C programmers, mid-tier high performance C++ developers, and distributed systems Java wizards. 

About You 
You’ve built a lot of software – shipped products, created platforms, tools and modules, perhaps worked with every framework under the sun, perhaps made everything you have touched fast as lightning. You may have built big, distributed, API-driven systems with thousands of users, or systems for billions of transactions. You’re as excited as we are about learning every day, and solving really hard engineering problems that no one else is. 

Passionate about software quality, repeatability, testability and maintainability, you are known to your co-workers as the go-to person for answers to questions that begin with the phrase “what’s the best way to…” – even though your answers tend to start with “help me understand what you’re trying to accomplish by…”. When you need clarity, you go to the product owner – or even a user – and talk to them until you get it. With stakeholders and other team members you are diplomatic, persuasive, and usually right. You understand the challenges associated with operating a large-scale system in production, and your designs and implementations reflect that understanding. 

When you’re wrong, you’re happy to learn something. You build software quickly, but properly (so that you don’t have to go back to it later). You know what “unit tests” are and don’t implement software without them. Shipping on time with high quality makes your heart feel warm. 

Who are you? You’re the new Software Development Engineer joining us at RDS. We work hard, have fun and make history – want to come play with us? 

What You’ll Do 

Deliver project items on-time / in-spec, communicating clearly with leads, manager and stakeholders 
Contribute to software and database architecture / design 
Contribute to and lead architecture / design conversations and code reviews (yours and other team members’) 
Work with managers and team members to estimate effort and clarify / negotiate / document scope and design 
Contribute to design, architecture, process and development standards 
Effectively mentor more junior team members, helping to maintain appropriate unit test coverage, code documentation, software structure and supportability 
Distil and communicate technical concepts to more junior developers and stakeholders 
Advise on courses of study for team members and / or self 
Argue for the right outcomes with data, conviction and diplomacy 
Identify, evaluate and suggest mitigation strategies for risks during design 
Design and implement features for new and existing products, features, APIs, platforms and frameworks 

Basic Qualifications 

Things We Like About You 

Your written and spoken English are excellent 

You’ve been developing software since you could tie your shoes or for more than 3 years, whichever is longer 

You are experienced with more than one of Java, C, C++ 

You are experienced with MySQL, PostgreSQL, Oracle, MS SQL or another 

You understand networking, network programming, network-oriented design patterns, distributed computing and best practices related to same 

You have excellent organizational, prioritization and time management skills 

You are familiar and comfortable with rapidly-evolving Agile development environments 

You have an understanding of and experience with common bug / task tracking, requirements tracking, traceability and test automation tools 

You have an ability to rapidly absorb and comprehend software and systems 

Fine Print 
The successful applicant will have a minimum of 3 years experience in software development, with at least 1 year in Enterprise / distributed systems. Post-secondary education and / or industry certification are both assets. 

Preferred Qualifications 

Desirable experience (i.e. “Stuff that will impress us”):
Linux / UNIX system experience 

Developing in extremely busy, highly scalable, highly available mission-critical distributed environments 

Deep experience with SQL / NoSQL databases 
Extensive software / database architecture 
Hands-on experience with AWS APIs and services 
Tags: Databases, Postgres, PostgreSQL, Concurrency, Systems Programming, Storage, Availability, Durability, Performance, Internals, Oracle, SQL Server """

In [58]:
# Query with the full job-posting text stored in `random`.
lookupJob(random)

Using string: about us amazon aurora is an exciting new area of innovation for aws and the postgresql-compatible edition of amazon aurora is the newest part of it the postgresql-compatible version of amazon aurora is a new relational database which offers enterprise-class performance availability and durability - all at open source prices - to our customers along with the management benefits of rds we’re a relatively new team in database services one of the fastest growing businesses within amazon web services we are spread across seattle the bay area and boston not only do we have deep database and systems programming problems to solve for our customers but we also minimize the effort required to maintain sql databases by automating administrative tasks like backup / restore scaling cpu / ram / storage and replication / failover for high availability allowing our customers to spend their valuable time focused on building their businesses we have challenging problems to solve in distri