In [2]:
from collections import Counter # Keep track of our term counts
import pandas as pd # For converting results to a dataframe and bar chart plots
from pymongo import MongoClient
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [3]:
client = MongoClient()
db = client.dsbc
job_descs = db.job_descriptions
job_descs_adj = db.job_descs3

In [5]:
pipeline = [{"$project" : {"description": "$description", "_id": 0}},
            {"$unwind" : "$description"},
            {"$group" : {"_id" : "$description", "count" : {"$sum": 1}}},
            {"$sort" : {"count" : -1}},
            {"$limit" : 500}]

In [6]:
# find most common words

results = job_descs.aggregate(pipeline)

for result in results:
    print(result["_id"] + ": " + str(result["count"]))

data: 30450
job: 30441
experience: 24015
research: 19586
jobs: 17077
work: 15764
development: 14925
new: 14304
apply: 13137
us: 12025
business: 11923
information: 11599
management: 11217
team: 10933
search: 10203
support: 9719
skills: 9403
company: 8904
years: 8797
software: 8740
analysis: 8617
engineering: 8550
services: 8490
please: 8445
scientist: 8316
ability: 8035
technology: 7886
position: 7573
technical: 7560
including: 7513
careers: 7239
email: 7191
science: 7177
time: 7130
clinical: 6995
product: 6638
working: 6584
knowledge: 6558
design: 6501
opportunity: 6443
systems: 6324
related: 6319
application: 6310
contact: 6177
one: 6104
required: 6045
location: 5986
medical: 5972
service: 5970
degree: 5907
must: 5755
solutions: 5579
health: 5578
employment: 5575
united: 5337
strong: 5265
use: 5265
2015: 5218
may: 5209
project: 5200
1: 5096
provide: 5075
process: 5069
full: 5027
statistical: 5025
requirements: 5015
senior: 4956
global: 4930
resume: 4806
analyst: 4749
sign: 4747
analyt

In [7]:
years = job_descs.find({"location": "New York, NY", "description": {"$in": ["years"]}})

In [8]:
for i in range(200):
    index = years[i]["description"].index("years")
    print(years[i]["description"][index-1:index + 6])

[u'6+', u'years', u'experience', u'related', u'discipline', u'experience', u'phases']
[u'3+', u'years', u'related', u'industry', u'experience', u'machine', u'learning']
[u'3', u'years', u'industry', u'experience', u'md', u'phd', u'advanced']
[u'5', u'years', u'experience', u'big', u'data', u'analytics.', u'masters']
[u'field5+', u'years', u'experience', u'leading', u'analytics', u'data', u'science']
[u'4', u'years', u'relevant', u'experience', u'master', u'degree', u'approved']
[u'2', u'years', u'experience', u'working', u'media', u'tv', u'digital']
[u'4', u'years', u'relevant', u'experience', u'track', u'record', u'financial']
[u'eight', u'years', u'experience', u'quantitative', u'analysis', u'development', u'validation']
[u'three', u'years', u'related', u'experience', u'equivalent', u'combination', u'relevant']
[u'4', u'years', u'experience', u'supporting', u'growing', u'reliable', u'scalable']
[u'2', u'years', u'9', u'days', u'ago', u'save', u'job']
[u'5', u'years', u'experience', u

In [269]:
jobs = job_descs.find()

In [40]:
title_prefixes = ["data", "scien", "stat", "analyst", "business", "develop", "senior", "sr", "analytics", "engineer", 
                  "quantitative", "research", "software", "programm"]

for prefix in title_prefixes:
    jobs = job_descs.find()
    print prefix + str(round(sum([prefix in job["title"].lower() for job in jobs])/float(jobs.count()), 2))

data0.21
scien0.37
stat0.07
analyst0.19
business0.02
develop0.07
senior0.14
sr0.06
analytics0.03
engineer0.15
quantitative0.03
research0.2
software0.06
programm0.02


In [48]:
years = job_descs.find({"description": {"$in": ["years"]}})

In [208]:
# eliminate duplicates using aggregate function and add to new collection

pipeline = [{"$project" : {"description": 1, "title" : 1, "location" : 1, "_id": 0}},
            {"$group" : {"_id" : "$description", 
                         "title" : {"$first": "$title"}, 
                         "location" : {"$first": "$location"}
                        }}]

results = job_descs.aggregate(pipeline)

for result in results:
    doc = {"description" : result["_id"], "title" : result["title"], "location" : result["location"]}
    job_descs_adj.insert(doc)

In [216]:
# remove short erroneous job descriptions

for job in job_descs_adj.find():
    if len(job["description"]) < 20:
        print job["description"]

[u'apply', u'ebay', u'senior', u'data', u'scientist', u'job', u'data', u'science', u'analytics', u'king', u'prussia', u'pennsylvania']
[u'glass', u'scientist', u'photovoltaics', u'bewerben', u'sie', u'sich', u'auf', u'die', u'gewhlte', u'stellenausschreibung.', u'bewerben']
[u'apply', u'express', u'scripts', u'ubc', u'principle', u'statistician', u'observational', u'studies', u'job', u'research', u'clinical', u'services', u'blue', u'bell', u'pennsylvania']
[u'recruit', u'wizard', u'longer', u'accepting', u'applications', u'position.']
[u'jobing', u'browser', u'javascript', u'enabled.we', u'unable', u'automatically', u'redirect', u'job.please', u'click', u'link', u'continue.http', u'sandiego.jobing.com', u'job', u'details2.asp', u'jobid', u'5374356']
[]
[u'comcast', u'research', u'analyst', u'please', u'wait......................', u'.']
[u'rsna', u'career', u'connect', u'online', u'medical', u'jobs.', u'apply', u'login', u'register', u'account', u'recovery']
[u'apply', u'life', u'time'

In [217]:
pipeline = [{"$project" : {"description": "$description", "_id": 0}},
            {"$unwind" : "$description"},
            {"$group" : {"_id" : "$description", "count" : {"$sum": 1}}},
            {"$sort" : {"count" : -1}},
            {"$limit" : 500}]

# find most common words

results = job_descs_adj.aggregate(pipeline)

for result in results:
    print(result["_id"] + ": " + str(result["count"]))

data: 21853
job: 20403
experience: 16973
research: 13491
jobs: 12003
work: 10945
development: 10191
new: 9937
apply: 9284
business: 8143
information: 7985
us: 7861
management: 7641
team: 7478
support: 6604
search: 6584
skills: 6560
years: 6305
analysis: 6036
software: 6008
please: 5846
services: 5842
scientist: 5832
company: 5754
ability: 5630
position: 5383
technology: 5300
technical: 5246
engineering: 5165
including: 5156
email: 5096
science: 5087
careers: 4912
clinical: 4793
time: 4761
working: 4695
knowledge: 4610
product: 4568
application: 4562
design: 4548
related: 4422
opportunity: 4376
required: 4326
systems: 4290
degree: 4240
one: 4203
contact: 4139
medical: 4092
health: 4055
must: 4030
employment: 3844
solutions: 3839
location: 3812
project: 3705
strong: 3633
requirements: 3557
2015: 3544
service: 3534
statistical: 3521
resume: 3506
may: 3476
senior: 3459
use: 3457
analyst: 3385
process: 3363
city: 3336
provide: 3326
united: 3269
analytics: 3255
marketing: 3206
full: 3177
1: 

In [266]:
skills = ["python", "r", "d3", "hadoop", "agile", "machine", "statistics", "programming", "client", "communication"]

for skill in skills:
    result = job_descs_adj.find({"description": {"$in": [skill]}})
    print skill + ": " + str(result.count()/float(job_descs.count()))

python: 0.127135256231
r: 0.153738448614
d3: 0.00700084010081
hadoop: 0.069028283394
agile: 0.0505460655279
machine: 0.105432651918
statistics: 0.149957994959
programming: 0.142957154859
client: 0.107392887146
communication: 0.296135536264


In [268]:
db.job_descs2.count()

4376

In [241]:
from sklearn.feature_extraction.text import CountVectorizer

In [244]:
CountVectorizer.__doc__

u'Convert a collection of text documents to a matrix of token counts\n\n    This implementation produces a sparse representation of the counts using\n    scipy.sparse.coo_matrix.\n\n    If you do not provide an a-priori dictionary and you do not use an analyzer\n    that does some kind of feature selection then the number of features will\n    be equal to the vocabulary size found by analyzing the data.\n\n    Parameters\n    ----------\n    input : string {\'filename\', \'file\', \'content\'}\n        If \'filename\', the sequence passed as an argument to fit is\n        expected to be a list of filenames that need reading to fetch\n        the raw content to analyze.\n\n        If \'file\', the sequence items must have a \'read\' method (file-like\n        object) that is called to fetch the bytes in memory.\n\n        Otherwise the input is expected to be the sequence strings or\n        bytes items are expected to be analyzed directly.\n\n    encoding : string, \'utf-8\' by default

In [249]:
cv = CountVectorizer(ngram_range=(1,2), min_df = 0)
 
corpus = ["hello my name is daniel shin", "hello my name is sam seo"]
for text in corpus:
    print text
print
 
print cv.fit_transform(corpus).toarray()
 
for w in cv.get_feature_names():
    print w

hello my name is daniel shin
hello my name is sam seo

[[1 1 1 1 1 1 0 1 1 1 1 0 0 0 1]
 [0 0 1 1 1 0 1 1 1 1 1 1 1 1 0]]
daniel
daniel shin
hello
hello my
is
is daniel
is sam
my
my name
name
name is
sam
sam seo
seo
shin


In [272]:
db.job_descs3.count()

25

In [1]:
import math
math.log(.00001)

-11.512925464970229