In [2]:
from os import listdir, path
import pandas as pd
import time
import re
from nltk import *
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.corpus import stopwords
from IPython.display import clear_output
import numpy
import matplotlib.pyplot as plt


In [3]:
# Keep an already-loaded `df` when this cell is re-run; otherwise create an empty
# placeholder with the expected posting columns. `df` must exist before the function
# definitions below run, because `job_post_info` uses it as a default argument value.
try:
    df
except NameError:
    # Only "df is not defined yet" should trigger the fallback; a bare `except:` would
    # also hide unrelated failures such as a kernel interrupt.
    df = pd.DataFrame(columns=[
        'posting_id', 'search_title', 'search_location', 'location', 'title',
        'company', 'posted', 'salary', 'summary', 'link', 'description',
    ])

def sort_jobs(df, posting_number):
    """Prompt the user to label a posting and record its index in the matching list.

    The answer is compared case-insensitively, ignoring surrounding whitespace:

    * ``yes`` / ``yeah`` / ``y``    -> appended to the global ``yes`` list; the posting's
      link is printed.
    * ``no`` / ``nah`` / ``n``      -> appended to the global ``no`` list.
    * ``maybe`` / ``m`` / ``other`` / ``o`` -> appended to the global ``other`` list.
    * ``underqualified`` / ``u``    -> appended to the global ``underqualified`` list.
    * ``ty`` / ``tn``               -> appended to the global ``yes_test`` / ``no_test``
      lists, which are created on first use if they do not exist yet.

    Any other answer records nothing and prints a notice.

    Parameters
    ----------
    df : object exposing ``link`` indexable by ``posting_number``
        Only read to print the link of a posting answered with "yes".
    posting_number : index label of the posting being labelled.
    """
    job_status = input('Is this a job you would want to apply to? (y/n/u/o) \n(u is yes but underqualified, o is none of the above.)')
    answer = job_status.strip().lower()
    if answer in ('yes', 'yeah', 'y'):
        yes.append(posting_number)
        print(df.link[posting_number])
    elif answer in ('no', 'nah', 'n'):
        no.append(posting_number)
    elif answer in ('maybe', 'm', 'other', 'o'):
        other.append(posting_number)
    elif answer in ('underqualified', 'u'):
        underqualified.append(posting_number)
    elif answer == 'ty':
        # The test containers are not part of the regular reset cell, so create them
        # lazily instead of failing with a NameError.
        globals().setdefault('yes_test', []).append(posting_number)
    elif answer == 'tn':
        globals().setdefault('no_test', []).append(posting_number)
    else:
        # Make a typo visible instead of silently leaving the posting unlabelled.
        print("Unrecognized answer", repr(job_status), "- posting", posting_number, "was not recorded.")

def _top_ngrams(finder, measures, n_results_per):
    """Return the set of n-grams that `finder` ranks in the top `n_results_per` under
    likelihood ratio, raw frequency, or chi-squared."""
    ranked = (
        finder.nbest(measures.likelihood_ratio, n_results_per)
        + finder.nbest(measures.raw_freq, n_results_per)
        + finder.nbest(measures.chi_sq, n_results_per)
    )
    return set(ranked)


def job_post_info(posting_number: int, df = df, n_results_per: int = 10, sorting: bool = True):
    '''
        Print important bigrams and trigrams and the job description
        0. Assemble stopword set (English stopwords plus the words of the company name)
        1. Tokenize Sentences
        2. Remove sentences containing equal-opportunity / diversity boilerplate phrases
        3. Tokenize words from remaining sentences
        4. Remove stopwords, tokens that are neither alphabetic nor numeric, and
           tokens that are 2 or 3 characters long
        5. Print collection of important bigrams and trigrams from remaining words
        6. Print the entire job description
        7. If we are sorting:
            7a. Prompt user to determine whether it's a yes, a no, a job the user is
                underqualified for, or other
            7b. Depending on user's response append post index to appropriate list

        Parameters
        ----------
        posting_number : index label of the posting inside `df`.
        df : frame with `company`, `description`, `title` and `location` columns
            (and `link`, read by `sort_jobs`). NOTE: the default is whatever `df` was
            when this definition was executed, not the current global, so pass the
            frame explicitly.
        n_results_per : how many n-grams to take from each association measure.
        sorting : when True, hand over to `sort_jobs` to record the user's label.
    '''
    # Stopwords. A posting may have no company name (NaN); only a real string can be
    # tokenized, otherwise no company-specific stopwords are added.
    company = df.company[posting_number]
    company_tokens = word_tokenize(company.lower()) if isinstance(company, str) else []
    stopset = set(stopwords.words('english') + company_tokens)

    # Tokenize and remove sentences
    excluded_phrases = ('equal opportunity', 'not discriminate', 'diversity', 'affirmative action', 'gender', 'eoe')
    sentences = [
        sentence
        for sentence in sent_tokenize(df.description[posting_number])
        if not any(phrase in sentence.lower() for phrase in excluded_phrases)
    ]

    # Tokenize words
    words = [
        word
        for sentence in sentences
        for word in word_tokenize(sentence.lower())
        if word not in stopset
    ]
    # Keep alphabetic/numeric tokens that are a single character or longer than three.
    words = [
        word for word in words
        if (word.isalpha() or word.isnumeric()) and len(word) not in (2, 3)
    ]

    # Bigrams and Trigrams
    bcf = BigramCollocationFinder.from_words(words)
    tcf = TrigramCollocationFinder.from_words(words)
    # Print company and job title
    print(df.company[posting_number])
    print(df.title[posting_number])
    print("Location:", df.location[posting_number], "\n")
    # print bigrams
    print('Bigrams')
    print(_top_ngrams(bcf, BigramAssocMeasures, n_results_per), "\n")
    # Print Trigrams
    print('Trigrams')
    print(_top_ngrams(tcf, TrigramAssocMeasures, n_results_per), "\n")
    # Print Description
    print(df.description[posting_number])
    # If we are sorting them to train an algo
    if sorting:
        sort_jobs(df, posting_number)


## Initialize empty job index containers.
These hold the index positions of the jobs in the data frame. Later they are used to form the yes, no, and underqualified labels for training an algorithm.

In [4]:
##################################################################
#                                                                #
# MAKE SURE PREVIOUS DATA HAS BEEN WRITTEN BEFORE CLEARING !!!!! #
#                                                                #
##################################################################
# One list of posting index labels per answer that sort_jobs can record.
yes = []
no = []
underqualified = []
other = []
# sort_jobs also appends to these two (answers 'ty' / 'tn'); they must exist as well.
yes_test = []
no_test = []

## Import job post data frame

In [7]:
# List the files in the shared drive to see which ones we want to load.
# os.listdir returns entries in arbitrary order, so sort them by name to get a
# stable, readable listing.
sorted(listdir('Z:/data/'))

['0rawdata',
 '2020-11-11_recentjobs.tsv',
 '2021-01-18_monster.csv',
 '2021-01-25_monsterScraped.csv',
 '2021-01-26_monsterScraped.csv',
 '2021-01-27_monsterScraped.csv',
 '2021-01-31_monsterScraped.csv',
 '2021-02-04_monsterScraped.csv',
 '2021-02-05_monsterScraped.csv',
 '2021-02-09_monster.csv',
 '2021-02-09_monsterScraped.csv',
 '2021-02-10_indeed.csv',
 '2021-02-11_indeed.csv',
 '2021-02-11_monsterScraped.csv',
 '2021-02-12_indeed.csv',
 '2021-02-13_indeed.csv',
 '2021-02-16_indeed.csv',
 '2021-02-17_indeed.csv',
 '2021-02-17_monsterScraped.csv',
 '2021-02-18_monster.csv',
 '2021-02-20_indeed.csv',
 '2021-02-21_indeed.csv',
 '2021-03-07_indeed.csv',
 '2021-03-07_monster.csv']

In [8]:
#########################################################
##########################################################
############### THIS CELL IS FOR RAW DATA ONLY!!!! ###########
#####################################################
# Import raw files and clean them up

# Set how many of the most recent files to include
last_n_files = 2

# Get the files from the shared directory
shared_drive = 'Z:/data/0rawdata/'

# os.listdir gives no ordering guarantee and also returns sub-directories, so keep
# regular files only and sort by name before taking the last `last_n_files` entries.
# NOTE(review): "most recent" then relies on file names starting with the date - confirm
# that holds for this directory.
shared_files = sorted(
    name for name in listdir(shared_drive)
    if path.isfile(path.join(shared_drive, name))
)[-last_n_files:]

# Read every file first and concatenate once instead of growing a frame in the loop.
frames = []
for our_file in shared_files:
    print("Loading", our_file)
    frames.append(pd.read_csv(path.join(shared_drive, our_file)))
df_0 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print("Data Loaded")

Loading 2021-03-07_indeed.csv
Loading 2021-03-07_monster.csv
Data Loaded


In [9]:
#######################################################################################################
# Import most recent scraped file from the shared drive then clean the data so we can look through it #
#######################################################################################################

# Set how many of the most recent files to include
last_n_files = 2

# Get the files from the shared directory
shared_drive = 'Z:/data/'

# os.listdir gives no ordering guarantee and also returns sub-directories (this folder
# contains '0rawdata'), so keep regular files only and sort by name; with the
# date-prefixed file names the last `last_n_files` entries are then the newest ones.
shared_files = sorted(
    name for name in listdir(shared_drive)
    if path.isfile(path.join(shared_drive, name))
)[-last_n_files:]

# Read every file first and concatenate once instead of growing a frame in the loop.
frames = []
for our_file in shared_files:
    print("Loading", our_file)
    frames.append(pd.read_csv(path.join(shared_drive, our_file)))
df_0 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print("Data Loaded")


Loading 2021-03-07_indeed.csv
Loading 2021-03-07_monster.csv
Data Loaded


In [14]:
# Drop the stray "Unnamed: 0" column (the name pandas gives a header-less column, e.g. a
# saved index). The mask is built from the lower-cased names so that it matches the
# same columns as the case-insensitive check; comparing the raw names against the
# lower-case literal never matched "Unnamed: 0" and left the column in place.
is_index_col = df_0.columns.str.lower() == "unnamed: 0"
if is_index_col.any():
    df1 = df_0.loc[:, ~is_index_col]
else:
    df1 = df_0
df1.columns = df1.columns.str.lower()

# One row per distinct description; copy so the column assignments below act on an
# independent frame.
df1 = df1.drop_duplicates(subset=['description'], keep='first').copy()
# Collapse "A/R" before slashes are turned into spaces further down.
df1['description'] = df1['description'].str.replace('A/R', 'AR', regex=False)
df1['posted'] = pd.to_numeric(df1['posted'], errors='coerce')

# Replace line breaks, slashes and commas with spaces in the first five columns and in
# the last column, whose position depends on whether the frame has 11 columns or not.
last_text_col = 10 if df1.shape[1] == 11 else 9
for col in df1.columns[list(range(5)) + [last_text_col]]:
    cleaned = df1[col]
    for char in ('\n', '\r', '/', ','):
        cleaned = cleaned.str.replace(char, ' ', regex=False)
    df1[col] = cleaned

# These two patterns are regular expressions, so say so explicitly rather than relying
# on the version-dependent default of `regex`.
# NOTE(review): as written, the second replace appears to strip exactly the " \." that
# the first one inserts between a lower-case and an upper-case letter, which leaves the
# text unchanged - confirm what separator was intended here.
df1['description'] = (
    df1['description']
    .str.replace(r"([a-z])([A-Z])", r"\1 \.\2", regex=True)
    .str.replace(r' \\.', '', regex=True)
)
df1['description'] = df1['description'].str.replace('Description', '', regex=False)
df1 = df1.dropna(subset=['description'])
df1.sort_values('posted')

Unnamed: 0,search_title,search_location,location,title,company,posted,salary,summary,link,description
456,data scientist,New York NY,New York NY,VP Engineering,Away,0,,,https://job-openings.monster.com/vp-engineerin...,Company is seeking a forward-thinking VP of En...
1154,data,Morristown NJ,Rahway NJ,HR Business Partner - Biotech,Nesco Resource LLC,0,,"Job Typefulltime, contract",https://job-openings.monster.com/hr-business-p...,Education: • Minimum of a Bachelor degree in ...
1538,data,remote,Remote TX,VP Global Sales Engineering,Actian,0,,,https://job-openings.monster.com/vp-global-sal...,Company is looking for a Vice President of Glo...
553,business analyst,New York NY,New York NY,Financial Technical Analyst,Yoh A Day & Zimmermann Company,0,,,https://job-openings.monster.com/financial-tec...,Right to Hire Location: NY Boston Houston R...
1167,data analyst,Morristown NJ,Franklin Lakes NJ,Microstrategy Consultant,Mindlance,0,,,https://job-openings.monster.com/microstrategy...,Position :Microstrategy ConsultantLocation : F...
...,...,...,...,...,...,...,...,...,...,...
521,business analyst,New York NY,New York NY,Senior Business Analyst,Eliassen Group,5,,"Job Typefulltime, employee",https://job-openings.monster.com/senior-busine...,Job Our industry-leading financial services c...
527,business analyst,New York NY,New York City NY,Business Analyst,Apex Systems,5,,"Job Typefulltime, employee",https://job-openings.monster.com/business-anal...,Apex Systems combines with parent company On A...
528,business analyst,New York NY,Newark NJ,Project - Business Systems Analyst,PSEG,5,,"Job Typefulltime, employee",https://job-openings.monster.com/project-busin...,Requisition ID: 62441 Job Function Category:...
487,data scientist,New York NY,Newark NJ,Business Support Specialist - Customer Operations,PSEG,5,,"Job Typefulltime, employee",https://job-openings.monster.com/business-supp...,Requisition ID: 64377 Job Function Category: C...


In [17]:
# Using the cleaned data, drop what we have already sorted,
# create the sorting list from the remaining posts,
# then display that at the bottom.

df = df1.drop(yes).drop(no).drop(other).drop(underqualified)

# Lower-case once and reuse. Every `contains` passes na=False so that a posting with a
# missing title/company counts as "no match" instead of putting NaN into the mask.
title_lc = df.title.str.lower()
company_lc = df.company.str.lower()
description_lc = df.description.str.lower()

sl = df[
    # Required skills mentioned in the description
    description_lc.str.contains('python| r ', na=False)
    # Optional description / salary filters:
    # & description_lc.str.contains('scrap', na=False)
    # & description_lc.str.contains('math', na=False)
    # & description_lc.str.contains('sql', na=False)
    # & ~df.salary.str.lower().str.contains('no', na=False)
    # Excluded companies
    & ~company_lc.str.contains('jpm|cobble', na=False)
    # Entry-level titles only (add '|intern' to include internships)
    & title_lc.str.contains('assoc|junior|jr|entry', na=False)
    # ...and none of the senior / management keywords
    & ~title_lc.str.contains('lead|manager|vp|vice|senior|sr|principal|director', na=False)
    # Optional location / age filters:
    # & ~df.location.str.contains('DE|NH|PA', na=False)
    # & (df['posted'] < 31)
].sort_values('posted')
sl

Unnamed: 0,search_title,search_location,location,title,company,posted,salary,summary,link,description
20,entry level data,New York NY,New York NY,Junior Data Analyst - Consumer Team new,Balyasny,2,,Strong analytical and data processing skills (...,https://www.indeed.com/rc/clk?jk=d28e651b6effe...,We are looking for an exceptional data analyst...
191,entry level data,New York NY,New York NY,Junior Data Scientist (New York NY US),Mars Incorporated,2,,,https://job-openings.monster.com/junior-data-s...,A mutually rewarding experience.Work. Realize ...
1488,entry level data,remote,DC Remote Office (DC99) DC,Junior Business Data Analyst,102 ICF Incorporated LLC,2,,,https://job-openings.monster.com/junior-busine...,ICF is looking for a Junior Business Analyst D...
192,entry level data,New York NY,New York NY,Junior Data Analyst - Consumer Team,Balyasny Asset Management L.P. (BAM),3,,,https://job-openings.monster.com/junior-data-a...,The Job Details are as follows:OVERVIEWWe are ...
205,junior data scientist,New York NY,New York City New York United States,Junior Analytics Engineer,Signify Health,3,,,https://job-openings.monster.com/junior-analyt...,How will this role have an impact? The Analyti...
212,associate data,New York NY,Brooklyn NY,Associate Data Analytics,StrongArm Technologies,3,,,https://job-openings.monster.com/associate-dat...,POSITION OVERVIEW:As a Data Analytics Associat...
239,associate data scientist,New York NY,New York NY,Adjunct Associate Faculty Applied Analytics F...,Columbia University,3,,,https://job-openings.monster.com/adjunct-assoc...,Company Columbia University has been a leader ...
242,associate data scientist,New York NY,New York NY,Adjunct Associate Faculty Anomaly Detection (...,Columbia University,3,,,https://job-openings.monster.com/adjunct-assoc...,Company Columbia University has been a leader ...
108,junior data,New York NY,New York NY,Jr. Software Data Engineers (AWS Data Engineer...,Quintrix Solutions,4,"$55,000 - $57,000 a year",/week) of pre- employment online training that...,https://www.indeed.com/company/Quintrix-Soluti...,Quintrix is on the lookout for software data e...
636,junior data scientist,Newark NJ,New Brunswick NJ,Postdoctoral Quantitative Scientist - Junior P...,Alpha Consulting Corp.,4,,"Job Typefulltime, contract",https://job-openings.monster.com/postdoctoral-...,POSTDOCTORAL QUANTITATIVE SCIENTIST – JUNIOR P...


In [16]:
"""
If we put together a list of posts to go through:
    1. Clear previous output
    2. Show the post info and sort the job
"""

# Step through the first five shortlisted postings, one screen at a time.
for post in sl.head(5).index:
    # Wipe what the previous posting printed before showing the next one.
    clear_output()
    job_post_info(posting_number=post, df=df)
    # Wait for Enter so the output stays on screen until the user moves on.
    input()
    

Mars  Incorporated
Junior Data Scientist (New York  NY  US)
Location: New York  NY 

Bigrams
{('ambitions', 'realize'), ('years', 'experience'), ('ambition', 'makes'), ('accommodation', 'application'), ('applicant', 'penalized'), ('application', 'process'), ('ability', 'work'), ('around', 'planet'), ('also', 'provides'), ('bachelor', 'degree'), ('always', 'incorporatedmars'), ('available', 'upon'), ('almost', 'billion'), ('analytics', 'data'), ('also', 'build'), ('basic', 'probability')} 

Trigrams
{('always', 'incorporatedmars', 'business'), ('blue', 'headquartered', 'mclean'), ('applicant', 'penalized', 'result'), ('around', 'planet', 'challenging'), ('years', 'experience', 'understanding'), ('also', 'build', 'enlightened'), ('also', 'provides', 'veterinary'), ('brand', 'grab', 'everything'), ('ambitions', 'realize', 'business'), ('ability', 'work', 'large'), ('pandas', 'years', 'experience'), ('years', 'experience', 'python'), ('accommodation', 'application', 'process'), ('bachelor'

In [27]:
# Links of the postings labelled "yes" or "underqualified" so far...
links = df_0.loc[yes + underqualified, 'link']
# ...and every loaded posting whose link is not among them.
df_0[~df_0['link'].isin(links)]

Unnamed: 0,search_title,search_location,location,title,company,posted,salary,summary,link,description
0,entry level data,"New York, NY",,Data Entry Analyst (Part-Time) new,Memorandum Inc.,4,Up to $15 an hour,Querying for data and loading it into client r...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,"Data Entry Analyst (Part-Time)Memo, NYC_______..."
1,entry level data,"New York, NY",,Data Entry/ Routing new,Diamond Trading Group,5,"$30,000 - $50,000 a year","Enter Orders, Send to Warehouse, Route Orders,...",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,"Enter Orders, Send to Warehouse, Route Orders,..."
2,entry level data,"New York, NY",,Data Entry Clerk / Typist new,Counsel Press Inc.,3,,\* Provide data entry and other support for ke...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Data Entry Clerk / TypistPlease note that any ...
3,entry level data,"New York, NY",,Data Entry new,Catholic League for Religious and Civil Rights,1,,Data entry and tracking of all donations. The ...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Job Description Summary of Position: The Cath...
4,entry level data,"New York, NY","New York, NY",Jr Data Analyst – Internship/Entry Level new,CCS Global Tech,5,"$65,000 - $75,000 a year","Provides plan with data, reporting and analyse...",https://www.indeed.com/company/CCS-Global-Tech...,"Responsibilities: · Provides plan with data, r..."
...,...,...,...,...,...,...,...,...,...,...
641,data,"Trenton, NJ","Trenton, NJ 08611 (Chambersburg area)",Private Investigator - Part Time new,"Meridian Investigative Group, Inc",3,,"Ability to gather data, compile information, a...",https://www.indeed.com/rc/clk?jk=b745a4337c053...,Description: SUMMARY: Conduct various types o...
642,data,"Trenton, NJ","Trenton, NJ 08690",Principal Secondary Education new,Mercer County Technical School District,4,,Supplies forms for pupil accounting data such ...,https://www.indeed.com/rc/clk?jk=af40f34e6c8b2...,QUALIFICATIONS: Valid New Jersey Principal Cer...
643,data,"Trenton, NJ","Philadelphia, PA (University City area)",SECRETARY new,United States Postal Service,3,"$46,682 - $62,679 a year","Accesses, retrieves and/or updates files and o...",https://www.indeed.com/rc/clk?jk=d065c63c966e1...,United States Postal Service External Publicat...
644,data,"Trenton, NJ","Princeton, NJ","Intern, Worldwide Patient Safety, BCI Pharmaco...",Bristol Myers Squibb,0,,Support Clinical data management activities. W...,https://www.indeed.com/rc/clk?jk=25f58910896d4...,"At Bristol Myers Squibb, we are inspired by a ..."


In [6]:
# How many postings have been labelled in each group so far?
for label, bucket in (("yes", yes), ("no", no), ("underqualified", underqualified)):
    print(label + ":", len(bucket))

yes: 0
no: 0
underqualified: 0


In [44]:
# When finished sorting, append the labelled postings to their CSV files.
# Rows are added to whatever the files already hold, written without header or index.
our_columns = [
    # 'posting_id',
    'search_title', 'search_location', 'location', 'title',
    'company', 'salary', 'summary', 'link', 'description',
]
for bucket, target in ((yes, 'yes.csv'), (no, 'no.csv'), (underqualified, 'underqualified.csv')):
    df_0.loc[bucket, our_columns].to_csv(target, mode='a', header=False, index=False)