In [2]:
import pandas as pd
import time
import re
from nltk import *
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.corpus import stopwords
from IPython.display import clear_output
import numpy
import matplotlib.pyplot as plt


## Initialize empty job index containers.
These lists hold the index positions of the jobs in the data frame. Later they are used to form the yes, no, and underqualified labels used to train an algorithm.

In [232]:
# ------------------------------------------------------------------
# WARNING: running this cell empties every label list below.
# Make sure previously sorted postings have been written to disk
# before clearing them!
# ------------------------------------------------------------------
# One list of data-frame index positions per label.
yes, no, underqualified, other = [], [], [], []

In [150]:
def sort_jobs(df, posting_number):
    """Prompt the user to classify one posting and record its index.

    The answer is stripped and lower-cased, then matched against the
    accepted spellings of each label. `posting_number` is appended to the
    matching module-level list (`yes`, `no`, `other`, `underqualified`,
    `yes_test` or `no_test`).

    Parameters
    ----------
    df : DataFrame
        Frame holding the postings. It is only read when the answer is
        "yes", to print a link to the posting.
    posting_number :
        Index label of the posting inside `df`.

    Returns
    -------
    None
        An unrecognized answer leaves every list untouched and prints a
        notice instead of failing silently.
    """
    prompt = 'Is this a job you would want to apply to? (y/n/u/o) \nu is yes but underqualified, o is none of the above.'
    # Normalize once so stray whitespace or capitals do not drop the answer.
    answer = input(prompt).strip().lower()

    if answer in ('yes', 'yeah', 'y'):
        yes.append(posting_number)
        # Not every scrape carries a `jobid` column; fall back to the stored
        # `link` column rather than raising after the posting was recorded.
        if 'jobid' in df.columns:
            print(f"https://www.monster.com/jobs/search/?q=Data-Scientist&jobid={df.jobid[posting_number]}")
        elif 'link' in df.columns:
            print(df['link'][posting_number])
    elif answer in ('no', 'nah', 'n'):
        no.append(posting_number)
    elif answer in ('maybe', 'm', 'other', 'o'):
        other.append(posting_number)
    elif answer in ('underqualified', 'u'):
        underqualified.append(posting_number)
    elif answer == 'ty':
        # yes_test / no_test are not initialised next to the other label
        # lists in the visible notebook, so create them on first use
        # instead of raising NameError.
        globals().setdefault('yes_test', []).append(posting_number)
    elif answer == 'tn':
        globals().setdefault('no_test', []).append(posting_number)
    else:
        print(f"Unrecognized response {answer!r}; posting {posting_number} was not sorted.")

def job_post_info(posting_number: int, df = None, n_results_per: int = 10, sorting: bool = True):
    '''
        Print important bigrams and trigrams and the job description
        0. Assemble stopword set (English stopwords plus the company name tokens)
        1. Tokenize Sentences
        2. Remove sentences with particular words
        3. Tokenize words from remaining sentences
        4. Remove stopwords
        5. Print collection of important bigrams and trigrams from remaining words
        6. Print the entire job description
        7. If we are sorting:
            7a. Prompt user to determine whether it's a yes, a no, a job the user is
                underqualified for, or other
            7b. Depending on user's response append post index to appropriate list

        Parameters:
            posting_number: index label of the posting inside `df`.
            df: frame with `company`, `title`, `location` and `description`
                columns. When omitted, the module-level `df` that exists at
                call time is used.
            n_results_per: how many n-grams to request from each association
                measure before the results are merged into one set.
            sorting: when True, hand the posting to `sort_jobs` afterwards.

        Raises:
            NameError: if `df` is omitted and no module-level `df` exists.
    '''
    if df is None:
        # Look the frame up when the function is called. A `df=df` default is
        # evaluated when the `def` runs, which fails on a fresh kernel (no
        # `df` yet) and otherwise keeps pointing at a stale frame after the
        # notebook rebinds `df`.
        try:
            df = globals()['df']
        except KeyError:
            raise NameError("job_post_info needs a DataFrame: pass `df=` or define a global `df` first") from None

    # Sentences containing any of these phrases are skipped.
    excluded_phrases = ('equal opportunity', 'not discriminate', 'diversity', 'affirmative action', 'gender', 'eoe')

    stopset = set(stopwords.words('english') + word_tokenize(df['company'][posting_number].lower()))
    sentences = [
        sentence
        for sentence in sent_tokenize(df.description[posting_number])
        if not any(phrase in sentence.lower() for phrase in excluded_phrases)
    ]
    words = [word for sentence in sentences for word in word_tokenize(sentence.lower()) if word not in stopset]
    # Keep purely alphabetic or numeric tokens; tokens of length 2 or 3 are
    # dropped while single-character tokens are kept.
    words = [word for word in words if (word.isalpha() or word.isnumeric()) and (len(word) < 2 or len(word) > 3)]

    def _top_ngrams(finder, measures):
        # Union of the best n-grams under three different association measures.
        return set(
            finder.nbest(measures.likelihood_ratio, n_results_per)
            + finder.nbest(measures.raw_freq, n_results_per)
            + finder.nbest(measures.chi_sq, n_results_per)
        )

    bcf = BigramCollocationFinder.from_words(words)
    tcf = TrigramCollocationFinder.from_words(words)

    # Print company and job title
    print(df.company[posting_number])
    print(df.title[posting_number])
    print("Location:", df.location[posting_number])
    print()
    # Print bigrams
    print('Bigrams')
    print(_top_ngrams(bcf, BigramAssocMeasures))
    print()
    # Print trigrams
    print('Trigrams')
    print(_top_ngrams(tcf, TrigramAssocMeasures))
    print()
    print(df.description[posting_number])
    # If we are sorting them to train an algo, ask the user for a label
    if sorting:
        sort_jobs(df, posting_number)

def process_jobs(df):
    """Clean up the frame for easy searching.

    The frame is modified in place and also returned:
      * column names are lower-cased;
      * in every string column, newlines, carriage returns, slashes and
        commas become spaces and plus signs are removed;
      * `description` goes through the two-step lower/upper-case junction
        rewrite (see the review note in the body);
      * `posted` is lower-cased, stripped of its " day..." suffix, has
        "today" mapped to "0", and is converted to a numeric column.

    Parameters
    ----------
    df : DataFrame
        Must contain `description` and `posted` columns (any letter case).

    Returns
    -------
    DataFrame
        The same object that was passed in.
    """
    # Columns to lower
    df.columns = df.columns.str.lower()

    # Strip "\n", "\r", ",", "/" and "+". Literal replacements are requested
    # explicitly because the default of `regex` differs between pandas versions.
    for col in df.columns:
        if not pd.api.types.is_string_dtype(df[col].dtype):
            # The .str accessor is unavailable on numeric columns; leave them alone.
            continue
        cleaned = df[col]
        for char in ('\n', '\r', '/', ','):
            cleaned = cleaned.str.replace(char, ' ', regex=False)
        df[col] = cleaned.str.replace('+', '', regex=False)

    # NOTE(review): the first replace inserts ' \.' between a lowercase and an
    # uppercase letter, and the second removes ' \' plus the following
    # character, so the inserted marker is removed again. If a separator
    # between run-together words was intended, this needs revisiting.
    df.description = (
        df.description
        .str.replace(r"([a-z])([A-Z])", r"\1 \.\2", regex=True)
        .str.replace(r' \\.', '', regex=True)
    )

    posted = df.posted.str.lower().str.replace(r' day[a-z ]*', '', regex=True)
    # Series.mask avoids chained assignment, which can silently write to a copy.
    posted = posted.mask(posted == 'today', '0')
    df.posted = pd.to_numeric(posted)
    return df


## Import job post data frame

In [5]:
# Load the scraped postings. df1 is a second name for the same frame, so the
# clean-up below is visible through df_0 as well.
df_0 = pd.read_csv('data/2020-12-14_jobsmonster.tsv', sep="\t")
df1 = df_0
df1.columns = df1.columns.str.lower()

# Collapse 'A/R' first so the slash replacement below does not turn it into 'A R'.
df1.description = df1.description.str.replace('A/R', 'AR')

# Swap line breaks, slashes and commas for spaces in every column but 'posted'.
for col in df1.columns:
    if col == 'posted':
        continue
    cleaned = df1[col]
    for char in ('\n', '\r', '/', ','):
        cleaned = cleaned.str.replace(char, ' ')
    df1[col] = cleaned

# Two-step rewrite of lowercase/uppercase junctions in the description.
marked = df1.description.str.replace(r"([a-z])([A-Z])", r"\1 \.\2")
df1.description = marked.str.replace(r' \\.', '')


6911

In [206]:
# Leave out every posting that has already been sorted into a label list.
df = df1.drop(yes).drop(no).drop(other).drop(underqualified)

title_lc = df.title.str.lower()

# Titles that look junior ...
is_junior = (
    title_lc.str.contains('assoc')
    | title_lc.str.contains('junior')
    | title_lc.str.contains('jr')
    | title_lc.str.contains('entry')
)
# ... and titles that look senior.
is_senior = (
    title_lc.str.contains('lead')
    | title_lc.str.contains('manager')
    | title_lc.str.contains('vp')
    | title_lc.str.contains('vice')
    | title_lc.str.contains('senior')
    | title_lc.str.contains('sr')
    | title_lc.str.contains('principal')
)

# Optional extra conditions that can be AND-ed into the mask below:
#   df.description.str.lower().str.contains('python')
#   df.description.str.lower().str.contains('sql')
#   ~df.salary.str.lower().str.contains('no')
#   ~df.company.str.lower().str.contains('jpm')
#   ~(df.location.str.contains('DE') | df.location.str.contains('NH') | df.location.str.contains('PA'))

# Junior-looking, not senior-looking, posted fewer than 22 days ago; newest first.
df[is_junior & ~is_senior & (df['posted'] < 22)].sort_values('posted')

Unnamed: 0,search_title,search_location,location,title,company,posted,salary,summary,link,description
3918,entry level data analyst,Stamford CT,Pago Pago AS,Work from Home - Entry Level - Sales Rep,Vector Marketing,0,No Salary Provided,No Job Type Provided,https: job-openings.monster.com work-from-hom...,
4137,data analyst,Stamford CT,Edgewood NY,Client Success Analyst (JR1022200),Broadridge Financial Solutions,0,No Salary Provided,No Job Type Provided,https: job-openings.monster.com client-succes...,
3924,entry level data analyst,Stamford CT,Hawthorne NJ,Architect Jr Architect,Company Confidential,0,Salary$35000 - $50000 Per Year,Job Typefulltime employee,https: job-openings.monster.com architect-jr-...,
3907,entry level business analyst,Stamford CT,Riverside MD,Quality Control Jr. Chemist,C & A Service Inc.,0,Salary$48000 - $57000 Per Year,Job Typefulltime employee,https: job-openings.monster.com quality-contr...,
3193,associate data engineer,Philadelphia PA,Wilmington DE,Consumer & Community Banking - Risk Strategy &...,JPMorgan Chase,0,No Salary Provided,No Job Type Provided,https: job-openings.monster.com consumer-comm...,
...,...,...,...,...,...,...,...,...,...,...
4796,associate data engineer,New York,OSWEGO NY,Associate Equipment Operator - Oswego NY,Exelon,21,No Salary Provided,No Job Type Provided,https: job-openings.monster.com associate-equ...,
3158,associate data engineer,Philadelphia PA,Wilmington DE,CIMD - Marcus by Goldman Sachs - Customer Acqu...,Goldman Sachs,21,No Salary Provided,No Job Type Provided,https: job-openings.monster.com cimd-marcus-b...,
5528,associate data engineer,Pennsylvania,Allentown PA,Associate Technical Consultant,Perficient Inc.,21,No Salary Provided,No Job Type Provided,https: job-openings.monster.com associate-tec...,
3957,junior financial analyst,Stamford CT,Dallas TX,Billings Analyst - Entry Level,KIBO,21,No Salary Provided,No Job Type Provided,https: job-openings.monster.com billings-anal...,


In [190]:
# Walk through a prepared list of posting indices (`sl`). For each one:
#   1. clear the previous posting's output
#   2. show the posting's info and ask the user to sort it
for post in sl:
    clear_output()
    job_post_info(post)


The Aegis Technologies Group  Inc.
Engineering & Analysis (Junior to Mid-Level) at The Aegis Technologies Group  Inc.  
Location: Huntsville  AL 35806

Bigrams
{('begin', 'developing'), ('adapt', 'work'), ('missile', 'defense'), ('data', 'analysis'), ('back', 'original'), ('leading', 'transformation'), ('test', 'cases'), ('applications', 'experience'), ('ability', 'work'), ('across', 'space'), ('actively', 'taking'), ('activities', 'begin'), ('also', 'perform'), ('analyst', 'professionals'), ('1989', 'served'), ('become', 'familiar')}

Trigrams
{('transformation', 'modern', 'warfare'), ('leading', 'transformation', 'tomorrow'), ('analysis', 'activities', 'begin'), ('advanced', 'engineering', 'solutions'), ('become', 'familiar', 'groups'), ('analysis', 'execution', 'excellent'), ('adapt', 'work', 'schedule'), ('case', 'description', 'document'), ('missile', 'defense', 'electronic'), ('across', 'space', 'superiority'), ('develop', 'test', 'cases'), ('analyst', 'professionals', 'processin

In [230]:
# Report how many postings have been sorted into each saved label so far.
print(
    f"yes: {len(yes)}",
    f"no: {len(no)}",
    f"underqualified: {len(underqualified)}",
    sep="\n",
)

yes: 7
no: 2
underqualified: 14


In [231]:
# Run this once sorting is finished: the rows for each label are appended
# (no header, no index) to that label's TSV file.
our_columns = ['company', 'title', 'description']
for path, ids in (
    ('data/yes.tsv', yes),
    ('data/no.tsv', no),
    ('data/underqualified.tsv', underqualified),
):
    df_0.loc[ids, our_columns].to_csv(path, sep='\t', index=False, header=False, mode='a')