In [6]:
'''
IMPORT
'''
import string
import pandas as pd
import numpy as np
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import time
import re
from random import sample
import spacy
import random
import warnings

In [7]:
'''
Input type:
list

Return type:
list

Task: 
This function is passed the list of all job or course descriptions. For each description, word_tokenize is applied,
punctuation and stopwords are removed, the description is converted back into a string which is appended to final_list.
After it contains every preprocessed description, final_list is returned.
'''
def preprocess_desc(desc_list):
    stops = stopwords.words('english')
    punct = string.punctuation
    final_list = []
    
    for desc in desc_list:
        words = word_tokenize(desc)
        words = [w.lower() for w in words if w not in stops and w not in punct]
        final_str = ' '.join(words)
        final_list.append(final_str)
        
    return final_list

In [8]:
'''
Input type:
list

Return type:
list

Task:
This function takes the list of skills as input. For each skill, any parts within parenthesis are removed, the string is 
converted to lower, and any resulting white space on the right side is stripped.
'''
def preprocess_skills(a):
    new_list = []
    for skill in a:
        new_list.append(re.sub("[\(\[].*?[\)\]]", "", skill).lower().rstrip())
    return new_list

In [9]:
'''
Input type:
str, list

Return type:
Array of tuples
i.e. [ ( description, {'entities': (start_index, end_index, SKILL), ... }) , ...]

Task:
This function will generate large amounts of training data for the SpaCy model. A description string and a list of skills
are passed as input. The list of skills is iterated, and if a given skill is contained in the description, a training data
entry is created. The starting index and ending index of the skill in the description is used in the entry.


**EXAMPLE FROM SPACY DOCUMENTATION:**
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
'''
def generate_training_data(description, skills):
    temp = []

    for skill in skills:

        start_index = description.find(skill)

        if start_index != -1:

            end_index = start_index + len(skill)

            temp.append((start_index, end_index, 'SKILL'))

    spacy_training_data = tuple((description, {'entities': temp}))

    return spacy_training_data

In [10]:
'''
Input Type:
list, list

Output Type:
list

Task:
This is a helper method to take in the skill list and list of training data, then pass the training data one by one to
the main generate_training_data function. The set of training data is then returned
'''
def generate_training_data_helper(desc_list, skill_list):
    train_data = []
    
    for desc in desc_list:
        
        train_data.append(generate_training_data(desc, skill_list))
        
    return train_data

In [11]:
'''
Input type:
list

Output type:
SpaCy model

Task:
This function takes the set of training data as an input. The examples provided with the SpaCy documentation is followed
to train a Named Entity Recognition model on the training data. This model is then returned.
'''
def train_model(train_data):
    print('Training SpaCy NER Model...')
    start = time.time()
    model = spacy.blank('en')
    
    if 'ner' not in model.pipe_names:
        ner = model.create_pipe('ner')
        model.add_pipe(ner, last = True)
        
    for _, annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])
    
    other_pipes = [pipe for pipe in model.pipe_names if pipe != 'ner']
    with model.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning) 
        optimizer = model.begin_training()
        for itn in range(10):
            print('Starting iteration', itn)
            random.shuffle(train_data)
            losses = {}
            index = 0
            for text, annotations in train_data:
                try:
                    model.update(
                        [text],
                        [annotations],
                        drop = 0.2,
                        sgd = optimizer,
                        losses = losses)
                except Exception as e:
                    pass
            print(losses)
    
    end = time.time()
    print('Training the model took', (end-start)/60, 'minutes')
    return model

In [18]:
'''
Input:
SpaCy model, list

Output:
list of tuples, set


Task:
This function is passed a SpaCy model for named entity recognition and a list of job/course descriptions. Each description
is passed through the model. A list that contains each skill identified by the model is created. This entry in the 
description list is then changed into a tuple with the first element being the desc and the second being the list of 
skills in the description. All of the skills are also added to a set. The edited list of descriptions and a set containing
every identified skill are then returned.
'''
def extract_skills(model, descriptions):
    s_time = time.time()
    all_skills = set()
    for i in range(len(descriptions)):
        temp = descriptions[i]
        output = model(temp)
        skills = [ent.text for ent in output.ents]
        descriptions[i] = tuple((descriptions[i], skills))
        for element in skills:
            all_skills.add(element)
    e_time = time.time()
    print('Extracting the skills took',(e_time-s_time)/60,'minutes')
    return descriptions, all_skills

In [20]:
'''
Input:
list of tuples (desc, skills), list

Output:
list

Task:
This function takes the list of tuples output by the extract_skills function and a general list of skills as input. The 
list of tuples is iterated. For each tuple, we create a sparse vector of the same length as the all_skills input. At 
each index of the sparse vector, the value will be 0 if the skill at the corresponding index of the all_skills input was 
not identified in the description. The value will be 1 if the skill was identified. A list containing all of these
sparse vectors will be returned.
'''
def create_skill_vector(desc_skills, all_skills):
    s_time = time.time()
    final = []
    for _, skills in desc_skills:
        doc_list = []
        for a in all_skills:
            if a in skills:
                doc_list.append(1)
            else:
                doc_list.append(0)
        final.append(doc_list)
    e_time = time.time()
    print('Creating the skill vectors took',(e_time-s_time)/60,'minutes')
    return final

In [21]:
# Import the required data files
print('Importing Data...')
jobs_df_home = pd.read_json('handshake_postings.txt', lines=True) #job data
courses_df_home = pd.read_json('courses.txt', lines=True) #course data
skills_df = pd.read_csv('skills_list_master.csv')#skills data from EMSI
print('Data Imported Successfully')


# We are only concerned with the courses taught in English as we don't want non-English descriptions
courses_df_home = courses_df_home[courses_df_home['language']=='Language: English']


# Create a list out of the job descriptions
job_descriptions = jobs_df_home['description'].tolist()
# Create a list of the skills that will be used to create training data
skills_list = skills_df['name'].tolist()
# Create a list of the course descriptions
course_descriptions = courses_df_home['description'].tolist()


print('Preprocessing the data...')
t3 = time.time()
job_descriptions = preprocess_desc(job_descriptions) # preprocess job descriptions
course_descriptions = preprocess_desc(course_descriptions) # preprocess course descriptions
skills_list = preprocess_skills(skills_list) # preprocess  skills
t4 = time.time()
print('Successfully preprocessed the data. This took', t4-t3, 'seconds')


# Designate a subset of the descriptions for training data
# ONLY NECESSSARY IF A MODEL WILL BE TRAINED IN THIS RUN
#train_data_job = job_descriptions[0:6500]
#train_data_course = course_descriptions[0:3000]


# Create the training data for a SpaCy NER model
# ONLY NECESSARY IF A MODEL WILL BE TRAINED IN THIS RUN
#print('Generating Training Data...')
#t1 = time.time()
#train_data_job = generate_training_data_helper(train_data_job, skills_list)
#train_data_course = generate_training_data_helper(train_data_course, skills_list)
#t2 = time.time()
#print('Successfully Generated the Training Data. This Took', t2-t1, 'Seconds')


# Train our SpaCy NER models and save them to the disk for later use
#job_model = train_model(train_data_job)
#job_model.to_disk('job_skill_model_6500')
#course_model = train_model(train_data_course)
#course_model.to_disk('course_skill_model_3000')

'''
*** WARNING ***
THESE MODELS CAN TAKE VERY LONG TO TRAIN, MAKE SURE ALL VARIABLE NAMES IN THE 
TRAIN, TO_DISK, AND LOAD CALLS ARE CORRECT TO AVOID WASTED TIME!!!
'''

# Alternatively we can load models that we previously trained and saved
job_model = spacy.load('ner_model_6500_trained')
course_model = spacy.load('course_skill_model_3000')


# Extract the skills from all of our descriptions
# We get a list of tuples (desc, skills in desc) as well as a set of all skills extracted from this call of extract_skills
job_desc_skills, all_job_skills = extract_skills(job_model, job_descriptions) #extract skills from jos
course_desc_skills, all_course_skills = extract_skills(course_model, course_descriptions) #extract skills from courses


# Combine the two sets of skills to create the master list of all skills that have been extracted from our descriptions
combined_skills = all_job_skills.union(all_course_skills)
# Convert the set to a list so it can be indexed
combined_skills = list(combined_skills)


# Create sparse vector representations for the jobs and courses with relation to the combined_skills list
job_skill_vector = create_skill_vector(job_desc_skills, combined_skills)
course_skill_vector = create_skill_vector(course_desc_skills, combined_skills)


'''
Pooja to-do:
Implement a comparison metric for the job and course skill outputs


Each description now has a sparse vector representation with relation to combined_skills
    A description (d) has vector representation (V) defined as follows:
    V[a] = 0   --->   combined_skills[a]  not in  d
    V[a] = 1   --->   combined_skills[a]  in  d


job_skill_vector and couse_skill_vector contain the vector representations for each description in 
job_descriptions and course_descriptions respectively



**Possible Idea**
One solution that may work is to add each of the vectors in job_skill_vector and course_skill_vector to create 
two vectors defined as follows:

job_sum:
    job_sum[a] = x   --->   combined_skills[a] appears x times in our set of job descriptions

course_sum:
    course_sum[a] = x   --->   combined_skills[a] appears x times in our set of couse descriptions

From here it should be easy to apply some comparison like a cosine similarity

Note: 
Because job_skill_vector and course_skill_vector have different lengths, normalization by dividing each sum vector by
the length of the original vector may be needed. I am not sure about this however so some research is needed.

This would change the meaning of the sum vectors slightly so that:
    job_sum[a] = x   --->   combined_skills[a] appears x times per job description on average
    
    
    

Feel free to pursue your own direction with this if wanted!
'''

Successfully preprocessed the dataset. This took 116.08176445960999 seconds
Extracting the skills took 6.045414423942566 minutes
Extracting the skills took 1.3601527333259582 minutes
Creating the skill vectors took 1.0115293502807616 minutes
Creating the skill vectors took 0.21547099749247234 minutes


In [24]:
#print(job_skill_vector[1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [28]:
print(len(all_course_skills))

2773
