This notebook deals with preprocessing the dataset for the final script

In [20]:
#imports

import pandas as pd 
import nltk 
from tqdm import tqdm 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import inflect 
# inflect engine instance; no active (uncommented) code in the cells shown here calls it
p = inflect.engine()
# Dataset selector: used to build the input path '<subject>_dataset.xlsx' and the
# output path 'processed_<subject>_dataset.xlsx' under ../dataset/Excel/
subject = 'web'

In [21]:
#Load the raw dataset for the selected subject and lowercase every job description

raw_path = f'../dataset/Excel/{subject}_dataset.xlsx'
df = pd.read_excel(raw_path)
df['job_desc'] = df['job_desc'].str.lower()

In [22]:
#performs lemmatization on the job description

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#Download the corpus for lemmatization
nltk.download('wordnet')

#Create lemmatizer
lemmatizer = WordNetLemmatizer()


def _lemmatize_description(text):
    """Return `text` with every token replaced by its VERB lemma.

    Each lemma is prefixed with one space, so the result looks like
    ' word1 word2 ...' -- the same layout this cell has always produced.
    A missing description (anything that is not a string, e.g. NaN from an
    empty Excel cell) yields '' instead of crashing the tokenizer.
    """
    if not isinstance(text, str):
        return ''
    return ''.join(' ' + lemmatizer.lemmatize(token, wordnet.VERB)
                   for token in word_tokenize(text))


#Build all lemmatized JDs first and assign the column in one step.
#Writing element-by-element through `df[col].values[i] = ...` relies on `.values`
#being a writable view of the frame, which pandas Copy-on-Write no longer provides.
lemmatized_descs = [_lemmatize_description(text) for text in tqdm(df['job_desc'].values)]
df['job_desc_lemmatized'] = lemmatized_descs

[nltk_data] Downloading package wordnet to /home/pinto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 295/295 [00:00<00:00, 688.20it/s]


In [23]:
#Preview the first five rows, including the new job_desc_lemmatized column
df.head(n=5)

Unnamed: 0,experience,job_desc,key_skills,position,Doctorate :,job_desc_lemmatized
0,1 - 4 years,job description\n\nurgent hiring for php devel...,"['jQuery', 'Opencart', 'MySQL', 'Wordpress', '...",web developer,,job description urgent hire for php developer...
1,1 - 4 years,job description\nroles and responsibilities\nh...,"['Drupal', 'Html5', 'Web Technologies', 'Javas...",web developer,,job description roles and responsibilities ha...
2,1 - 3 years,job description\njob description--\n\njava scr...,"['Computer science', 'CSS', 'jQuery', 'Front e...",front end developer,,job description job description -- java scrip...
3,1 - 2 years,job description\nresponsibilities\ndevelop and...,"['Prototype', 'Illustrator', 'Illustration', '...",ui/ux designer,,job description responsibilities develop and ...
4,1 - 3 years,job description\nexperience in developing fron...,"['Css', 'Problem Solving', 'Javascript', 'Html']",angular developer,,job description experience in develop front e...


In [24]:
#Lowercase the raw job descriptions; this was already applied when the dataset was
#loaded, so the column is left unchanged here
df['job_desc'] = df['job_desc'].str.lower()

In [25]:
#Remove stopwords, punctuation, pure numbers and over-long junk tokens from the lemmatized JDs

from string import digits

#Translation table that deletes every digit and every '.' inside a kept token
removed_digits = str.maketrans('' , '' , digits+'.')
print(digits)

#Fetch the NLTK stopword corpus if it is missing, then load it as a set so that
#the per-token membership test is a hash lookup instead of a list scan
nltk.download('stopwords')
sw = set(stopwords.words('english'))

#Define the set of punctuations to be removed
#(note: '\\n' is the two-character sequence backslash + n, not a newline)
punc = {'.', ',', '/', ':', ';', '!', '@', '#', '$', '%', '&', '*', '•', '\\', '?','(', ')', '-' , '\\n', "\'" }

#Tokens of this length or longer are treated as junk: they were found to be
#several words glued together without a space
max_token_len = 15


def _strip_noise(text):
    """Return `text` without stopwords, punctuation tokens, numbers and junk tokens.

    A token is kept when it is not a stopword, not in `punc`, not made only of
    digits and shorter than `max_token_len`. Kept tokens then lose any digits
    and '.' characters; a token left empty by that step is dropped rather than
    appended as a stray space. The result has the ' word1 word2 ...' layout.
    """
    kept = []
    for token in word_tokenize(text):
        if token in sw or token in punc or token.isdigit() or len(token) >= max_token_len:
            continue
        cleaned = token.translate(removed_digits)
        if cleaned:
            kept.append(cleaned)
    return ''.join(' ' + word for word in kept)


#Build the whole column and assign it once instead of writing through
#`df[col].values[i]`, which is read-only under pandas Copy-on-Write
processed_descs = [_strip_noise(text) for text in tqdm(df['job_desc_lemmatized'].values)]
df['job_desc_processed'] = processed_descs

df.head()

 25%|██▍       | 73/295 [00:00<00:00, 715.82it/s]

0123456789


100%|██████████| 295/295 [00:00<00:00, 736.29it/s]


Unnamed: 0,experience,job_desc,key_skills,position,Doctorate :,job_desc_lemmatized,job_desc_processed
0,1 - 4 years,job description\n\nurgent hiring for php devel...,"['jQuery', 'Opencart', 'MySQL', 'Wordpress', '...",web developer,,job description urgent hire for php developer...,job description urgent hire php developer zir...
1,1 - 4 years,job description\nroles and responsibilities\nh...,"['Drupal', 'Html5', 'Web Technologies', 'Javas...",web developer,,job description roles and responsibilities ha...,job description roles hand knowledge html css...
2,1 - 3 years,job description\njob description--\n\njava scr...,"['Computer science', 'CSS', 'jQuery', 'Front e...",front end developer,,job description job description -- java scrip...,job description job description -- java scrip...
3,1 - 2 years,job description\nresponsibilities\ndevelop and...,"['Prototype', 'Illustrator', 'Illustration', '...",ui/ux designer,,job description responsibilities develop and ...,job description develop implement cohesive ux...
4,1 - 3 years,job description\nexperience in developing fron...,"['Css', 'Problem Solving', 'Javascript', 'Html']",angular developer,,job description experience in develop front e...,job description experience develop front end ...


The following cells remove the collected useless words from the processed job descriptions.


In [26]:
import os 

#Read the useless-word list (one entry per line). The context manager closes the
#file as soon as it has been read, instead of leaving the handle open.
with open('../uselessWords/useless words.txt' , 'r') as words_file:
    raw_lines = words_file.read().splitlines()

#Lowercase every entry and strip leading/trailing spaces; building a set
#removes duplicate words
useless_words = {line.lower().strip(' ') for line in raw_lines}

print(f'Total number of useless words are:- {len(useless_words)}')


Total number of useless words are:- 730


In [27]:
#Counters used by the statistics cell: tokens dropped and tokens seen
removed_words = 0
total_words = 0

#Rebuild every processed JD without the useless words
filtered_descs = []
for text in tqdm(df['job_desc_processed'].values):
    #split the JD into list of words
    tokens = word_tokenize(text)

    #keep only the words that are not in the useless-word set
    kept = [token for token in tokens if token not in useless_words]

    total_words += len(tokens)
    removed_words += len(tokens) - len(kept)

    #same ' word1 word2 ...' layout as before (every word prefixed by one space)
    filtered_descs.append(''.join(' ' + token for token in kept))

#Assign the column in one step. The previous `df['job_desc_processed'][i] = ...`
#is chained assignment: it raises SettingWithCopyWarning and does not update the
#frame at all once pandas Copy-on-Write is active.
df['job_desc_processed'] = filtered_descs

df.head()

100%|██████████| 295/295 [00:00<00:00, 1228.65it/s]


Unnamed: 0,experience,job_desc,key_skills,position,Doctorate :,job_desc_lemmatized,job_desc_processed
0,1 - 4 years,job description\n\nurgent hiring for php devel...,"['jQuery', 'Opencart', 'MySQL', 'Wordpress', '...",web developer,,job description urgent hire for php developer...,php php mysql wordpress javascript node jquer...
1,1 - 4 years,job description\nroles and responsibilities\nh...,"['Drupal', 'Html5', 'Web Technologies', 'Javas...",web developer,,job description roles and responsibilities ha...,html css javascript go back-end modify layout...
2,1 - 3 years,job description\njob description--\n\njava scr...,"['Computer science', 'CSS', 'jQuery', 'Front e...",front end developer,,job description job description -- java scrip...,-- java es es react angular vue j query html ...
3,1 - 2 years,job description\nresponsibilities\ndevelop and...,"['Prototype', 'Illustrator', 'Illustration', '...",ui/ux designer,,job description responsibilities develop and ...,cohesive ux wireframes mockups conduct resear...
4,1 - 3 years,job description\nexperience in developing fron...,"['Css', 'Problem Solving', 'Javascript', 'Html']",angular developer,,job description experience in develop front e...,html css/scss javascript angular + + angular ...


In [28]:
#Report how many tokens were seen and dropped, then show one processed JD as a sanity check
print(f'Total words are {total_words}')
print(f'Total words removed {removed_words}')

removed_pct = (removed_words/total_words)*100
print(f'Percentage of words removed {removed_pct}')

sample_desc = df.job_desc_processed[1]
print(f'Sample job description after processing is as follows :- \n {sample_desc}')

Total words are 31979
Total words removed 22067
Percentage of words removed 69.00465930767066
Sample job description after processing is as follows :- 
  html css javascript go back-end modify layout function accord strive visually appeal user-friendly clear navigation drupal cms interest gulfishan_a trigentcom perk benefit drupalhtmlweb


In [30]:

#Canonical technology name -> list of spellings that should be rewritten to it
mapping = {'angular.js': ['angular','angularjs','angular.js'],
           'node.js': ['node' , 'nodejs' , 'node.js'],
           'react.js' :['react' , 'react.js' , 'reactjs'],
           'express.js':['express', 'express.js', 'expressjs'],
           'vue.js': ['vue' , 'vuejs' , 'vue.js'],
           'mongodb':['mongo', 'mongodb'],
           'html':['html','html5']
          } 

#Invert the mapping once so every token needs a single dictionary lookup
#instead of a scan over all canonical names
alias_to_canonical = {alias: canonical
                      for canonical, aliases in mapping.items()
                      for alias in aliases}

normalized_descs = []
for text in tqdm(df['job_desc_processed'].values):
    #split the JD into list of words
    tokens = word_tokenize(text)

    #replace known aliases by their canonical name, leave every other word untouched;
    #each word is prefixed by one space, as in the earlier processing cells
    normalized_descs.append(''.join(' ' + alias_to_canonical.get(token, token)
                                    for token in tokens))

#Assign the column in one step. `df['job_desc_processed'][i] = ...` is chained
#assignment, which warns today and is a no-op under pandas Copy-on-Write.
df['job_desc_processed'] = normalized_descs

df.head()
    


100%|██████████| 295/295 [00:00<00:00, 2300.09it/s]


Unnamed: 0,experience,job_desc,key_skills,position,Doctorate :,job_desc_lemmatized,job_desc_processed
0,1 - 4 years,job description\n\nurgent hiring for php devel...,"['jQuery', 'Opencart', 'MySQL', 'Wordpress', '...",web developer,,job description urgent hire for php developer...,php php mysql wordpress javascript node.js jq...
1,1 - 4 years,job description\nroles and responsibilities\nh...,"['Drupal', 'Html5', 'Web Technologies', 'Javas...",web developer,,job description roles and responsibilities ha...,html css javascript go back-end modify layout...
2,1 - 3 years,job description\njob description--\n\njava scr...,"['Computer science', 'CSS', 'jQuery', 'Front e...",front end developer,,job description job description -- java scrip...,-- java es es react.js angular.js vue.js j qu...
3,1 - 2 years,job description\nresponsibilities\ndevelop and...,"['Prototype', 'Illustrator', 'Illustration', '...",ui/ux designer,,job description responsibilities develop and ...,cohesive ux wireframes mockups conduct resear...
4,1 - 3 years,job description\nexperience in developing fron...,"['Css', 'Problem Solving', 'Javascript', 'Html']",angular developer,,job description experience in develop front e...,html css/scss javascript angular.js + + angul...


In [31]:
#Write the processed frame next to the raw dataset; this has to be repeated for every subject

output_path = f'../dataset/Excel/processed_{subject}_dataset.xlsx'
df.to_excel(output_path, engine='xlsxwriter', index=False)