# Preprocessing for job descriptions

This will take in labled data and prepare the data by cleaning, tokenizing, and running tfidf

In [246]:
import pandas as pd
import re
import nltk
import string

In [247]:
stopwords = nltk.corpus.stopwords.words("english")
df = pd.read_csv('job_postings.csv')
df.head()

Unnamed: 0,job_title,company,location,href,description,apply
0,Data Analyst Co-op (Spring term),Ridley College (Canada),"St. Catharines, ON",https://ca.indeed.com/rc/clk?jk=4aafa08c370b87...,[[<div><p><b>Position Title: Data Analyst Co-o...,Yes
1,Data Analytics Associate Summer Intern (MBA),Johnson & Johnson Family of Companies,"Toronto, ON",https://ca.indeed.com/rc/clk?jk=1d1d136f3b5263...,[[<div><p><b>Data Analytics Associate Intern –...,No
2,"Data Analyst, Summer 2021 Student Opportunities",RBC,"Toronto, ON",https://ca.indeed.com/rc/clk?jk=5bc75ed7e05b22...,[[<div><p><b>What is the opportunity?</b><br/>...,Yes
3,"Data Scientist, Summer Student 2021 Opportunities",RBC,"Toronto, ON",https://ca.indeed.com/rc/clk?jk=1bdf42b3d5b3e4...,[[<div><p><b>What is the opportunity?</b><br/>...,Yes
4,"Business/Operations Analyst, Summer 2021 Stude...",RBC,"Toronto, ON",https://ca.indeed.com/rc/clk?jk=76d9a17c168e02...,[[<div><p><b>What is the opportunity?</b></p><...,Yes


In [248]:
def cleanhtml(description):
    """removes anything inbetween <> along with \n"""
    description = str(description)
    cleanr = re.compile('<.*?>')
    description = re.sub(cleanr, ' ', description)
    description = description.replace('\n', " ")
    description = description.replace('”'," ")
    description = description.replace('“'," ")
    description = description.replace('-'," ")
    description = description.replace('/'," ")
    description = description.replace('â€™',"'")
    return description

In [249]:
descriptions_list = list(df['description'])
descriptions_list = [cleanhtml(description) for description in descriptions_list]
descriptions_list[0]

'[[   Position Title: Data Analyst Co op (Spring term)     THE RIDLEY DIFFERENCE    Founded in 1889, Ridley College is a co educational boarding and day school with over 700 students in JK to grade 12 from more than 50 countries worldwide. The Ridley experience integrates a traditional liberal arts education with individual learning enhanced by the principles of positive psychology, the International Baccalaureate continuum program, a learner driven culture and campus community, exceptional athletics and dynamic arts programs, and a commitment of service to others. This unique combination prepares meaningful and flourishing lives by teaching the habits of mind, body and spirit, and the values needed to lead in a global society.   Ridley College is located on 90 acres in St. Catharines, Ontario in the heart of the Niagara Peninsula and is comprised of a stunning blend of century old buildings housing modernized technology.    OUR TEAM    Ridley offers the prospect of being part of a hig

In [250]:
def tokenize(description):
    """Removes brackets, braces, excess whitespace, sets to lowercase. Returns a list of tokens."""
    description = "".join(description.lower()).strip().split()
    description = [word.translate(str.maketrans(' ',' ', string.punctuation)) for word in description]
    return description

In [251]:
descriptions_list = [tokenize(description) for description in descriptions_list]

In [252]:
def remove_stopwords(description):
    """remove stopwords and 'amp' """
    description = [word for word in description if word != 'amp']
    description = [word for word in description if word not in stopwords]
    return description

In [253]:
descriptions_list = [remove_stopwords(description) for description in descriptions_list]
print(len(descriptions_list[0]))
descriptions_list[0]

475


['',
 'position',
 'title',
 'data',
 'analyst',
 'co',
 'op',
 'spring',
 'term',
 'ridley',
 'difference',
 'founded',
 '1889',
 'ridley',
 'college',
 'co',
 'educational',
 'boarding',
 'day',
 'school',
 '700',
 'students',
 'jk',
 'grade',
 '12',
 '50',
 'countries',
 'worldwide',
 'ridley',
 'experience',
 'integrates',
 'traditional',
 'liberal',
 'arts',
 'education',
 'individual',
 'learning',
 'enhanced',
 'principles',
 'positive',
 'psychology',
 'international',
 'baccalaureate',
 'continuum',
 'program',
 'learner',
 'driven',
 'culture',
 'campus',
 'community',
 'exceptional',
 'athletics',
 'dynamic',
 'arts',
 'programs',
 'commitment',
 'service',
 'others',
 'unique',
 'combination',
 'prepares',
 'meaningful',
 'flourishing',
 'lives',
 'teaching',
 'habits',
 'mind',
 'body',
 'spirit',
 'values',
 'needed',
 'lead',
 'global',
 'society',
 'ridley',
 'college',
 'located',
 '90',
 'acres',
 'st',
 'catharines',
 'ontario',
 'heart',
 'niagara',
 'peninsula',
 '

In [254]:
#def remove_nums(description):
#    description = [word for word in description if word.isnumeric() == False]
#    return description

In [255]:
def remove_digits(description):
    new_description = []
    for word in description:
        no_digits = []
        for i in word:
            if not i.isdigit():
                no_digits.append(i)
            word = ''.join(no_digits)
        new_description.append(word)
    return new_description

In [256]:
descriptions_list = [remove_digits(description) for description in descriptions_list]

In [257]:
descriptions_list = [" ".join(description) for description in descriptions_list]

In [258]:
df['clean_description'] = descriptions_list

In [259]:
df['clean_description'][280]

' required ensure safe efficient operation cleaning running adjustments variety mail processing machines responsible maintenance prescribed quantity quality product produced responsibilities include troubleshooting machine performance making adjustments calling additional resources required equipment  inkjet addresser folder master mailer inserter sealer labeller tabber cutter laser printer  core abilities  quality orientation time management creative innovative thinking development continual learning problem solving accountability dependability decision making judgement operating equipment providing consultation planning organizing mathematical reasoning coaching mentoring communication team work  job duties  · set jobs assigned production manager  · operate equipment assigned jobs  · thoroughly understand able set pieces equipment timely manner  · run piece equipment accuracy machines’ top running speed required  · read understand docket relating assigned live job  · check samples  a

### Create TFIDF CSV

In [260]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = df['clean_description']
df['apply'], unique = pd.factorize(df['apply'])

tfidfv = TfidfVectorizer(min_df=0, max_df=1, use_idf=True)
tfidfv_matrix = tfidfv.fit_transform(corpus)
tfidfv_matrix = tfidfv_matrix.toarray()

In [261]:
import numpy as np
vocab = tfidfv.get_feature_names()
tfidf_df = pd.DataFrame(np.round(tfidfv_matrix, 2), columns=vocab)

In [262]:
tfidf_df['will_apply'] = df['apply']

In [263]:
tfidf_df


Unnamed: 0,aarthiga,abap,abi,abide,abor,above,absences,absolute,abstraction,abstracts,...,zealand,zn,zos,zumba,zynga,zzybgllqg,âge,égard,équipes,will_apply
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [264]:
tfidf_df.to_csv('tfidf_df.csv')

### Create Bag of Words CSV

In [265]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = df['clean_description']
df['apply'], unique = pd.factorize(df['apply'])

cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus)
cv_matrix = cv_matrix.toarray()

In [266]:
# get all unique words in the corpus
vocab = cv.get_feature_names()
# show document feature vectors
bow_df = pd.DataFrame(cv_matrix, columns=vocab)
bow_df['will_apply'] = df['apply']

In [267]:
bow_df

Unnamed: 0,aarthiga,abap,abcellera,abi,abide,abilities,ability,able,abled,abor,...,zoom,zos,zumba,zynga,zzybgllqg,âge,égard,équipes,êtes,will_apply
0,0,0,0,0,0,0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,1
391,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
392,0,0,0,0,0,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,1
393,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [268]:
bow_df.to_csv('bow_df.csv')

### Create ngram CSV

In [269]:
from sklearn.feature_extraction.text import CountVectorizer

ngram = CountVectorizer(ngram_range=(2,2))
ngram_matrix = ngram.fit_transform(corpus)

ngram_matrix = ngram_matrix.toarray()
vocab = ngram.get_feature_names()
ngrams_df = pd.DataFrame(ngram_matrix, columns=vocab)


In [270]:
ngrams_df['will_apply'] = df['apply']

In [271]:
ngrams_df

Unnamed: 0,aarthiga sivakumar,abap design,abap java,abcellera nat,abcellera we,abi ltd,abide applicable,abilities ability,abilities able,abilities competencies,...,zynga games,zynga leading,zynga poker,zynga provides,zynga stock,âge le,égard ethnie,équipes efficaces,êtes ouvert,will_apply
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
393,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [272]:
ngrams_df.to_csv('ngrams_df.csv')