In [4]:
import pandas as pd
import numpy as np
from textacy import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
lnkjobs = pd.read_csv("../data/linkedin_jobs.csv")
lnkjobs.head(10)

Unnamed: 0,index,subject,url,job_desc
0,0,Data Engineer Lead,https://www.linkedin.com/jobs/view/2819031657/...,"About the job\nRightpoint, a Genpact company (..."
1,1,Data Integration Engineer,https://www.linkedin.com/jobs/view/2836763830/...,About the job\nThis is a remote position that ...
2,2,"Principal Data Engineer, Data Platform",https://www.linkedin.com/jobs/view/2818408138/...,About the job\nIndigo is a company dedicated t...
3,3,Data Engineer,https://www.linkedin.com/jobs/view/2835872904/...,About the job\nDescription\n\nThe Amazon Web S...
4,4,Data Engineer(Remote),https://www.linkedin.com/jobs/view/2801440112/...,About the job\nAre you looking for the opportu...
5,5,Senior Data Engineer - Technology & Digital,https://www.linkedin.com/jobs/view/2753112635/...,About the job\nQualifications\n4+ years of IT ...
6,6,Data Engineer Lead,https://www.linkedin.com/jobs/view/2818392676/...,About the job\nPosition Overview\nHeadquartere...
7,7,Graph Data Pipeline Engineer - Senior - Big Da...,https://www.linkedin.com/jobs/view/2823015043/...,"About the job\nAt EY, you’ll have the chance t..."
8,8,Director - Data Engineer (100% Remote - Throug...,https://www.linkedin.com/jobs/view/2821169842/...,About the job\nRole/Responsibilities\n\nAs a D...
9,9,Data Engineer,https://www.linkedin.com/jobs/view/2836824578/...,About the job\nAbout NewtonX\nNewtonX is the w...


In [11]:
def remove_new_line(text: str):
    return text.replace("\n", " ")

def to_lower(text: str):
    return text.lower()

preproc = preprocessing.make_pipeline(
    remove_new_line,
    to_lower,
    preprocessing.remove.html_tags,
    preprocessing.normalize.whitespace, 
    preprocessing.replace.urls, 
    preprocessing.replace.numbers, 
    preprocessing.normalize.unicode
)

In [12]:
lnkjobs["clean_job_desc"] = lnkjobs["job_desc"].map(preproc)

In [13]:
lnkjobs.head(10)

Unnamed: 0,index,subject,url,job_desc,clean_job_desc
0,0,Data Engineer Lead,https://www.linkedin.com/jobs/view/2819031657/...,"About the job\nRightpoint, a Genpact company (...","about the job rightpoint, a genpact company (n..."
1,1,Data Integration Engineer,https://www.linkedin.com/jobs/view/2836763830/...,About the job\nThis is a remote position that ...,about the job this is a remote position that c...
2,2,"Principal Data Engineer, Data Platform",https://www.linkedin.com/jobs/view/2818408138/...,About the job\nIndigo is a company dedicated t...,about the job indigo is a company dedicated to...
3,3,Data Engineer,https://www.linkedin.com/jobs/view/2835872904/...,About the job\nDescription\n\nThe Amazon Web S...,about the job description the amazon web servi...
4,4,Data Engineer(Remote),https://www.linkedin.com/jobs/view/2801440112/...,About the job\nAre you looking for the opportu...,about the job are you looking for the opportun...
5,5,Senior Data Engineer - Technology & Digital,https://www.linkedin.com/jobs/view/2753112635/...,About the job\nQualifications\n4+ years of IT ...,about the job qualifications _NUMBER_+ years o...
6,6,Data Engineer Lead,https://www.linkedin.com/jobs/view/2818392676/...,About the job\nPosition Overview\nHeadquartere...,about the job position overview headquartered ...
7,7,Graph Data Pipeline Engineer - Senior - Big Da...,https://www.linkedin.com/jobs/view/2823015043/...,"About the job\nAt EY, you’ll have the chance t...","about the job at ey, you’ll have the chance to..."
8,8,Director - Data Engineer (100% Remote - Throug...,https://www.linkedin.com/jobs/view/2821169842/...,About the job\nRole/Responsibilities\n\nAs a D...,about the job role/responsibilities as a datab...
9,9,Data Engineer,https://www.linkedin.com/jobs/view/2836824578/...,About the job\nAbout NewtonX\nNewtonX is the w...,about the job about newtonx newtonx is the wor...


In [14]:
docs = [x for x in lnkjobs['clean_job_desc']]
docs[:10]

['about the job rightpoint, a genpact company (nyse: g) is a global experience leader. over _NUMBER_ employees across _NUMBER_ offices work with clients end-to-end, from defining and enabling vision, to ensuring ongoing market relevance. our diverse teams lead with empathy, data and creativity—always in service of the experience. from whiteboard to roll-out, we help our clients embed experience across their operations from front to back office to accelerate digital transformation through a human-centric lens. rightpoint has been recognized among the top customer experience consultancies in the forrester wave™: customer experience strategy consulting practices, q4 _NUMBER_.',
 "about the job this is a remote position that can be located anywhere in the us. several locations have been listed for marketing purposes only - only one position is available. our work at hach ensures water quality for people around the world. our customer partnerships, passionate experts, and reliable, easy-to-

In [24]:
tf_idf_vectorizer_ngram_2 = TfidfVectorizer(ngram_range=(2,2), stop_words=['english'], max_df=2)
tf_idf_docs = tf_idf_vectorizer_ngram_2.fit_transform(docs)

In [25]:
tf_idf_vectorizer_ngram_2.inverse_transform(tf_idf_docs[0])

[array(['q4 _number_', 'practices q4', 'consulting practices',
        'strategy consulting', 'experience strategy', 'wave customer',
        'forrester wave', 'the forrester', 'consultancies in',
        'experience consultancies', 'customer experience', 'top customer',
        'the top', 'among the', 'recognized among', 'been recognized',
        'has been', 'rightpoint has', 'lens rightpoint', 'centric lens',
        'human centric', 'through human', 'transformation through',
        'digital transformation', 'accelerate digital', 'to accelerate',
        'office to', 'back office', 'to back', 'front to', 'from front',
        'operations from', 'their operations', 'across their',
        'experience across', 'embed experience', 'clients embed',
        'our clients', 'help our', 'we help', 'out we', 'roll out',
        'to roll', 'whiteboard to', 'from whiteboard', 'experience from',
        'the experience', 'service of', 'in service', 'always in',
        'creativity always', 'an

In [30]:
tf_idf_vectorizer_ngram_3 = TfidfVectorizer(ngram_range=(4,4), stop_words=['english'], max_df=2)
tf_idf_docs_ngram_3 = tf_idf_vectorizer_ngram_3.fit_transform(docs)

In [31]:
tf_idf_vectorizer_ngram_3.inverse_transform(tf_idf_docs_ngram_3[0])

[array(['consulting practices q4 _number_',
        'strategy consulting practices q4',
        'experience strategy consulting practices',
        'customer experience strategy consulting',
        'wave customer experience strategy',
        'forrester wave customer experience',
        'the forrester wave customer', 'in the forrester wave',
        'consultancies in the forrester',
        'experience consultancies in the',
        'customer experience consultancies in',
        'top customer experience consultancies',
        'the top customer experience', 'among the top customer',
        'recognized among the top', 'been recognized among the',
        'has been recognized among', 'rightpoint has been recognized',
        'lens rightpoint has been', 'centric lens rightpoint has',
        'human centric lens rightpoint', 'through human centric lens',
        'transformation through human centric',
        'digital transformation through human',
        'accelerate digital transform