In [None]:
# default_exp data_cleaning_linkedin_jobs

# Data cleaning for LinkedIn jobs

> Clean LinkedIn job descriptions, unify the job titles, and build a document-term matrix per job title.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#hide
# from nbdev.showdoc import *

In [None]:
import re
import string
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import config


In [None]:
# Read the LinkedIn job postings CSV from the path configured in `config`.
jobs = pd.read_csv(filepath_or_buffer=config.LINKEDIN_JOBS, low_memory=False)
jobs.shape

(109, 5)

In [None]:
# Preview the first rows of the freshly loaded data.
jobs.head()

Unnamed: 0,title,company_url,published,description,criteria
0,Data Scientist,https://www.linkedin.com/company/johnson-&-joh...,2021-05-07,Job DescriptionDATA SCIENTIST - JANSSEN ANZWor...,"{'Seniority level': ['Not Applicable'], 'Emplo..."
1,Data Scientist,https://uk.linkedin.com/company/hays?trk=publi...,2021-05-06,Centrally located in Brisbane CBDMultiple work...,"{'Seniority level': ['Entry level'], 'Employme..."
2,Data Scientist - Artificial Intelligence/Machi...,https://au.linkedin.com/company/systemize-cons...,2021-05-07,"· $83,000 (inc. super) + up to 25% perf...",{'Employment type': ['Full-time']}
3,Data Scientist,https://au.linkedin.com/company/kpmg-australia...,2021-05-03,Digital Delta Data Scientist Do you love solvi...,"{'Seniority level': ['Not Applicable'], 'Emplo..."
4,Entry level Data Scientist / Risk Analyst oppo...,https://au.linkedin.com/company/commonwealthba...,2021-05-07,Entry level Data Scientist / Risk Analyst oppo...,"{'Seniority level': ['Entry level'], 'Employme..."


### Clean descriptions

In [None]:
def clean_text(text):
    """Lower-case ``text`` and replace digits and punctuation with spaces.

    Every removed character becomes a single space so that neighbouring
    words are never glued together.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. ``None`` or the ``NaN`` pandas uses
        for a missing cell) are returned unchanged instead of raising.

    Returns
    -------
    str
        The lower-cased text in which no-break spaces, digits, underscores
        and punctuation (ASCII as well as Unicode, e.g. ``·`` or ``’``) have
        been turned into plain spaces.
    """
    if not isinstance(text, str):
        # Let missing values pass through so a later dropna() can handle them.
        return text
    text = text.lower()
    # \xa0 and \u202f are no-break spaces; the regex class \s treats them as
    # whitespace, so they have to be normalised explicitly.
    text = re.sub('[\xa0\u202f]', ' ', text)
    # string.punctuation only lists ASCII symbols and let characters such as
    # "·" through; blank everything that is not a letter or whitespace.
    text = re.sub(r'[\d_]|[^\w\s]', ' ', text)
    return text

# Normalise every job description in place of the raw text.
jobs["description"] = jobs["description"].apply(clean_text)

### Remove stop words

In [None]:
stop_words = text.ENGLISH_STOP_WORDS
def remove_stop_words(text, words=None):
    """Drop stop words from whitespace-separated ``text``.

    The text is split on whitespace, every token contained in the stop-word
    collection is discarded, and the remaining tokens are re-joined with
    single spaces. Filtering tokens (rather than substituting ``' word '``
    once per stop word) also removes stop words at the very start or end of
    the text and the second of two adjacent stop words, gives the same result
    regardless of the iteration order of the stop-word set, and needs a
    single pass over the text.

    Parameters
    ----------
    text : str
        Text to filter; matching is case-sensitive. Non-string values (e.g.
        ``None`` or ``NaN``) are returned unchanged.
    words : collection of str, optional
        Stop words to remove. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        ``text`` without stop words, tokens separated by single spaces.
    """
    if not isinstance(text, str):
        return text
    if words is None:
        words = stop_words
    if not isinstance(words, (set, frozenset)):
        # Guarantee constant-time membership tests for list/tuple inputs.
        words = frozenset(words)
    return ' '.join(token for token in text.split() if token not in words)

# Strip English stop words from every cleaned description.
jobs["description"] = jobs["description"].apply(remove_stop_words)

### Unify job titles

In [None]:
# Inspect the 30 most frequent raw titles before unifying them.
jobs.title.value_counts().head(30)

Data Engineer                                                19
Data Scientist                                               15
Data Analyst                                                 12
Entry level Data Scientist / Risk Analyst opportunities       4
Machine Learning Engineer                                     4
Data Scientist - QuantumBlack                                 3
Data Science Engineer                                         3
Data Analyst, ANZ                                             3
Data Scientist, Customer Analytics                            3
Data Scientist - Artificial Intelligence/Machine Learning     3
Business Analyst                                              3
*Machine Learning Engineer*                                   2
Investment Data Analyst                                       2
Principal Data Scientist                                      2
Datascientist                                                 2
Data Scientist- Actuarial               

In [None]:
# Official titles that a raw job title may be unified to.
official_titles = ["Data Engineer", "Data Science Engineer", "Data Warehouse",
                   "Data Scientist", "Machine Learning Engineer", "Machine Learning Specialist", "Machine Learning",
                   "Data Analyst", "Business Analyst", "Data & Analytics",
                   "Software Engineer"]

def unify_job_title(text):
    """Map a raw job title onto one of ``official_titles``.

    Matching is case-insensitive, so ``"Data engineer"`` is recognised as
    ``"Data Engineer"``. When several official titles occur in ``text`` the
    one that starts earliest in the raw title wins, and among titles starting
    at the same position the longest (most specific) one wins. The result
    therefore no longer depends on the order of ``official_titles``, and
    ``"Machine Learning Engineer"`` is not overridden by the shorter
    ``"Machine Learning"``.

    Parameters
    ----------
    text : str
        Raw job title. Non-string values (e.g. ``None`` or ``NaN``) are
        returned unchanged.

    Returns
    -------
    str
        The matching official title, or ``"Not sure: " + text`` when no
        official title occurs in ``text``.
    """
    if not isinstance(text, str):
        return text
    haystack = text.lower()
    best_rank = None
    best_title = None
    for title in official_titles:
        position = haystack.find(title.lower())
        if position < 0:
            continue
        # Smaller tuple = better: earlier start first, then longer title.
        rank = (position, -len(title))
        if best_rank is None or rank < best_rank:
            best_rank, best_title = rank, title
    if best_title is None:
        return "Not sure: " + text
    return best_title

In [None]:
# Replace each raw title with its unified form (or a "Not sure: ..." marker).
jobs.loc[:, "title"] = jobs["title"].apply(unify_job_title)

In [None]:
# Check the title distribution again after unification.
jobs.title.value_counts()

Data Scientist                                                        29
Data Engineer                                                         23
Data Analyst                                                          19
Machine Learning                                                      12
Business Analyst                                                       5
Software Engineer                                                      4
Data Science Engineer                                                  3
Not sure: Data engineer                                                2
Not sure: Data Science - Specialist                                    2
Not sure: Datascientist                                                2
Not sure: Insights and Data Science Analysts                           1
Not sure: Amazon Tech U Graduate Program 2021                          1
Not sure: Data & Reporting Analyst                                     1
Not sure: Applied Scientist - Intern               

In [None]:
# Map every recognised title variant onto the canonical role it belongs to.
title_map = {
    variant: role
    for role, variants in {
        "Data Engineer": ("Data Engineer", "Data Science Engineer", "Data Warehouse"),
        "Data Scientist": ("Data Scientist",),
        "Machine Learning Engineer": (
            "Machine Learning Engineer",
            "Machine Learning Specialist",
            "Machine Learning",
        ),
        "Data Analyst": ("Data Analyst", "Business Analyst", "Data & Analytics"),
        "Software Engineer": ("Software Engineer",),
    }.items()
    for variant in variants
}

In [None]:
# Collapse unified titles into canonical roles; titles missing from title_map become NaN.
jobs["title"] = jobs["title"].map(title_map)

In [None]:
# Count missing values per column.
jobs.isna().sum()

title          14
company_url     0
published       0
description     0
criteria        0
dtype: int64

In [None]:
# Discard every row that still has a missing value in any column.
jobs = jobs.dropna(axis="index", how="any")

In [None]:
# Preview the first rows after cleaning and title mapping.
jobs.head()

Unnamed: 0,title,company_url,published,description,criteria
0,Data Scientist,https://www.linkedin.com/company/johnson-&-joh...,2021-05-07,job descriptiondata scientist janssen anzwor...,"{'Seniority level': ['Not Applicable'], 'Emplo..."
1,Data Scientist,https://uk.linkedin.com/company/hays?trk=publi...,2021-05-06,centrally located brisbane cbdmultiple work st...,"{'Seniority level': ['Entry level'], 'Employme..."
2,Machine Learning Engineer,https://au.linkedin.com/company/systemize-cons...,2021-05-07,· super performance bo...,{'Employment type': ['Full-time']}
3,Data Scientist,https://au.linkedin.com/company/kpmg-australia...,2021-05-03,digital delta data scientist love solving comp...,"{'Seniority level': ['Not Applicable'], 'Emplo..."
4,Data Scientist,https://au.linkedin.com/company/commonwealthba...,2021-05-07,entry level data scientist risk analyst oppo...,"{'Seniority level': ['Entry level'], 'Employme..."


In [None]:
# Persist the cleaned postings (without the index) to the path configured in `config`.
jobs.to_csv(path_or_buf=config.LINKEDIN_JOBS_CLEAN, index=False)

### Create Document-Term Matrix

In [None]:
# Concatenate all descriptions that share a title into one comma-separated document.
jobs = (
    jobs.groupby(['title'])['description']
    .apply(','.join)
    .reset_index()
)

In [None]:
# Count how often each term occurs in the combined description of every title;
# the vectorizer drops its built-in English stop words.
cvec = CountVectorizer(stop_words='english')
data_cvec = cvec.fit_transform(jobs.description)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()
data_dtm = pd.DataFrame(data_cvec.toarray(), columns=feature_names)
# Label each row of the document-term matrix with its job title.
data_dtm.index = jobs.title

In [None]:
# Persist the document-term matrix; the title index is written as the first column.
data_dtm.to_csv(path_or_buf=config.LINKEDIN_JOBS_DTM)

In [None]:
# Display the document-term matrix (one row per job title).
data_dtm

Unnamed: 0_level_0,aa,abilities,abilitiespreferred,abilitiesshow,ability,able,aboriginal,abstraction,academic,accelerating,...,youyou,zealand,zealandextraordinary,zendeks,zendesk,zero,zip,zipster,zoo,zshow
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Data Analyst,0,0,1,1,39,7,7,0,0,0,...,0,1,0,0,0,1,0,0,4,0
Data Engineer,0,0,0,0,22,11,3,1,2,0,...,1,1,1,0,0,0,1,0,0,0
Data Scientist,0,5,0,0,16,8,5,0,9,5,...,2,6,0,0,0,0,12,3,0,0
Machine Learning Engineer,2,0,0,0,10,1,0,0,1,0,...,0,0,0,2,28,0,0,0,0,0
Software Engineer,0,1,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
