In [1]:
import requests
import numpy as np
import pandas as pd

import string

import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('wordnet')
nltk.download('omw-1.4')

from langdetect import detect, detect_langs
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from bs4 import BeautifulSoup

from sklearn.metrics import accuracy_score, f1_score, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
PAGE_QUERIES = 52
job_data = []

for page_num in range(1, PAGE_QUERIES + 1):
    r = requests.get('https://www.arbeitnow.com/api/job-board-api', params={'page': page_num})
    job_data.extend(r.json()['data'])

In [3]:
# From https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
def cleanDescription(html):
    soup = BeautifulSoup(html, features = "html.parser")

    for script in soup(["script","style"]):
        script.extract()
    
    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(chunk for chunk in chunks if chunk)

def cleanStrList(list):
    for i, e in enumerate(list):
        list[i] = str.lower(e)
    return sorted(list)

In [4]:
labels = list(job_data[0].keys()) # Get names of column labels

df = pd.DataFrame(job_data, columns=labels)
df['description'] = df['description'].apply(cleanDescription)
df['tags'] = df['tags'].apply(cleanStrList)
df['job_types'] = df['job_types'].apply(cleanStrList)

df

Unnamed: 0,slug,company_name,title,description,remote,url,tags,job_types,location,created_at
0,finanzbuchhalter-gn-munich-239942,Vality One Recruitment GmbH,Finanzbuchhalter (gn),Ihr Aufgabengebiet\nUnterstützung bei der Buch...,False,https://www.arbeitnow.com/jobs/companies/valit...,[finance],[],Munich,1683920946
1,bilanzbuchhalter-senior-accountant-gn-munich-3...,Vality One Recruitment GmbH,Bilanzbuchhalter / Senior Accountant (gn),Ihr Aufgabengebiet\nGesamtverantwortung für di...,False,https://www.arbeitnow.com/jobs/companies/valit...,[finance],[],Munich,1683920946
2,junior-frontend-developerin-landshut-231818,Zentur.io GmbH,Junior Frontend Developer:in,Zenturio ist ein Anbieter von Software-as-a-Se...,False,https://www.arbeitnow.com/jobs/companies/zentu...,[web development],[berufseinstieg],Landshut,1683920946
3,pflichtpraktikum-influencer-marketing-berlin-3...,OUTFITTERY GmbH,Pflichtpraktikum Influencer Marketing,__OUTFITTERY__ ist Europas größter Personal-Sh...,False,https://www.arbeitnow.com/jobs/companies/outfi...,[marketing and communication],[internship],Berlin,1683920946
4,cto-tech-lead-fur-iot-tech-startup-software-ha...,Livello GmbH,CTO / Tech Lead für IoT-Tech-Startup (m/w/d) S...,Als CTO und Mitglied des Führungsteam wirst an...,False,https://www.arbeitnow.com/jobs/companies/livel...,[software development],"[teamleitung, vollzeit (unbefristet)]",Düsseldorf,1683920946
...,...,...,...,...,...,...,...,...,...,...
5195,ausbildung-zum-verkaufer-frische-2023-duisburg...,Lebensmitteleinzelhandel Hans Czaikowski GmbH,Ausbildung zum Verkäufer Frische (m/w/d) - 2023,Willkommen bei EDEKA Rhein-Ruhr – Deinem Ausbi...,False,https://www.arbeitnow.com/jobs/companies/leben...,"[other, retail]","[apprenticeship, full time, student]",Duisburg,1683811765
5196,ausbildung-zum-verkaufer-frische-2023-krefeld-...,Lebensmitteleinzelhandel Hans Czaikowski GmbH,Ausbildung zum Verkäufer Frische (m/w/d) - 2023,Willkommen bei EDEKA Rhein-Ruhr – Deinem Ausbi...,False,https://www.arbeitnow.com/jobs/companies/leben...,"[other, retail]","[apprenticeship, full time, student]",Krefeld,1683811765
5197,ausbildung-zum-verkaufer-frische-2023-duisburg...,Lebensmitteleinzelhandel Hans Czaikowski GmbH,Ausbildung zum Verkäufer Frische (m/w/d) - 2023,Willkommen bei EDEKA Rhein-Ruhr – Deinem Ausbi...,False,https://www.arbeitnow.com/jobs/companies/leben...,"[other, retail]","[apprenticeship, full time, student]",Duisburg,1683811765
5198,ausbildung-zum-verkaufer-frische-2023-mulheim-...,Lebensmitteleinzelhandel Hans Czaikowski GmbH,Ausbildung zum Verkäufer Frische (m/w/d) - 2023,Willkommen bei EDEKA Rhein-Ruhr – Deinem Ausbi...,False,https://www.arbeitnow.com/jobs/companies/leben...,"[other, retail]","[apprenticeship, full time, student]",Mülheim a.d. Ruhr,1683811765


In [5]:
tags_count = df['tags'].explode().value_counts().to_dict() # Occurrences of each tag
job_types_count = df['job_types'].explode().value_counts().to_dict() # Occurrences of each job type

In [None]:
df.iloc[5195]['description']

In [6]:
df['lang'] = df['description'].apply(detect) # Apply language detection for description and set as language

In [None]:
df[df['lang'] =='en']

In [None]:
df['lang'].unique()

In [79]:
df[(df['job_types'].str.len() != 0) & (df['tags'].str.len() != 0)]

Unnamed: 0,slug,company_name,title,description,remote,url,tags,job_types,location,created_at,lang
0,testmanagement-fur-software-und-systemtests-iv...,Bosch Group,Testmanagement für Software- und Systemtests (...,Company DescriptionBei Bosch gestalten wir Zuk...,False,https://www.arbeitnow.com/jobs/companies/bosch...,"[engineering, mechanical or industrial enginee...","[associate, full time]",Abstatt,1683903149,de
1,testmanagement-fur-software-und-systemtests-iv...,Bosch Group,Testmanagement für Software- und Systemtests (...,Company DescriptionBei Bosch gestalten wir Zuk...,False,https://www.arbeitnow.com/jobs/companies/bosch...,"[engineering, mechanical or industrial enginee...","[associate, full time]",Holzkirchen,1683903146,de
2,ausbildung-kaufmann-frau-im-einzelhandel-stuhr...,JYSK,Ausbildung Kaufmann/Frau Im Einzelhandel (M/W/...,Job DescriptionDu hast Lust und kannst es kaum...,False,https://www.arbeitnow.com/jobs/companies/jysk/...,"[retail, sales]","[entry, full time]",Stuhr,1683903146,de
3,technischer-sachbearbeiter-im-servicedesk-berg...,REPA Deutschland GmbH,Technischer Sachbearbeiter im Servicedesk (m/w/d),Company DescriptionREPA Deutschland GmbH gehör...,False,https://www.arbeitnow.com/jobs/companies/repa-...,"[customer service, wholesale]","[executive, full time]",Bergkirchen,1683903146,de
4,ausbildung-kaufmann-frau-im-einzelhandel-breme...,JYSK,Ausbildung Kaufmann/Frau Im Einzelhandel (M/W/...,Job DescriptionDu hast Lust und kannst es kaum...,False,https://www.arbeitnow.com/jobs/companies/jysk/...,"[retail, sales]","[entry, full time]",Bremen,1683903146,de
...,...,...,...,...,...,...,...,...,...,...,...
5195,ausbildung-zum-verkaufer-2023-krefeld-61734,EDEKA Heiner Kempken e.K.,Ausbildung zum Verkäufer (m/w/d) - 2023,Willkommen bei EDEKA Rhein-Ruhr – Deinem Ausbi...,False,https://www.arbeitnow.com/jobs/companies/edeka...,"[other, retail]","[apprenticeship, full time, student]",Krefeld,1683811757,de
5196,ausbildung-zum-verkaufer-2023-krefeld-340353,EDEKA Heiner Kempken e.K.,Ausbildung zum Verkäufer (m/w/d) - 2023,Willkommen bei EDEKA Rhein-Ruhr – Deinem Ausbi...,False,https://www.arbeitnow.com/jobs/companies/edeka...,"[other, retail]","[apprenticeship, full time, student]",Krefeld,1683811757,de
5197,ausbildung-zum-verkaufer-2023-krefeld-133597,EDEKA Heiner Kempken e.K.,Ausbildung zum Verkäufer (m/w/d) - 2023,Willkommen bei EDEKA Rhein-Ruhr – Deinem Ausbi...,False,https://www.arbeitnow.com/jobs/companies/edeka...,"[other, retail]","[apprenticeship, full time, student]",Krefeld,1683811757,de
5198,ausbildung-zum-verkaufer-2023-krefeld-399541,EDEKA Heiner Kempken e.K.,Ausbildung zum Verkäufer (m/w/d) - 2023,Willkommen bei EDEKA Rhein-Ruhr – Deinem Ausbi...,False,https://www.arbeitnow.com/jobs/companies/edeka...,"[other, retail]","[apprenticeship, full time, student]",Krefeld,1683811757,de


In [22]:
def map_tags_col(row_tag_list, tags_list):
    map_list = {}
    for tag in tags_list:
        map_list[tag] = 1 if tag in row_tag_list else 0
    
    return map_list

In [23]:
labels_list = ['information technology', 'sales', 'healthcare services', 'customer service', 'engineering']
lang_convert = {
    'en': 'english',
    'de': 'dutch',
    'pt': 'portuguese',
    'fr': 'french'
}

multilingual_stop_words_list = stopwords.words('dutch') + stopwords.words('english') + stopwords.words('portuguese') + stopwords.words('french')

# Filter DF to only include entries with labels in the predetermined labels list 
filtered_df = df[(df['tags'].str.len() != 0) & (df['tags'].apply(lambda x: set(labels_list).intersection(set(x))))][['description', 'lang', 'tags']]

# Map out each tag to individual columns
applied_df = filtered_df.apply(lambda row: map_tags_col(row.tags, labels_list), axis='columns', result_type='expand')
filtered_df = pd.concat([filtered_df, applied_df], axis='columns')
filtered_df

Unnamed: 0,description,lang,tags,information technology,sales,healthcare services,customer service,engineering
19,Company DescriptionTIDAL is a global music and...,en,"[engineering, information technology and servi...",0,0,0,0,1
20,Company Description👋🏼 We're Nagarro.\nWe are a...,en,"[information technology, information technolog...",1,0,0,0,0
28,DESTINATION FOR GREAT TALENT\nAt Trade Republi...,en,[customer service],0,0,0,1,0
29,DESTINATION FOR GREAT TALENT\nAt Trade Republi...,en,[customer service],0,0,0,1,0
30,DESTINATION FOR GREAT TALENT\nAt Trade Republi...,en,[customer service],0,0,0,1,0
...,...,...,...,...,...,...,...,...
4265,Ihre AufgabenVerteilung der Speisen an unsere ...,de,"[food services and hospitality, healthcare ser...",0,0,1,0,0
4268,Ihre AufgabenFür unseren neuen Standort in ​Po...,de,"[healthcare services, medical and health]",0,0,1,0,0
4671,Die EDEKA Südbayern Handels Stiftung & Co. KG ...,de,"[engineering, retail]",0,0,0,0,1
4787,Die Südbayerische Fleischwaren GmbH hat ihren ...,de,"[engineering, food and beverage]",0,0,0,0,1


In [198]:
total_tags_list = list(sorted(set([tag for tags in filtered_df['tags'].tolist() for tag in tags])))

In [24]:
X_train, X_test, y_train, y_test = train_test_split(np.array(filtered_df[['description','lang']]), np.array(filtered_df[labels_list]), test_size=0.25)

In [25]:
def feature_extraction(X_inputs):
    lang_convert = {
        'en': 'english',
        'de': 'dutch',
        'pt': 'portuguese',
        'fr': 'french'
    }

    wn = WordNetLemmatizer()

    tokens_list = [' '.join([wn.lemmatize(SnowballStemmer(lang_convert[x[1]]).stem(w)) for w in word_tokenize(str.lower(x[0].translate(str.maketrans('', '', string.punctuation))), language=lang_convert[x[1]])]) for x in X_inputs]
    
    return tokens_list

In [26]:
pipe1 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=multilingual_stop_words_list)),
    ('clf', BinaryRelevance(LogisticRegression(solver='sag'))),
])

pipe1.fit(feature_extraction(X_train), y_train);

In [27]:
pipe2 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=multilingual_stop_words_list)),
    ('clf', ClassifierChain(LogisticRegression(solver='sag'))),
])

pipe2.fit(feature_extraction(X_train), y_train);

In [28]:
pipe3 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=multilingual_stop_words_list)),
    ('clf', LabelPowerset(LogisticRegression(solver='sag'))),
])

pipe3.fit(feature_extraction(X_train), y_train);

In [29]:
predictions1 = pipe1.predict(feature_extraction(X_test))

print('Accuracy = ', accuracy_score(y_test,predictions1))
print('F1 score is ',f1_score(y_test, predictions1, average="micro"))
print('Hamming Loss is ', hamming_loss(y_test, predictions1))

Accuracy =  0.5669291338582677
F1 score is  0.7236180904522613
Hamming Loss is  0.08661417322834646


In [30]:
predictions2 = pipe2.predict(feature_extraction(X_test))

print('Accuracy = ', accuracy_score(y_test,predictions2))
print('F1 score is ',f1_score(y_test, predictions2, average="micro"))
print('Hamming Loss is ', hamming_loss(y_test, predictions2))

Accuracy =  0.7559055118110236
F1 score is  0.7559055118110235
Hamming Loss is  0.09763779527559055


In [31]:
predictions3 = pipe3.predict(feature_extraction(X_test))

print('Accuracy = ', accuracy_score(y_test,predictions3))
print('F1 score is ',f1_score(y_test, predictions3, average="micro"))
print('Hamming Loss is ', hamming_loss(y_test, predictions3))

Accuracy =  0.8818897637795275
F1 score is  0.8818897637795275
Hamming Loss is  0.047244094488188976


In [41]:
incorrect_labels = []
for i, (v1, v2) in enumerate(zip(np.array(predictions3.todense()), y_test)):
    if not np.array_equal(v1,v2):
        incorrect_labels.append(i)

incorrect_labels

[1, 10, 28, 30, 31, 43, 56, 64, 68, 73, 76, 91, 98, 111, 116]

In [43]:
[row[0] for row in X_test[incorrect_labels]]

['Company DescriptionWir sind bereit Neues zu wagen. Bist Du es auch?Syntegon Technology ist ein weltweit führender Anbieter von Prozess- und Verpackungstechnik. Rund 5.800 Kolleg:innen\xa0in mehr als 15 Ländern arbeiten für die Syntegon Gruppe an intelligenten und nachhaltigen Technologien für die Pharma und Nahrungsmittelindustrie.Am Standort Waiblingen befindet sich unser Hauptsitz. Von hier aus steuert die Geschäftsführung das Unternehmen weltweit.\xa0In Waiblingen entwickelt, produziert und vertreibt Syntegon sowohl Lösungen für die Pharma- als auch für die\xa0 Nahrungsmittelindustrie. Im Bereich Pharma liegt der Fokus auf Technologien für die Verarbeitung oraler fester Darreichungsformen (OSD, oral solid dosage) wie Tabletten und Kapseln. Eines unserer OSD-Kundenzentren befindet sich hier. Für die Nahrungsmittelindustrie umfasst das Portfolio Verpackungslösungen für trockene Nahrungsmittel wie Kaffee und Mehl.Gehe\xa0den entscheidenden Schritt! Bewerbe dich jetzt!Job DescriptionU

In [20]:
pipe1.predict_proba(feature_extraction(X_test))

<127x5 sparse matrix of type '<class 'numpy.float64'>'
	with 635 stored elements in List of Lists format>

In [39]:
np.array(predictions3.todense())

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 0,

In [38]:
y_test

array([[0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 0,