In [2]:
import json
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statistics
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
PAGE_QUERIES = 52
job_data = []

for page_num in range(1, PAGE_QUERIES + 1):
    r = requests.get('https://www.arbeitnow.com/api/job-board-api', params={'page': page_num})
    job_data.extend(r.json()['data'])

In [4]:
# From https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
def cleanDescription(html):
    soup = BeautifulSoup(html, features = "html.parser")

    for script in soup(["script","style"]):
        script.extract()
    
    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(chunk for chunk in chunks if chunk)

def cleanStrList(list):
    for i, e in enumerate(list):
        list[i] = str.lower(e)
    return sorted(list)

In [5]:
labels = list(job_data[0].keys()) # Get names of column labels

df = pd.DataFrame(job_data, columns=labels)
df['description'] = df['description'].apply(cleanDescription)
df['tags'] = df['tags'].apply(cleanStrList)
df['job_types'] = df['job_types'].apply(cleanStrList)

df

Unnamed: 0,slug,company_name,title,description,remote,url,tags,job_types,location,created_at
0,werkstudent-in-im-bereich-datamanagement-bensh...,Preventis GmbH,Werkstudent/in im Bereich Datamanagement (m/w/d),Du willst in einem global ausgerichteten deuts...,False,https://www.arbeitnow.com/jobs/companies/preve...,[database administration],[working student],Bensheim,1683654545
1,graphics-designer-seevetal-423717,Trendt Vision,Graphics Designer (m/w/d),Aufgaben\n- Creating graphics/photo collages\n...,False,https://www.arbeitnow.com/jobs/companies/trend...,"[industrial design, product design]",[professional / experienced],Seevetal,1683654545
2,kopie-von-werkstudent-praktikant-advertiser-ma...,mrge - commerce advertising,Kopie von Werkstudent/Praktikant Advertiser Ma...,Auf der Suche nach einer spannenden Herausford...,False,https://www.arbeitnow.com/jobs/companies/mrge-...,[marketing and communication],"[berufseinstieg, internship]",Berlin,1683654545
3,working-student-publisher-management-berlin-52191,mrge - commerce advertising,Working Student Publisher Management (m/f/d),Looking for an exciting challenge?\nJoin us on...,False,https://www.arbeitnow.com/jobs/companies/mrge-...,[marketing and communication],"[entry, internship]",Berlin,1683654545
4,senior-recruiter-all-genders-munich-221833,Ray Sono AG,(Senior) Recruiter (all genders),Deine Aufgaben bei uns\nAls (Senior) Recruiter...,False,https://www.arbeitnow.com/jobs/companies/ray-s...,[hr],[berufserfahren],Munich,1683654545
...,...,...,...,...,...,...,...,...,...,...
5195,senior-python-developer-in-qa-berlin-21047,Veeam Software,Senior Python Developer in QA,Company DescriptionThe Veeam Quality Assurance...,False,https://www.arbeitnow.com/jobs/companies/veeam...,"[computer software, quality assurance]","[full time, mid-senior]",Berlin,1683204730
5196,commercial-manager-cee-berlin-275097,Omio,"Commercial Manager, CEE",Company DescriptionAbout OmioDriven by our tea...,False,https://www.arbeitnow.com/jobs/companies/omio/...,"[business development, internet]","[full time, mid-senior]",Berlin,1683204730
5197,alltagsbegleitung-alltagsbetreuung-in-freiburg...,European Homecare GmbH,Alltagsbegleitung/Alltagsbetreuung (m/w/d) in ...,Company DescriptionDie European Homecare GmbH ...,False,https://www.arbeitnow.com/jobs/companies/europ...,"[civic and social organization, other]","[associate, full time]",Freiburg im Breisgau,1683204730
5198,elektronikentwickler-quantensensorik-iv-ludwig...,Bosch Group,Elektronikentwickler Quantensensorik (w/m/div.),Company DescriptionBei Bosch gestalten wir Zuk...,False,https://www.arbeitnow.com/jobs/companies/bosch...,"[automotive, engineering]","[associate, full time]",Ludwigsburg,1683204730


In [6]:
tags_count = df['tags'].explode().value_counts().to_dict() # Occurrences of each tag
job_types_count = df['job_types'].explode().value_counts().to_dict() # Occurrences of each job type

In [41]:
similarity_weights = {
    "tags": 7,
    "job_type": 7,
    "location": 1,
    "remote": 1,
    "title": 3
}

similarity_total_weight = sum(similarity_weights.values())

In [58]:
# Code from https://towardsdatascience.com/comparing-documents-with-similarity-metrics-e486bc678a7d
# returns the cosine similarity value of the two given texts
def compute_cosine_similarity(text1, text2):
    
    # stores text in a list
    list_text = [text1, text2]
    
    # converts text into vectors with the TF-IDF 
    vectorizer = TfidfVectorizer(stop_words='english')
    vectorizer.fit_transform(list_text)
    tfidf_text1, tfidf_text2 = vectorizer.transform([list_text[0]]), vectorizer.transform([list_text[1]])
    
    # computes the cosine similarity
    cs_score = cosine_similarity(tfidf_text1, tfidf_text2)
    
    return np.round(cs_score[0][0],2)

def compute_jaccard(list1, list2):
    return len(list(set(list1) & set(list2)))/len(set(list1) | set(list2)) if len(set(list1) | set(list2)) != 0 else 1

In [49]:
target_id = 2047
target_obj = df.iloc[target_id]
target_obj

slug                     customer-care-coordinator-cologne-210316
company_name                                              Eurowag
title                           Customer Care Coordinator (m/w/d)
description     Company DescriptionEurowag wurde vor etwas meh...
remote                                                      False
url             https://www.arbeitnow.com/jobs/companies/eurow...
tags            [customer service, information technology and ...
job_types                                      [entry, full time]
location                                                  Cologne
created_at                                             1683535923
Name: 2047, dtype: object

In [46]:
def getSimilarJobs(target_id, similarity_weights=similarity_weights, df=df, n_similar=10):
    job_similarity = []
    target_obj = df.iloc[target_id]
    similarity_total_weight = sum(similarity_weights.values())

    for job_id, job_obj in df.iterrows():
        if target_id == job_id:
            continue

        tag_jaccard = compute_jaccard(target_obj["tags"], job_obj["tags"])
        job_type_jaccard = compute_jaccard(target_obj["job_types"], job_obj["job_types"])
        location_sim = 1 if target_obj["location"] == job_obj["location"] else 0
        remote_sim = 1 if target_obj["remote"] == job_obj["remote"] else 0
        title_cos = compute_cosine_similarity(target_obj['title'], job_obj['title'])

        similarity = 0
        for weight, val in zip(similarity_weights.values(), [tag_jaccard, job_type_jaccard, location_sim, remote_sim, title_cos]):
            similarity += weight * val

        job_similarity.append({
            "job": job_id,
            "similarity": similarity/similarity_total_weight
        })
    
    similarity_df = pd.DataFrame(job_similarity, columns=["job", "similarity"])
    return df.iloc[list(similarity_df.nlargest(n_similar, 'similarity').index)].join(similarity_df.nlargest(n_similar, 'similarity')['similarity'])

In [48]:
df.iloc[target_id]

slug                     customer-care-coordinator-cologne-210316
company_name                                              Eurowag
title                           Customer Care Coordinator (m/w/d)
description     Company DescriptionEurowag wurde vor etwas meh...
remote                                                      False
url             https://www.arbeitnow.com/jobs/companies/eurow...
tags            [customer service, information technology and ...
job_types                                      [entry, full time]
location                                                  Cologne
created_at                                             1683535923
Name: 2047, dtype: object

In [47]:
getSimilarJobs(target_id, similarity_weights, df, 10)

Unnamed: 0,slug,company_name,title,description,remote,url,tags,job_types,location,created_at,similarity
1520,teamleiter-customer-care-iv-berlin-5615,Bosch Group,Teamleiter Customer Care (w/m/div.),Company DescriptionMöchtest Du Deine Ideen in ...,False,https://www.arbeitnow.com/jobs/companies/bosch...,"[customer service, information technology and ...","[associate, full time]",Berlin,1683550328,0.608596
448,german-speaking-in-house-recruiter-cologne-344052,Verisk,German Speaking In-House Recruiter (m/w/d),Company DescriptionWe help the world see new p...,False,https://www.arbeitnow.com/jobs/companies/veris...,"[human resources, information technology and s...","[entry, full time]",Cologne,1683636731,0.596491
5131,abercrombie-fitch-vollzeit-key-holder-zweibruc...,Abercrombie and Fitch Stores,Abercrombie & Fitch - Vollzeit Key Holder (m/w...,Company DescriptionDas Unternehmen Abercrombi...,False,https://www.arbeitnow.com/jobs/companies/aberc...,"[customer service, retail]","[entry, full time]",Zweibrücken,1683204742,0.596491
433,customer-service-specialist-english-f-m-d-berl...,Omio,Customer Service Specialist - English (f / m /...,"Company DescriptionAt Omio, we take a unique a...",False,https://www.arbeitnow.com/jobs/companies/omio/...,"[customer service, internet]","[entry, full time]",Berlin,1683636733,0.570702
125,engenheiro-de-sistemas-automoveis-h-m-hamburg-...,ALTEN,Engenheiro de sistemas automóveis (H/M),Company DescriptionALTEN é uma empresa líder e...,False,https://www.arbeitnow.com/jobs/companies/alten...,"[engineering, information technology and servi...","[entry, full time]",Hamburg,1683651138,0.54386
592,callcenter-agent-in-voll-oder-teilzeit-bochum-...,DelPro GmbH,"Callcenter Agent (m/w/d) in Voll-, oder Teilzeit",Company DescriptionDie DelPro GmbH ist ein Rec...,False,https://www.arbeitnow.com/jobs/companies/delpr...,"[customer service, legal services]","[entry, full time]",Bochum,1683629534,0.54386
994,ingenieur-systeme-junior-all-gender-hamburg-72570,ALTEN,Ingénieur système junior (all gender),Company DescriptionNous sommes les concepteurs...,False,https://www.arbeitnow.com/jobs/companies/alten...,"[engineering, information technology and servi...","[entry, full time]",Hamburg,1683615122,0.54386
1187,empfangsmitarbeiter-miwid-bergisch-gladbach-49...,Miltenyi Biotec,Empfangsmitarbeiter (mIwId),Company DescriptionJoin us in shaping the futu...,False,https://www.arbeitnow.com/jobs/companies/milte...,"[biotechnology, customer service]","[entry, full time]",Bergisch Gladbach,1683557542,0.54386
1475,technical-trainer-mulheim-karlich-202966,Tomra,Technical Trainer (m/w/d),Company DescriptionTOMRA Recycling entwickelt ...,False,https://www.arbeitnow.com/jobs/companies/tomra...,"[customer service, information technology and ...","[full time, mid-senior]",Mülheim-Kärlich,1683550330,0.54386
1517,teamleiter-kundenservice-iv-berlin-312895,Bosch Group,Teamleiter Kundenservice (w/m/div.),Company DescriptionMöchtest Du Deine Ideen in ...,False,https://www.arbeitnow.com/jobs/companies/bosch...,"[customer service, information technology and ...","[associate, full time]",Berlin,1683550328,0.54386


In [54]:
df.iloc[1458]

slug            sicherheitsmitarbeiter-fur-klinikum-in-wittena...
company_name               Global Protect Sicherheitsdienste GmbH
title           Sicherheitsmitarbeiter (m/w/d) für Klinikum in...
description     Company DescriptionGLOBAL PROTECT ist ein zert...
remote                                                      False
url             https://www.arbeitnow.com/jobs/companies/globa...
tags                         [other, security and investigations]
job_types                                  [associate, full time]
location                                                   Berlin
created_at                                             1683550330
Name: 1458, dtype: object

In [50]:
getSimilarJobs(1458, similarity_weights, df, 10)

Unnamed: 0,slug,company_name,title,description,remote,url,tags,job_types,location,created_at,similarity
1456,sicherheitsmitarbeiter-fur-klinikum-in-lichter...,Global Protect Sicherheitsdienste GmbH,Sicherheitsmitarbeiter (m/w/d) für Klinikum in...,Company DescriptionGLOBAL PROTECT ist ein zert...,False,https://www.arbeitnow.com/jobs/companies/globa...,"[other, security and investigations]","[associate, full time]",Berlin,1683550330,0.936842
1463,sicherheitsmitarbeiter-fur-wohnheim-in-mariend...,Global Protect Sicherheitsdienste GmbH,Sicherheitsmitarbeiter (m/w/d) für Wohnheim in...,Company DescriptionGLOBAL PROTECT ist ein zert...,False,https://www.arbeitnow.com/jobs/companies/globa...,"[other, security and investigations]","[associate, full time]",Berlin,1683550330,0.936842
1466,sicherheitskraft-fur-gesundheitszentrum-berlin...,Global Protect Sicherheitsdienste GmbH,Sicherheitskraft (m/w/d) für Gesundheitszentrum,Company DescriptionGLOBAL PROTECT ist ein zert...,False,https://www.arbeitnow.com/jobs/companies/globa...,"[other, security and investigations]","[associate, full time]",Berlin,1683550330,0.936842
825,sicherheitsmitarbeiter-fur-objektschutz-berlin...,flash-security GmbH,Sicherheitsmitarbeiter (m/w/d) für Objektschutz,Company DescriptionWIR. SICHERN. BERLIN. flash...,False,https://www.arbeitnow.com/jobs/companies/flash...,"[other, security and investigations]","[associate, full time]",Berlin,1683622330,0.906842
837,sicherheitsmitarbeiter-fur-drv-berlin-46916,flash-security GmbH,Sicherheitsmitarbeiter (m/w/d) für DRV,Company DescriptionSkytec Security GmbH steht ...,False,https://www.arbeitnow.com/jobs/companies/flash...,"[other, security and investigations]","[associate, full time]",Berlin,1683622330,0.906842
1232,sicherheitsmitarbeiter-fur-centerbewachung-ber...,Süss Security GmbH,Sicherheitsmitarbeiter (m/w/d) für Centerbewac...,Company DescriptionBereits seit 1991 entwickel...,False,https://www.arbeitnow.com/jobs/companies/suss-...,"[other, security and investigations]","[associate, full time]",Berlin,1683557533,0.906842
836,sicherheitsmitarbeiter-fur-das-bundesamt-berli...,flash-security GmbH,Sicherheitsmitarbeiter (m/w/d) für das Bundesamt,Company DescriptionWIR. SICHERN. BERLIN. flash...,False,https://www.arbeitnow.com/jobs/companies/flash...,"[other, security and investigations]","[associate, full time]",Berlin,1683622330,0.895789
903,sicherheitsmitarbeiter-fur-alba-recyclinghof-b...,Skytec Security GmbH,Sicherheitsmitarbeiter (m/w/d) für ALBA-Recycl...,Company DescriptionSkytec Security GmbH steht ...,False,https://www.arbeitnow.com/jobs/companies/skyte...,"[other, security and investigations]","[associate, full time]",Berlin,1683622322,0.895789
1458,sicherheitsmitarbeiter-fur-klinikum-in-wittena...,Global Protect Sicherheitsdienste GmbH,Sicherheitsmitarbeiter (m/w/d) für Klinikum in...,Company DescriptionGLOBAL PROTECT ist ein zert...,False,https://www.arbeitnow.com/jobs/companies/globa...,"[other, security and investigations]","[associate, full time]",Berlin,1683550330,0.895789
1459,sicherheitsmitarbeiter-fur-wohnheim-in-wedding...,Global Protect Sicherheitsdienste GmbH,Sicherheitsmitarbeiter (m/w/d) für Wohnheim in...,Company DescriptionGLOBAL PROTECT ist ein zert...,False,https://www.arbeitnow.com/jobs/companies/globa...,"[other, security and investigations]","[associate, full time]",Berlin,1683550330,0.895789


In [56]:
df.iloc[5091]

slug            auszubildender-zum-steuerfachangestellten-stad...
company_name                  LTG Steuerberatungsgesellschaft mbH
title           Auszubildender zum Steuerfachangestellten (m/w/d)
description     Leidenschaftlich. Transparent. Ganzheitlich.\n...
remote                                                      False
url             https://www.arbeitnow.com/jobs/companies/ltg-s...
tags                                                [tax advisor]
job_types                                                      []
location                                                    Stade
created_at                                             1683208144
Name: 5091, dtype: object

In [59]:
getSimilarJobs(5091, similarity_weights, df, 10)

Unnamed: 0,slug,company_name,title,description,remote,url,tags,job_types,location,created_at,similarity
5087,auszubildender-zum-steuerfachangestellten-hamb...,LTG Steuerberatungsgesellschaft mbH,Auszubildender zum Steuerfachangestellten (m/w/d),Leidenschaftlich. Transparent. Ganzheitlich.\n...,False,https://www.arbeitnow.com/jobs/companies/ltg-s...,[tax advisor],[],Hamburg,1683208144,0.947368
5089,auszubildender-zum-steuerfachangestellten-rote...,LTG Steuerberatungsgesellschaft mbH,Auszubildender zum Steuerfachangestellten (m/w/d),Leidenschaftlich. Transparent. Ganzheitlich.\n...,False,https://www.arbeitnow.com/jobs/companies/ltg-s...,[tax advisor],[],Rotenburg,1683208144,0.947368
5090,auszubildender-zum-steuerfachangestellten-otte...,LTG Steuerberatungsgesellschaft mbH,Auszubildender zum Steuerfachangestellten (m/w/d),Leidenschaftlich. Transparent. Ganzheitlich.\n...,False,https://www.arbeitnow.com/jobs/companies/ltg-s...,[tax advisor],[],Otterndorf,1683208144,0.947368
5092,auszubildender-zum-steuerfachangestellten-in-f...,LTG Steuerberatungsgesellschaft mbH,Auszubildender zum Steuerfachangestellten (m/w...,Leidenschaftlich. Transparent. Ganzheitlich.\n...,False,https://www.arbeitnow.com/jobs/companies/ltg-s...,[tax advisor],"[apprenticeship, berufseinstieg]",Freiburg,1683208144,0.947368
5082,steuerberater-oder-steuerberateranwarter-alle-...,LTG Steuerberatungsgesellschaft mbH,Steuerberater oder Steuerberateranwärter (alle...,Leidenschaftlich. Transparent. Ganzheitlich.\n...,False,https://www.arbeitnow.com/jobs/companies/ltg-s...,[tax advisor],[],Stade,1683208144,0.842105
5093,auszubildender-zum-steuerfachangestellten-droc...,LTG Steuerberatungsgesellschaft mbH,Auszubildender zum Steuerfachangestellten (m/w/d),Leidenschaftlich. Transparent. Ganzheitlich.\n...,False,https://www.arbeitnow.com/jobs/companies/ltg-s...,[tax advisor],[],Drochtersen,1683208144,0.842105
5101,quereinstieg-als-steuerfachkraft-hamburg-465918,LTG Steuerberatungsgesellschaft mbH,Quereinstieg als Steuerfachkraft (m/w/d),Leidenschaftlich. Transparent. Ganzheitlich.\n...,False,https://www.arbeitnow.com/jobs/companies/ltg-s...,[tax advisor],[],Hamburg,1683208144,0.842105
5108,steuerfachkraft-steuerfachangestellter-steuerf...,LTG Steuerberatungsgesellschaft mbH,"Steuerfachkraft (Steuerfachangestellter, Steue...",Leidenschaftlich. Transparent. Ganzheitlich.\n...,False,https://www.arbeitnow.com/jobs/companies/ltg-s...,[tax advisor],[],Otterndorf,1683208144,0.842105
1088,23082-bilanzbuchhalter-vollzeit-oder-teilzeit-...,Sabine Lechler GmbH Personal.Beratung,23082 Bilanzbuchhalter (m/w/d) Vollzeit oder T...,"Für unseren Kunden, ein Beratungsunternehmen f...",False,https://www.arbeitnow.com/jobs/companies/sabin...,[tax advisor],[],Nuremberg,1683568143,0.789474
1420,steuerfachwirt-sozialwesen-essen-127395,JobAtlas,Steuerfachwirt (m/w/d) - Sozialwesen,Organisation und Finanzen sind Dein Ding? Dann...,False,https://www.arbeitnow.com/jobs/companies/jobat...,[tax advisor],[],Essen,1683553745,0.789474


In [52]:
df.iloc[18]

slug            helpdesk-agent-it-everyone-is-welcome-2nd-leve...
company_name                               Sirius Facilities GmbH
title           Helpdesk Agent IT (everyone is welcome) 2nd Le...
description     Deine Aufgaben\nDu unterstützt den Bereich USE...
remote                                                      False
url             https://www.arbeitnow.com/jobs/companies/siriu...
tags                                       [software development]
job_types                                        [berufserfahren]
location                                                   Berlin
created_at                                             1683654545
Name: 18, dtype: object

In [82]:
tags_count, job_types_count

({'remote': 963,
  'information technology and services': 951,
  'information technology': 838,
  'engineering': 541,
  'sales': 525,
  'retail': 454,
  'computer software': 448,
  'other': 409,
  'customer service': 252,
  'automotive': 233,
  'healthcare services': 204,
  'medical and health': 178,
  'it': 173,
  'it software': 171,
  'marketing and communication': 150,
  'management': 148,
  'finance': 145,
  'tax advisor': 126,
  'sales and business development': 114,
  'software development': 103,
  'security and investigations': 98,
  'project and program management': 80,
  'marketing and product': 74,
  'hr': 73,
  'project management': 67,
  'electrical and electronic manufacturing': 59,
  'transport': 59,
  'human resources': 57,
  'training and instruction': 57,
  'production': 53,
  'cosmetics': 53,
  'logistics and transportation': 51,
  'accounting and finance': 49,
  'team leader': 49,
  'recruitment and selection': 47,
  'administrative and clerical': 47,
  'biotechnolog