In [None]:
import pandas as pd
import numpy as np
import re
import itertools
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor

import spacy
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRecognizer

from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor

from gensim.utils import simple_preprocess

import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available before it is read.
nltk.download("stopwords")

# One combined German + English stop-word set, used to filter tokens.
stop_words = {*stopwords.words("german"), *stopwords.words("english")}

# spaCy pipeline that is later handed to the skill extractor.
nlp = spacy.load("en_core_web_lg")


In [None]:
# Read the crawl results; `lines=True` parses the file as one JSON record per line.
df = pd.read_json(
    "./data/DS_crawl.json",
    lines=True,
)
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
def get_skills(text: str, skill_extractor, index: int) -> list:
    """Extract the distinct skills that the extractor finds in one text.

    Args:
        text: The text to annotate.
        skill_extractor: Object exposing ``annotate(text)``. Its result is
            read as ``result["results"]["ngram_scored"]``, an iterable of
            mappings that each carry a ``"doc_node_value"`` entry.
        index: Position of ``text`` in the batch; only used to print a
            progress line for every 100th text.

    Returns:
        The de-duplicated ``doc_node_value`` entries (order is arbitrary),
        or ``[""]`` when annotating this text fails.
    """
    if index % 100 == 0:
        print(f"Process Text at Index {index}")
    try:
        annotations = skill_extractor.annotate(text)
        matches = annotations["results"].get("ngram_scored")
        return list({match.get("doc_node_value") for match in matches})
    except Exception:
        # Best effort: one text that cannot be annotated must not abort the
        # whole batch, so fall back to the placeholder result. Only
        # `Exception` is caught so KeyboardInterrupt / SystemExit still
        # propagate instead of being silently swallowed.
        return [""]

# Build the extractor once; the same instance is passed to every worker call.
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

# Tokenise every posting with `simple_preprocess`, drop stop words and join
# the remaining tokens back into one string per posting.
final_txt_list = [
    " ".join(word for word in simple_preprocess(str(doc)) if word not in stop_words)
    for doc in df["text"]
]

# ThreadPoolExecutor raises ValueError for max_workers <= 0, so keep at least
# one worker even when there are no texts to process.
n_workers = max(1, min(50, len(final_txt_list)))

with ThreadPoolExecutor(max_workers=n_workers) as pool:
    # `map` yields results in input order, so they line up with the rows of
    # `df`. Materialise them while the pool is still open.
    results = list(
        pool.map(
            get_skills,
            final_txt_list,
            itertools.repeat(skill_extractor),
            range(len(final_txt_list)),
        )
    )

df["skills"] = results

In [None]:
def _normalise_city(name):
    """Rewrite the known Frankfurt spellings to 'Frankfurt am Main'."""
    for alias in ('Frankfurt (Main)', 'Frankfurt/Main', 'Frankfurt a. M.'):
        name = name.replace(alias, "Frankfurt am Main")
    return name


# A "stadt" entry may hold several comma-separated cities: split each entry,
# trim whitespace, drop empty fragments and unify the Frankfurt spellings.
cities = [
    _normalise_city(part.strip())
    for entry in df["stadt"]
    for part in entry.split(",")
    if part.strip()
]

# First check top cities
print(pd.Series(cities).value_counts().head(10))

top_cities = ['München', 'Berlin', 'Hamburg', 'Stuttgart', 'Köln',
       'Frankfurt am Main', 'Düsseldorf', 'Essen', 'Hannover',
       'bundesweit']

# Set lookup instead of rebuilding a list for every city.
top_city_set = set(top_cities)
top_cities_data = [city for city in cities if city in top_city_set]

sns.set(rc={'figure.figsize': (25, 10)}, style='whitegrid')
palette = ['tab:blue']

# Order the bars by how often each city actually occurs in the plotted data.
# Counting the hard-coded `top_cities` list gives every city a count of 1,
# so it carries no frequency information.
# A single bar colour is passed via `color=`; `palette` without `hue` is
# deprecated in recent seaborn releases.
ax = sns.countplot(
    y="class",
    data=pd.DataFrame({"class": top_cities_data}),
    order=pd.Series(top_cities_data).value_counts().index,
    color=palette[0],
)
ax.set(xlabel=f"Count (n={len(df)})", ylabel='Städte')
plt.show()


In [None]:
# Top10 Skills: flatten the per-posting skill lists into one flat list.
skills = list(itertools.chain.from_iterable(df["skills"]))

# Add Excel and R, because they are missing from the skill DB used here.
# Each posting whose raw text contains the term as a whole word (case-sensitive)
# contributes it exactly once. `str()` mirrors the coercion applied when the
# texts were preprocessed, so a non-string value cannot crash the regex.
for term in ("R", "Excel"):
    pattern = re.compile(rf"\b{term}\b")
    skills += [term for text in df["text"] if pattern.search(str(text))]

# Check for relevant Skills first
print(pd.Series(skills).value_counts())

# Hand-picked vocabularies used to group the extracted skills for plotting.
tech_stack = ["stata","excel","Python", "R", "sql", "java", "big data", "devops", "docker", "git", "kubernetes", "machine learning", "linux", "javascript", "algorithms", "terraform", "spss", "nosql", "deep learning"]
skill_set = ["agile","innovative", "english", "german", "communication skills", "creative", "analytical", "scrum", "programming", "teamwork", "enthusiasm", "reliability"]
benefits = ["childcare", "yoga", "food", "personal development"]


In [None]:
sns.set(rc={'figure.figsize': (25, 10)}, style='whitegrid')
palette = ['tab:blue']


def _select_skills(all_skills, vocabulary):
    """Return the entries of ``all_skills`` that match ``vocabulary``, ignoring case.

    The lower-cased vocabulary is built once as a set instead of being
    rebuilt as a list for every single skill.
    """
    wanted = {term.lower() for term in vocabulary}
    return [skill for skill in all_skills if skill.lower() in wanted]


def _plot_skill_counts(values, ylabel):
    """Show a horizontal count plot of ``values``, most frequent first.

    Returns the seaborn/matplotlib axes of the plot.
    """
    # A single bar colour is passed via `color=`; `palette` without `hue` is
    # deprecated in recent seaborn releases.
    axes = sns.countplot(
        y="class",
        data=pd.DataFrame({"class": values}),
        order=pd.Series(values).value_counts().index,
        color=palette[0],
    )
    axes.set(xlabel=f"Count (n={len(df)})", ylabel=ylabel)
    plt.show()
    return axes


tech_stack_data = _select_skills(skills, tech_stack)
ax = _plot_skill_counts(tech_stack_data, 'Tech-Stack')

skill_set_data = _select_skills(skills, skill_set)
ax = _plot_skill_counts(skill_set_data, 'Skill-Stack')

benefits_data = _select_skills(skills, benefits)
ax = _plot_skill_counts(benefits_data, 'Benefits')