**Libraries**

In [1]:
!pip install pandas



In [2]:
import re

import pandas as pd

**Data Examination**

In [3]:
# Load the job postings from a CSV file located next to the notebook
# (the path is relative to the current working directory).
df = pd.read_csv("data.csv")

In [4]:
# Preview the first 10 rows to see what the raw column values look like.
df.head(10)

Unnamed: 0,Title,Company,Location,Number of Applicants,Experience,Employment Type,Description
0,(Fluent English or Ukrainian) Data Scientist (...,Outstaff Your Team,Bucharest,Fiți printre primii 25 de candidați,Nivel mediu de experiență,Full-time,Our spin-off product and team are looking for ...
1,(Global Oil Gas) Finance Data Scientist,MatchaTalent,"Bucharest, Romania",Fiți printre primii 25 de candidați,Nivel mediu de experiență,Full-time,This role required candidate to permanently re...
2,(Global Oil Gas) Finance Data Scientist,MatchaTalent,"Bucharest, Romania",Fiți printre primii 25 de candidați,Nivel mediu de experiență,Full-time,This role required candidate to permanently re...
3,(Global Oil Gas) Finance Data Scientist,MatchaTalent,"Bucharest, Romania",Fiți printre primii 25 de candidați,Nivel mediu de experiență,Full-time,This role required candidate to permanently re...
4,(remote) Cloud Engineer (with FinOps),HORNBACH Romania,Bucharest Metropolitan Area,Fiți printre primii 25 de candidați,Full-time,Unknown,Technology is not everything for HORNBACH's su...
5,(Senior) Data Scientists - based in Luxembourg,European Investment Bank (EIB),Bucharest,27 de candidați,Nu se aplică,Contract,The positions are based at our Luxembourg head...
6,(Senior) Data Scientists - based in Luxembourg,European Investment Bank (EIB),Bucharest,27 de candidați,Nu se aplică,Contract,The positions are based at our Luxembourg head...
7,(Senior) DevSecOps Engineer– based in Luxembourg,European Investment Bank (EIB),Bucharest,Fiți printre primii 25 de candidați,Nu se aplică,Contract,This position is based at our Luxembourg headq...
8,.NET Backend Developer - Mid or Senior,Mindera - Portugal,Cluj-Napoca,Fiți printre primii 25 de candidați,Nivel mediu de experiență,Full-time,"Here at Mindera, we are continuously developin..."
9,.NET Backend Developer - Mid or Senior,Mindera - Portugal,Cluj-Napoca,Fiți printre primii 25 de candidați,Nivel mediu de experiență,Full-time,"Here at Mindera, we are continuously developin..."


In [5]:
# Summarise the column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4003 entries, 0 to 4002
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Title                 4003 non-null   object
 1   Company               4003 non-null   object
 2   Location              4003 non-null   object
 3   Number of Applicants  4003 non-null   object
 4   Experience            4003 non-null   object
 5   Employment Type       4003 non-null   object
 6   Description           4003 non-null   object
dtypes: object(7)
memory usage: 219.0+ KB


In [6]:
# Two per-column tallies shown as a tuple: the number of missing values, and the
# number of cells holding the literal placeholder string "Unknown".
df.isna().sum(), (df == "Unknown").sum()

(Title                   0
 Company                 0
 Location                0
 Number of Applicants    0
 Experience              0
 Employment Type         0
 Description             0
 dtype: int64,
 Title                     0
 Company                   0
 Location                  0
 Number of Applicants      0
 Experience                0
 Employment Type         214
 Description               0
 dtype: int64)

In [7]:
# Print the value distribution of every column except the last one, leaving
# blank lines after each listing.
for col in df.columns[:-1]:
  print(df[col].value_counts(), end="\n\n\n")

Title
Software Engineer                                                         202
Java Developer                                                             96
DevOps Engineer                                                            85
Data Engineer                                                              82
Python Developer                                                           56
                                                                         ... 
Quality Analyst                                                             1
Qt / C++ Senior Software Developer                                          1
Embedded Software Engineer for Battery Management Systems Applications      1
QA/QC Engineer - Oil & Gas (Saudi Arabia)                                   1
(Fluent English or Ukrainian) Data Scientist (NLP, LLM, Pytorch)            1
Name: count, Length: 1184, dtype: int64


Company
Microsoft             179
MindPal               131
Deutsche Bank         126
Luxoft  

**Data Processing**

Experience

In [8]:
def update_experience(df, title_keywords, experience_level):
  """
  Update the 'Experience' column based on keywords in the 'Title' column.

  Every row whose title contains one of `title_keywords` as a whole word
  (case-insensitive) gets `experience_level` written to 'Experience'.
  The frame is modified in place and nothing is returned.

  Args:
    df: DataFrame with 'Title' and 'Experience' columns.
    title_keywords: Iterable of literal keywords to look for.
    experience_level: Value assigned to the matching rows.
  """

  keywords = list(title_keywords)
  if not keywords:
    # An empty alternation would be the pattern '' which matches every title.
    return

  # Escape the keywords so they are treated as literal text, and require that
  # no letter touches either side: "Mid" must not fire on "Middleware", while
  # "Sr." or "Mid-level" still match because punctuation is not a letter.
  alternatives = "|".join(re.escape(keyword) for keyword in keywords)
  pattern = rf"(?<![^\W\d_])(?:{alternatives})(?![^\W\d_])"

  # na=False keeps the mask strictly boolean when a title is missing.
  mask = df["Title"].str.contains(pattern, case=False, regex=True, na=False)
  df.loc[mask, "Experience"] = experience_level


# Maps each seniority label to the title keywords that imply it.
experience_keywords = {
  "Senior": ["Senior", "Sr"],
  "Junior": ["Junior", "Jr"],
  "Mid": ["Mid", "Middle"]
}


# Levels are applied in dictionary order, and each call overwrites 'Experience'
# for the titles it matches, so when a title matches several levels the one
# listed last wins.
for level, keywords in experience_keywords.items():
  update_experience(df, keywords, level)

In [9]:
def update_experience(df):
  """
  Update the 'Experience' column based on specific keywords in the 'Experience' column.

  Each value is replaced by the label of the first rule (in the order listed
  below) whose keyword it contains; values matching no rule, and missing
  values, are left untouched. The frame is modified in place and nothing is
  returned.

  NOTE: this definition reuses the name of the keyword-based function defined
  in the previous cell and therefore replaces it from this cell onwards.
  """

  # (keywords, label) pairs; order matters because the first match wins.
  rules = [
      (("mediu",), "Mid"),
      (("Începător",), "Junior"),
      (("Stagiar",), "Intern"),
      (("Contract",), "Senior"),
      (("Director", "Executiv"), "Executive"),
      (("Nu se aplică", "Asociat", "Full-time"), "Unknown"),
  ]

  # Match against a snapshot so that labels written by an earlier rule are
  # never re-examined by a later one.
  raw = df["Experience"].copy()
  unresolved = pd.Series(True, index=raw.index)

  for needles, label in rules:
    hit = pd.Series(False, index=raw.index)
    for needle in needles:
      # Literal, case-sensitive containment; missing values never match.
      hit = hit | raw.str.contains(needle, regex=False, na=False)

    hit = hit & unresolved
    df.loc[hit.to_numpy(dtype=bool), "Experience"] = label
    unresolved = unresolved & ~hit


# Runs the single-argument update_experience defined just above, which has
# replaced the earlier three-argument function of the same name.
update_experience(df)

Location

In [10]:
# Keep only the text in front of the first comma, e.g. "Bucharest, Romania"
# becomes "Bucharest"; a single split is enough for that.
df["Location"] = df["Location"].str.split(",", n=1).str[0]

In [11]:
# Drop the " Metropolitan Area" suffix; the text is a plain literal, so it is
# replaced without regex interpretation.
df["Location"] = df["Location"].str.replace(
    " Metropolitan Area", "", regex=False)

Number of Applicants

In [12]:
# Reduce each applicant text to its digit characters only. The result is still
# a string, and it is empty when the text contains no digits.
df["Number of Applicants"] = df["Number of Applicants"].apply(
    lambda text: "".join(char for char in text if char.isdigit()))

Title

In [13]:
# Category -> keywords that assign a title to it. Keywords are compared
# case-insensitively, and categories are tried in the order listed here, so an
# earlier category wins when a title matches several.
categories = {
    "DevOps & SysOps": ["ops", "administrator", "platform engineer",
                        "system", "management", "scrum", "agile",
                        "cloud", "aws", "azure"],
    "Quality Assurance & Testing": ["qa", "quality", "test"],
    "Data Science & Machine Learning": ["data", "scientist", "machine", "ml",
                                        "ai", "etl", "analytics", "analyst",
                                        "analysis", "nlp", "bi",
                                        "business intelligence"],
}


def _keyword_in_title(keyword, lowered_title):
  """
  Report whether a lower-case keyword occurs in an already lower-cased title.
  """

  if len(keyword) <= 2:
    # Two-letter keywords are acronyms ("ai", "bi", "ml", "qa"). As plain
    # substrings they fire inside ordinary words ("bi" in "mobile", "ai" in
    # "maintenance"), so they must not be surrounded by letters.
    pattern = rf"(?<![^\W\d_]){re.escape(keyword)}(?![^\W\d_])"
    return re.search(pattern, lowered_title) is not None

  # Longer keywords stay substring matches so that e.g. "ops" still finds
  # "DevOps" and "test" still finds "Tester".
  return keyword in lowered_title


def categorize_job_titles(title):
  """
  Categorizes a job title into predefined categories based on keywords.

  Returns the first category in `categories` that has a keyword present in the
  title, or "Full Stack Developer" when no keyword matches.
  """

  lowered_title = title.lower()

  for category, keywords in categories.items():
    if any(_keyword_in_title(keyword.lower(), lowered_title)
           for keyword in keywords):
      return category
  return "Full Stack Developer"


# Derive a 'Category' column by classifying every job title.
df["Category"] = df["Title"].apply(categorize_job_titles)

Description

In [14]:
# Feature group -> keywords searched for in the job descriptions.
# Descriptions are lower-cased before the search, so every keyword here must be
# lower-case as well, otherwise it can never be found. One-letter languages
# ("r", "c") are listed with surrounding spaces or punctuation so that they do
# not match inside ordinary words.
features = {
  "languages": [
      "python", "javascript", "js", "java", "kotlin", "swift", "php",
      "typescript", "ruby", "golang", "sql", "rust", "bash", "shell",
      "assembly", "scala", "matlab", "perl", "dart", " r ", " r,", " r/",
      "c#", "c #", "c++", "c ++", " c ", " c,", "c/"
  ],

  "databases": [
      "postgresql", "mysql", "sqlite", "mongodb",
      "microsoft sql server", "redis", "mariadb", "elasticsearch",
      "oracle", "dynamodb", "bigquery", "microsoft access"
  ],

  "clouds": [
      "aws", "azure", "google cloud", "firebase", "cloudflare"
  ],

  "technologies": [
      "flask", "django", "fastapi", "spring boot", "spring framework", "plotly",
      "hibernate", "ruby on rails", "symfony", "matplotlib", "seaborn",
      "beautiful soup", "tensorflow", "scikit-learn", "pytorch", "keras",
      "node.js", "nodejs", " react,", " react ", "jquery", "angular", "next.js",
      "vue.js", "svelte", "bootstrap", "nestjs", "express.js", "wordpress",
      "power bi", "tableau", "selenium",  "numpy", "pandas", "polars", "opencv",
      "flutter", "electron", "opengl", "swiftui", "rabbitmq", "laravel",
      "cuda", ".net",
  ],

  "tools": [
      "docker", "kubernetes", "kafka", "hadoop", "spark", "jenkins", "git"
  ],

  "education": [
      "degree"
  ]
}

In [15]:
def _is_mentioned(item, text, whole_word):
  """
  Report whether the keyword `item` occurs in the already lower-cased `text`.
  """

  needle = item.lower()
  if not whole_word:
    return needle in text

  pattern = re.escape(needle)
  # Only guard the sides of the keyword that end in a letter. Keywords that
  # carry their own padding (" r ", " c,") or end in a symbol ("c++", "c#")
  # keep matching exactly as written on that side.
  if needle[:1].isalpha():
    pattern = r"(?<![^\W\d_])" + pattern
  if needle[-1:].isalpha():
    pattern += r"(?![^\W\d_])"
  return re.search(pattern, text) is not None


def find_items(description, feature):
  """
  Detects the items of the given feature group that a job description mentions.

  Matching is case-insensitive. Language names must not be glued to other
  letters, so "javascript" no longer also reports "java" and "scalable" no
  longer reports "scala" (a consequence is that "mysql" alone does not report
  "sql"). All other feature groups use plain substring matching.

  For "languages" the padded spellings are normalised and aliases are merged:
  "js" -> "javascript", "bash"/"shell" -> "bash/shell", "c"/"c++" -> "c/c++".

  Args:
    description: Job description text; a non-string value yields no items.
    feature: Key of the `features` dictionary to search for.

  Returns:
    Alphabetically sorted list of the distinct lower-case items found.
  """

  if not isinstance(description, str):
    return []

  text = description.lower()  # lower-case once instead of once per keyword
  is_language = feature == "languages"

  found_items = {item.lower() for item in features[feature]
                 if _is_mentioned(item, text, whole_word=is_language)}

  if is_language:
    # Strip the padding characters that only exist to delimit short names.
    found_items = {
        item.replace(" ", "").replace("/", "").replace(",", "")
        for item in found_items
    }

    for variants, canonical in (
        ({"js"}, "javascript"),
        ({"bash", "shell"}, "bash/shell"),
        ({"c", "c++"}, "c/c++"),
    ):
      if found_items & variants:
        found_items = (found_items - variants) | {canonical}

  # Sorting makes the result reproducible; set iteration order is not.
  return sorted(found_items)


# Add one column per feature group (named after the capitalised group key)
# holding the items detected in each job description.
for feature in features:
  df[feature.capitalize()] = df["Description"].apply(
      lambda text: find_items(text, feature))