**Libraries**

In [1]:
!pip install pandas



In [2]:
import pandas as pd

**Data Examination**

In [3]:
# Load the raw job-postings table from "data.csv" in the working directory.
df = pd.read_csv("data.csv")

In [4]:
# Preview the first ten rows.
df.head(10)

Unnamed: 0,Title,Company,Location,Number of Applicants,Experience,Employment Type,Description
0,C & C++ Software Engineer,Printec Group,Bucharest,Fiți printre primii 25 de candidați,Nivel mediu de experiență,Full-time,We are seeking for a driven Software Engineer ...
1,Mid C/C++ Developer,ABIT Technologies,"Bucharest, Romania",32 de candidați,Full-time,Unknown,Locație: Brașov/BucureștiMod de colaborare: CI...
2,C++ Developer,MindPal,Alba Iulia,35 de candidați,Începător,Full-time,We are looking for C++ DeveloperResponsibiliti...
3,C++ Developer,MindPal,Bucharest,Peste 200 de candidați,Începător,Full-time,We are looking for C++ DeveloperResponsibiliti...
4,C/C++ Developer,ZYNK,Cluj-Napoca,Fiți printre primii 25 de candidați,Full-time,Unknown,We are looking for a skilled C/C++ Developer t...
5,C++ Developer,MindPal,Arad,30 de candidați,Începător,Full-time,We are looking for C++ DeveloperResponsibiliti...
6,Cross Platform C++ Software Developer,Bitdefender,Bucharest,39 de candidați,Nivel mediu de experiență,Full-time,Are you interested in working for a growing co...
7,"Software Engineer, C++ - EA Sports FC",Electronic Arts (EA),Romania,158 de candidați,Nu se aplică,Full-time,EA SPORTS is one of the most iconic brands in ...
8,Qt / C++ Junior Software Developer,Hipo.ro,"Ilfov, Romania",Fiți printre primii 25 de candidați,Asociat,Full-time,A Snapshot of Your DayAs a Junior Software Dev...
9,C++/C# Developer,Cognyte,Bucharest,Fiți printre primii 25 de candidați,Nu se aplică,Full-time,Today’s world is crime-riddled. Criminals are ...


In [5]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2513 entries, 0 to 2512
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Title                 2513 non-null   object
 1   Company               2513 non-null   object
 2   Location              2513 non-null   object
 3   Number of Applicants  2513 non-null   object
 4   Experience            2513 non-null   object
 5   Employment Type       2513 non-null   object
 6   Description           2513 non-null   object
dtypes: object(7)
memory usage: 137.6+ KB


In [6]:
# Per-column count of missing values, followed by the per-column count of
# cells holding the literal placeholder string "Unknown".
df.isna().sum(), df.eq("Unknown").sum()

(Title                   0
 Company                 0
 Location                0
 Number of Applicants    0
 Experience              0
 Employment Type         0
 Description             0
 dtype: int64,
 Title                     1
 Company                   1
 Location                  1
 Number of Applicants      1
 Experience                1
 Employment Type         144
 Description               1
 dtype: int64)

In [7]:
# Print the value frequencies of every column except the last one,
# leaving two blank lines after each table.
for col in df.columns[:-1]:
  print(df[col].value_counts(), end="\n\n\n")

Title
Part-Time Work From Home as Photo Collector Anywhere in Europe    124
Software Engineer                                                  89
DevOps Engineer                                                    57
Java Developer                                                     45
Data Engineer                                                      39
                                                                 ... 
Infrastructure Automation Engineer                                  1
(remote) Cloud Engineer (with FinOps)                               1
DevOps Engineer - GCP Certified                                     1
Senior QA Automation Engineer                                       1
Software Application Specialist                                     1
Name: count, Length: 1283, dtype: int64


Company
TransPerfect                 124
Microsoft                     85
Luxoft                        80
Deutsche Bank                 67
Hipo.ro                       57
         

**Data Processing**

Remove duplicates

In [8]:
# Drop rows that are exact duplicates across all columns; df is modified in
# place and the surviving rows keep their original index labels.
df.drop_duplicates(inplace=True)

Experience

In [9]:
def update_experience(df, title_keywords, experience_level):
  """
  Set 'Experience' to `experience_level` for rows whose 'Title' mentions a keyword.

  Keywords are treated as literal text and matched case-insensitively as
  whole words, so "Sr" matches "Sr. Developer" but not "SRE Engineer".
  The frame is modified in place.

  Args:
    df: DataFrame with 'Title' and 'Experience' columns.
    title_keywords: Iterable of keywords to look for in the title.
    experience_level: Value written to 'Experience' for every matching row.

  Returns:
    None.
  """
  import re

  escaped = [re.escape(keyword) for keyword in title_keywords if keyword]
  if not escaped:
    # An empty alternation would match every title and relabel all rows.
    return

  # The non-capturing group avoids the pandas "match groups" warning;
  # \b keeps short keywords from matching inside longer words.
  pattern = r"\b(?:" + "|".join(escaped) + r")\b"
  # na=False: a missing title counts as "no match" instead of putting NaN
  # into the boolean mask, which .loc cannot index with.
  mask = df["Title"].str.contains(pattern, case=False, regex=True, na=False)
  df.loc[mask, "Experience"] = experience_level


# Seniority label -> title keywords that imply it. Levels are applied in
# insertion order, so a later level overwrites an earlier one for a title
# that matches both.
experience_keywords = dict(
  Senior=["Senior", "Sr"],
  Junior=["Junior", "Jr"],
  Mid=["Mid", "Middle"],
)


for level in experience_keywords:
  keywords = experience_keywords[level]
  update_experience(df, keywords, level)

In [10]:
def update_experience(df):
  """
  Normalise the raw 'Experience' labels into a fixed set of levels, in place.

  Each value is checked against an ordered list of substrings and replaced by
  the level of the first substring it contains:

    "mediu" -> "Mid", "Începător" -> "Junior", "Stagiar" -> "Intern",
    "Contract" -> "Senior", "Director" / "Executiv" -> "Executive",
    "Nu se aplică" / "Asociat" / "Full-time" -> "Unknown".

  Values that contain none of the substrings, and missing values, are left
  unchanged.

  Note:
    This definition reuses the name of the title-keyword helper defined in an
    earlier cell and replaces it in the kernel namespace.

  Args:
    df: DataFrame with a string-valued 'Experience' column.

  Returns:
    None.
  """
  # Ordered (substring, level) rules; the first rule a value matches wins.
  rules = [
    ("mediu", "Mid"),
    ("Începător", "Junior"),
    ("Stagiar", "Intern"),
    ("Contract", "Senior"),
    ("Director", "Executive"),
    ("Executiv", "Executive"),
    ("Nu se aplică", "Unknown"),
    ("Asociat", "Unknown"),
    ("Full-time", "Unknown"),
  ]

  # Match against a snapshot so that labels written by an earlier rule are
  # never re-examined by a later one.
  raw = df["Experience"].copy()
  pending = raw.notna()

  for needle, level in rules:
    hit = pending & raw.str.contains(needle, regex=False, na=False)
    df.loc[hit, "Experience"] = level
    pending &= ~hit


# Normalise the 'Experience' labels of the whole frame (writes into df).
update_experience(df)

Location

In [11]:
# Keep only the text before the first comma,
# e.g. "Bucharest, Romania" -> "Bucharest".
df["Location"] = df["Location"].str.split(",", n=1).str[0]

In [12]:
# Remove the literal text " Metropolitan Area" from every location.
df["Location"] = df["Location"].str.replace(" Metropolitan Area", "", regex=False)

Number of Applicants

In [13]:
# Reduce each applicant description to its digit characters,
# e.g. "Peste 200 de candidați" -> "200". The values remain strings, and a
# text without digits becomes the empty string.
df["Number of Applicants"] = df["Number of Applicants"].map(
    lambda text: "".join(ch for ch in text if ch.isdigit()))

Title

In [14]:
# Category -> lower-case keywords. Categories are tried in insertion order and
# the first one with a matching keyword wins.
categories = {
    "DevOps & SysOps": ["ops", "administrator", "platform engineer",
                        "system", "management", "scrum", "agile",
                        "cloud", "aws", "azure"],
    "Quality Assurance & Testing": ["qa", "quality", "test"],
    "Data Science & Machine Learning": ["data", "scientist", "machine", "ml",
                                        "ai", "etl", "analytics", "analyst",
                                        "analysis", "nlp", "bi",
                                        "business intelligence"],
}

# Short acronyms that must appear as whole words; as plain substrings they hit
# unrelated titles ("bi" in "mobile", "ai" in "maintenance", "ml" in "html").
_WHOLE_WORD_KEYWORDS = {"qa", "ml", "ai", "bi", "etl", "nlp"}


def categorize_job_titles(title):
  """
  Categorizes a job title into predefined categories based on keywords.

  The title is lower-cased once and compared with the lower-cased keywords of
  each category in `categories`. Keywords listed in `_WHOLE_WORD_KEYWORDS`
  must match as whole words; all other keywords match as substrings (so "ops"
  still matches "DevOps" and "test" still matches "Tester").

  Args:
    title: Job title text.

  Returns:
    The name of the first category with a matching keyword, or
    "Full Stack Developer" when no keyword matches.
  """
  import re

  lowered = title.lower()

  for category, keywords in categories.items():
    for keyword in keywords:
      # Keywords are compared in lower case because the title is lower-cased.
      keyword = keyword.lower()
      if keyword in _WHOLE_WORD_KEYWORDS:
        matched = re.search(r"\b" + re.escape(keyword) + r"\b", lowered) is not None
      else:
        matched = keyword in lowered
      if matched:
        return category
  return "Full Stack Developer"


# Derive a category for every posting from its title.
df["Category"] = df["Title"].map(categorize_job_titles)

Description

In [15]:
# Feature group -> search terms looked up in the lower-cased job description.
# Every term must therefore be lower case. Terms with leading/trailing spaces
# or punctuation (" r ", " c,", "c/", " react,") carry their own delimiters so
# that single letters and short words are not matched inside other words.
features = {
  "languages": [
      "python", "javascript", "js", "java", "kotlin", "swift", "php",
      "typescript", "ruby", "golang", "sql", "rust", "bash", "shell",
      "assembly", "matlab", "perl", "dart", " r ", " r,", " r/",
      "c#", "c #", "c++", "c ++", " c ", " c,", "c/"
  ],

  "databases": [
      "postgresql", "mysql", "sqlite", "mongodb",
      "microsoft sql server", "redis", "mariadb", "elasticsearch",
      "oracle", "dynamodb", "bigquery", "microsoft access"
  ],

  "clouds": [
      "aws", "azure", "google cloud", "firebase", "cloudflare"
  ],

  "technologies": [
      "flask", "django", "fastapi", "spring boot", "spring framework", "plotly",
      "hibernate", "ruby on rails", "symfony", "matplotlib", "seaborn",
      "beautiful soup", "tensorflow", "scikit-learn", "pytorch", "keras",
      "node.js", "nodejs", " react,", " react ", "jquery", "angular", "next.js",
      "vue.js", "svelte", "bootstrap", "nestjs", "express.js", "wordpress",
      "power bi", "tableau", "selenium", "numpy", "pandas", "polars", "opencv",
      "flutter", "opengl", "swiftui", "rabbitmq", "laravel", "cuda", ".net"
  ],

  "tools": [
      "docker", "kubernetes", "kafka", "hadoop", "spark", "jenkins", "git"
  ],

  "education": [
      "degree"
  ]
}

In [16]:
def _mentions(text, needle):
  """Return True if `needle` occurs in `text` without being part of a longer word."""
  import re

  # Cheap substring test first; the regex only runs for candidate hits.
  if not needle or needle not in text:
    return False

  # Guard only the edges of the needle that are alphanumeric. Terms that bring
  # their own delimiter (" r ", "c/", ".net", "c++") keep that edge as written.
  prefix = r"(?<!\w)" if needle[0].isalnum() else ""
  suffix = r"(?!\w)" if needle[-1].isalnum() else ""
  return re.search(prefix + re.escape(needle) + suffix, text) is not None


def find_items(description, feature, catalog=None):
  """
  Detects which items of a feature group are mentioned in a job description.

  Matching is case-insensitive. A term only counts when it is not embedded in
  a longer word, so "java" does not match "javascript" and "git" does not
  match "digital" or "github".

  Found terms are normalised before being returned:
    * languages: spaces, "/" and "," are removed; "js" becomes "javascript";
      "bash" and "shell" become "bash/shell"; "c" and "c++" become "c/c++".
    * other groups: "," is removed; for technologies the " react" variants
      become "react" and "nodejs" becomes "node.js".

  Args:
    description: Job description text. Non-string values (e.g. NaN) yield [].
    feature: Key of the feature group to search for.
    catalog: Optional mapping of feature group -> list of search terms.
      Defaults to the module-level `features` dictionary.

  Returns:
    Sorted list of unique normalised item names (sorted so the result is the
    same on every run).
  """
  if not isinstance(description, str):
    return []
  if catalog is None:
    catalog = features

  text = description.lower()
  # Terms are lower-cased so capitalised catalog entries can still match.
  hits = [needle for needle in (item.lower() for item in catalog[feature])
          if _mentions(text, needle)]

  if feature == "languages":
    labels = [hit.replace(" ", "").replace("/", "").replace(",", "")
              for hit in hits]
    aliases = {
      "js": "javascript",
      "bash": "bash/shell",
      "shell": "bash/shell",
      "c": "c/c++",
      "c++": "c/c++",
    }
  else:
    labels = [hit.replace(",", "") for hit in hits]
    if feature == "technologies":
      aliases = {" react": "react", " react ": "react", "nodejs": "node.js"}
    else:
      aliases = {}

  return sorted({aliases.get(label, label) for label in labels})


# Add one list-valued column per feature group, named after the capitalised
# group key (e.g. "languages" -> "Languages").
for feature in features:
  df[feature.capitalize()] = df["Description"].apply(find_items, args=(feature,))

**Save new df**

In [17]:
# Write the processed frame to "data_processed.csv" without the index column.
df.to_csv("data_processed.csv", index=False)