In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('job_positions.csv')
df.head()

Unnamed: 0,Position,Location,Company Name,Num Applicants,Sector,Num Employees,Position Type,Remote,Easy Apply
0,Software Developer - OOP,"Ovar, Aveiro, Portugal",Bosch,\n 8 applicants\n,Information Technology & Services,"10,001+ employees",Full-time · Mid-Senior level,False,True
1,Process Automation Developer (M/F),"Porto, Porto, Portugal",adidas,,Sporting Goods,"10,001+ employees",Full-time,False,False
2,Backend Developer,"Lisbon, Portugal",CGI,\n 12 applicants\n,Information Technology & Services,"10,001+ employees",Full-time · Associate,False,True
3,Junior Full Stack Developer,"Porto, Portugal",Tlantic,\n 16 applicants\n,,51-200 employees,Full-time,False,True
4,.NET Fullstack developer,"Lisbon, Portugal",agap2IT Portugal,\n 24 applicants\n,,,Contract,True,True


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880 entries, 0 to 879
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Position        880 non-null    object
 1   Location        880 non-null    object
 2   Company Name    880 non-null    object
 3   Num Applicants  880 non-null    object
 4   Sector          880 non-null    object
 5   Num Employees   880 non-null    object
 6   Position Type   880 non-null    object
 7   Remote          880 non-null    bool  
 8   Easy Apply      880 non-null    bool  
dtypes: bool(2), object(7)
memory usage: 50.0+ KB


## Fix Location

In [4]:
df['Location'].value_counts()

def short_location(location):
    if 'lisbon' in location.lower() or 'lisboa' in location.lower():
        return 'Lisboa'
    elif location.lower() == 'portugal':
        return 'NA'
    elif 'greater' in location.lower():
        return location.split(' ')[1]
    elif 'metropolitan' in location.lower():
        return location.split(' ')[0]
    else:
        return location.split(',')[0]

df['Location'] = df['Location'].apply(short_location)

In [5]:
df['Location'].value_counts()

Lisboa                 533
Porto                  168
Aveiro                  39
Braga                   31
Castelo Branco          24
Coimbra                 22
NA                      15
Vila Nova de Gaia        9
Leiria                   9
Guimaraes                7
Matosinhos               7
Viseu                    5
Gondomar                 4
Faro                     2
Funchal                  2
Ovar                     1
Guimarães                1
São João da Madeira      1
Name: Location, dtype: int64

## Fix Number of Applicants

In [6]:
df['Num Applicants'].value_counts()
df['Num Applicants'] = df['Num Applicants'].apply(lambda x: x.replace('\n','').strip().replace('None','0').split(' ')[0]).astype('int64')

In [7]:
df['Num Applicants'].value_counts()

0     490
1     108
2      64
6      32
3      30
4      29
7      22
5      16
9      14
10     12
8      11
18      7
11      7
16      6
12      6
23      5
13      4
15      4
19      3
20      3
22      3
24      3
21      1
Name: Num Applicants, dtype: int64

## Fix Number of Employees

In [8]:
df['Num Employees'].value_counts()

1,001-5,000 employees     205
51-200 employees          189
201-500 employees         141
None                      137
501-1,000 employees        85
10,001+ employees          47
5,001-10,000 employees     43
11-50 employees            29
1-10 employees              4
Name: Num Employees, dtype: int64

In [9]:
#df['Num Applicants'] = df['Num Applicants'].apply(lambda x: x.replace('\n','').strip().replace('None','0').split(' ')[0]).astype('int64')
df['Num Employees'] = df['Num Employees'].astype(str).apply(lambda x: x.replace(' employees','').replace('None','0').strip())
df['Num Employees'].value_counts()

1,001-5,000     205
51-200          189
201-500         141
0               137
501-1,000        85
10,001+          47
5,001-10,000     43
11-50            29
1-10              4
Name: Num Employees, dtype: int64

## Fix Position Type

In [10]:
df['Position Type'].value_counts()

Full-time · Entry level         415
Full-time · Associate           182
Full-time · Mid-Senior level    103
Contract · Mid-Senior level      80
None                             44
Full-time                        42
Contract                          8
Contract · Associate              3
Part-time · Associate             1
Part-time                         1
Part-time · Entry level           1
Name: Position Type, dtype: int64

In [11]:
df['Position Type'] = df['Position Type'].apply(lambda x: x.replace('·','|').replace('None','NA').strip())
df['Position Type'].value_counts()

Full-time | Entry level         415
Full-time | Associate           182
Full-time | Mid-Senior level    103
Contract | Mid-Senior level      80
NA                               44
Full-time                        42
Contract                          8
Contract | Associate              3
Part-time | Associate             1
Part-time | Entry level           1
Part-time                         1
Name: Position Type, dtype: int64

## Fix Top 40 Position Titles

In [12]:
df['Position'].value_counts()

Junior Developer                                            25
Front-end Developer – Full-time, Remote                     22
Java Developer – Full-time, Remote                          18
Go Developer – Full-time, Remote                            17
Java Developer                                              15
                                                            ..
Cloud Developer - Lisboa                                     1
Consultor / Programador de Software PHC Manufactor (M/F)     1
Data Analyst/Python Developer (M/F)                          1
Full Stack Java Developer (M/F)                              1
Developer Java (m/f)                                         1
Name: Position, Length: 355, dtype: int64

In [16]:
pd.set_option('display.max_rows', None)

def clean_title(position):
    if 'back-end' in position.lower():
        return position.lower().replace('back-end','Backend').split('-')[0].split('(')[0].strip()
    elif 'front-end' in position.lower():
        return position.lower().replace('front-end','Frontend').split('-')[0].split('(')[0].strip()
    elif  '- full' in position.lower():
        return position.split('- ')[0].strip()
    elif  '|' in position.lower():
        return position.split('|')[0].strip()
    elif  '(' in position.lower():
        return position.split('(')[0].strip()
    elif ':' in  position.lower():
        return position.split(':')[1].split('-')[0].strip()
    elif '-' in  position.lower():
        return position.split('-')[0].strip()
    else:
        return position

df['Position'].apply(clean_title).value_counts()

Java Developer                                           33
Backend Developer                                        29
Angular Developer                                        27
Junior Developer                                         25
Frontend developer – full                                22
Outsystems Developer                                     21
Java Developer – Full                                    18
PHP Developer                                            18
Go Developer – Full                                      17
Fullstack Developer                                      16
Frontend Developer                                       16
.Net Developer                                           16
Python Developer                                         15
Cloud Developer                                          14
C++ Developer                                            13
RPA Developer                                            13
Full Stack Developer                    