## Import libraries and CSV to dataframe

In [255]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [256]:
df = pd.read_csv('job_positions.csv', delimiter=";")

df.head(15)

Unnamed: 0,Id,Position,Location,Company Name,Num Applicants,Sector,Num Employees,Position Type,Remote,Easy Apply,Job Details
0,2859901077,Data Scientist,Lisbon Portugal,LovelyStay,\n 24 applicants\n,,11-50 employees,Full-time,FALSO,FALSO,DATA SCIENTIST\nLovelyStay is one of the large...
1,2855483620,Data Scientist,Ermesinde Porto Portugal,Pragm�tica - Ag�ncia de Marketing Digital,\n 5 applicants\n,,1-10 employees,Full-time,FALSO,FALSO,Requisitos:\n- Forma��o em �reas como Ci�ncia ...
2,2867533674,Azure Data Scientist,Portugal,BOLD by Devoteam,0 applicants,IT Services and IT Consulting,501-1 000 employees,Full-time � Mid-Senior level,VERDADEIRO,FALSO,The passion for what we do has no boundaries ...
3,2867548917,Azure Data Analyst,Portugal,BOLD by Devoteam,0 applicants,IT Services and IT Consulting,501-1 000 employees,Full-time � Mid-Senior level,VERDADEIRO,FALSO,The passion for what we do has no boundaries ...
4,2853007464,Data Analyst w/ Python (M/F),Porto Portugal,Capgemini Engineering,\n 7 applicants\n,IT Services and IT Consulting,10 001+ employees,Contract,FALSO,FALSO,Capgemini Engineering combines under one bran...
5,2865921571,Data Analyst,Vila do Conde Porto Portugal,Dorel Juvenile,\n 15 applicants\n,Manufacturing,1 001-5 000 employees,Full-time � Mid-Senior level,FALSO,FALSO,About us\nDorel Juvenile is the world's leadin...
6,2861026003,Accounting Data Analyst,Lisbon Portugal,Bose Corporation,\n 11 applicants\n,Computers and Electronics,5 001-10 000 employees,Full-time � Mid-Senior level,FALSO,FALSO,Job Description\nAre you looking for a challen...
7,2798795416,Data Analyst,Lisbon Portugal,NEX T Engineering,0 applicants,IT Services and IT Consulting,51-200 employees,Full-time � Associate,FALSO,FALSO,Your NEX T step is right here!\nProcuramos qua...
8,2859438934,Data Analyst,Portugal,Keyrus,\n 6 applicants\n,IT Services and IT Consulting,1 001-5 000 employees,Full-time � Mid-Senior level,VERDADEIRO,FALSO,As a Data Analyst you will need analytical an...
9,2867560290,AWS Data Analyst,Portugal,BOLD by Devoteam,0 applicants,IT Services and IT Consulting,501-1 000 employees,Full-time � Mid-Senior level,VERDADEIRO,FALSO,The passion for what we do has no boundaries ...


In [257]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2881 entries, 0 to 2880
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Id              2881 non-null   int64 
 1   Position        2881 non-null   object
 2   Location        2881 non-null   object
 3   Company Name    2847 non-null   object
 4   Num Applicants  2881 non-null   object
 5   Sector          2224 non-null   object
 6   Num Employees   2405 non-null   object
 7   Position Type   2734 non-null   object
 8   Remote          2881 non-null   object
 9   Easy Apply      2881 non-null   object
 10  Job Details     2794 non-null   object
dtypes: int64(1), object(10)
memory usage: 247.7+ KB


## Fix Position Title

In [258]:
df['Position'].value_counts()

def fix_position(position):
    if 'analyst' in position.lower() or 'analista' in position.lower():
        return 'Data Analyst'
    elif 'data' in position.lower() and 'engineer' in position.lower():
        return 'Data Engineer'
    elif 'dados' in position.lower() and 'engenheiro' in position.lower():
        return 'Data Engineer'
    elif 'scientist' in position.lower() or 'cientista' in position.lower():
        return 'Data Scientist'
    else:
        return 'Other'

# Standardize position titles and remove positions different than Data Analyst, Data Engineer or Data Scientist
df['Position'] = df['Position'].apply(fix_position)
df['Position'].value_counts()
df = df[~df.Position.str.contains("Other")]
df['Position'].value_counts()

Data Analyst      488
Data Engineer     469
Data Scientist    214
Name: Position, dtype: int64

## Fix Location

In [259]:
df['Location'].value_counts()
def short_location(location):
    if 'lisbon' in location.lower() or 'lisboa' in location.lower():
        return 'Lisboa'
    if 'porto' in location.lower():
        return 'Porto'
    elif location.lower() == 'portugal':
        return 'NA'
    elif 'european' in location.lower():
        return 'European Union'
    elif 'ponta delgada' in location.lower():
        return 'Ponta Delgada'
    elif 'funchal' in location.lower():
        return 'Funchal'
    elif len(location.split("  ")) == 3:
        return location.split("  ")[1]
    elif 'varzim' in location.lower():
        return 'Póvoa do Varzim'
    else:
        if 'portugal' in location.lower():
            return location.replace('Portugal','')
        else:
            return location

df['Location'] = df['Location'].apply(short_location)
df['Location'] = df['Location'].str.replace(u"\uFFFD", "ú")
df['Location'].value_counts()

Lisboa             770
Porto              211
NA                 123
Braga               15
European Union      11
Aveiro               8
Setúbal              7
Funchal              6
Leiria               5
Coimbra              5
Faro                 3
Ponta Delgada        3
Póvoa do Varzim      2
Coimbra              1
Aveiro               1
Name: Location, dtype: int64

## Fix Company Name

In [260]:
df['Company Name'].value_counts()
df['Company Name'] = df['Company Name'].astype(str).apply(lambda x: x.replace(u"\uFFFD", "").strip())
df['Company Name'].value_counts()

BOLD by Devoteam         132
FARFETCH                  39
Revolut                   27
Capgemini Engineering     23
Agoda                     23
                        ... 
kencko                     1
IDW                        1
Grupo Egor                 1
Kuehne+Nagel               1
Irium Portugal             1
Name: Company Name, Length: 298, dtype: int64

## Fix Number of Applicants

In [261]:
df['Num Applicants'].value_counts()
df['Num Applicants'] = df['Num Applicants'].apply(lambda x: x.replace('\n','').strip().replace('applicants','').replace('applicant','')).astype('int64')
df['Num Applicants'].value_counts()

0     549
1     168
2      90
3      56
4      53
5      32
6      32
7      25
10     22
11     22
8      20
9      15
18     12
12     12
13     11
14     10
20      8
17      7
15      6
19      4
21      4
24      4
16      3
22      3
23      3
Name: Num Applicants, dtype: int64

## Fix Sector

In [262]:
df['Sector'].value_counts()
df['Sector'].fillna('NA', inplace=True)
df['Sector'].value_counts()


IT Services and IT Consulting                   435
NA                                              231
Internet Publishing                             115
Financial Services                               60
Computer Software                                55
Staffing and Recruiting                          35
Management Consulting                            24
Hospitals and Health Care                        22
Banking                                          18
Industrial Automation Machinery                  14
Motor Vehicle Manufacturing                      11
Telecommunications                               11
Human Resources                                   9
Computers and Electronics                         9
Transportation  Logistics and Storage             8
Food and Beverage Services                        7
Travel Arrangements                               7
Outsourcing and Offshoring Consulting             6
International Trade and Development               6
Renewable En

## Fix Number of Employees

In [263]:
df['Num Employees'].value_counts()
df['Num Employees'] = df['Num Employees'].astype(str).apply(lambda x: x.replace(' employees','').replace('nan','NA').strip())
df['Num Employees'].value_counts()

1 001-5 000     235
10 001+         188
NA              168
501-1 000       168
51-200          154
201-500          89
5 001-10 000     81
11-50            65
1-10             23
Name: Num Employees, dtype: int64

## Fix Position Type

In [264]:
df['Position Type'] = df['Position Type'].astype(str).apply(lambda x: x.replace(u"\uFFFD", "").replace('nan','NA').strip().split("  ")[0])

def containsNumber(value):
    for character in value:
        if character.isdigit():
            return 'NA'
    return value

df['Position Type'] = df['Position Type'].apply(containsNumber)
#df['Position Type'] = df['Position Type'].astype(str).apply(lambda x: 'NA' if '/' in x else x)
df['Position Type'].value_counts()

Full-time     1046
NA              67
Contract        43
Internship      12
Temporary        3
Name: Position Type, dtype: int64

## Fix Job Details

In [265]:
df['Job Details'] = df['Job Details'].astype(str).apply(lambda x: x.replace(u"\uFFFD", "").strip())

## Export cleaned data to CSV file

In [266]:
df.head()

Unnamed: 0,Id,Position,Location,Company Name,Num Applicants,Sector,Num Employees,Position Type,Remote,Easy Apply,Job Details
0,2859901077,Data Scientist,Lisboa,LovelyStay,24,,11-50,Full-time,FALSO,FALSO,DATA SCIENTIST\nLovelyStay is one of the large...
1,2855483620,Data Scientist,Porto,Pragmtica - Agncia de Marketing Digital,5,,1-10,Full-time,FALSO,FALSO,Requisitos:\n- Formao em reas como Cincia dos ...
2,2867533674,Data Scientist,,BOLD by Devoteam,0,IT Services and IT Consulting,501-1 000,Full-time,VERDADEIRO,FALSO,The passion for what we do has no boundaries ...
3,2867548917,Data Analyst,,BOLD by Devoteam,0,IT Services and IT Consulting,501-1 000,Full-time,VERDADEIRO,FALSO,The passion for what we do has no boundaries ...
4,2853007464,Data Analyst,Porto,Capgemini Engineering,7,IT Services and IT Consulting,10 001+,Contract,FALSO,FALSO,Capgemini Engineering combines under one bran...


In [267]:
df.to_csv('.\cleaned_job_positions.csv', index = False)