# Indeed Job Listing Data - Exploratory Data Analysis (EDA) 

In [7]:
# import necessary packages 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

## Load the data
Import the data scraped with scraper-countries.py (Sept 20-21, 2024) for the following countries: USA, Sweden, France, and Italy. Each dataset contains the job listings for the three largest cities in the respective country, for the job titles _Data Scientist_, _Data Analyst_, _Product Analyst_, and _BI Analyst_.

In [45]:
# Read the scraped listings into one DataFrame per country.
# The paths are listed in the same order as the target names: Sweden, France, Italy.
df_SWE, df_FRA, df_ITA = map(
    pd.read_csv,
    (
        'output/indeed_jobs_Sweden.csv',
        'output/indeed_jobs_France.csv',
        'output/indeed_jobs_Italy.csv',
    ),
)

In [50]:
# USA: each city was scraped into its own file.
# The New York file has no `country` column, so it is added here to match
# the format of the LA and Chicago files.
df_NY = pd.read_csv('output/indeed_jobs_USA_NY.csv').assign(country='USA')
df_LA = pd.read_csv('output/indeed_jobs_USA_LA.csv')
df_CHI = pd.read_csv('output/indeed_jobs_USA_Chicago.csv')

# Use the LA column layout as the reference and reorder New York to follow it.
desired_order = list(df_LA.columns)
print('Order of columns: ', desired_order)
df_NY = df_NY.loc[:, desired_order]

# Concatenate the rows of the three cities into one frame with a fresh 0..n-1 index.
df_USA = pd.concat((df_NY, df_LA, df_CHI), ignore_index=True)

# Verify that the combined frame kept the reference column order (displays True/False).
list(df_USA.columns) == desired_order

Order of columns:  ['page', 'country', 'job_link', 'search_keyword', 'search_location', 'job_title', 'company_name', 'company_location', 'salary', 'job_description']


True

In [27]:
# Preview the first five rows of the Swedish listings (head() defaults to n=5).
df_SWE.head()

Unnamed: 0,page,country,job_link,search_keyword,search_location,job_title,company_name,company_location,salary,job_description
0,1,Sweden,https://se.indeed.com/rc/clk?jk=a54f123aa8f7ee...,Data+Analyst,Stockholm,Data Analyst (Data Visualization),Swedium Global Services,Distansjobb in Stockholm,Not available,Data Analyst (Data Visualization) \n \n Swediu...
1,1,Sweden,https://se.indeed.com/rc/clk?jk=5dbbee34feadd8...,Data+Analyst,Stockholm,Data Analyst,Deploja AB,169 73 Solna,Not available,Role: \n \n Website Performance: Web analyst f...
2,1,Sweden,https://se.indeed.com/rc/clk?jk=07cc3fa43334c5...,Data+Analyst,Stockholm,Data Analyst,Cubane Solutions AB,Stockholm,Heltid,Responsibilities \n \n Website Performance: W...
3,1,Sweden,https://se.indeed.com/rc/clk?jk=265b21402a8391...,Data+Analyst,Stockholm,Data Analyst,Sobi,Stockholm,Heltid,Company Description \n Statistics show that w...
4,1,Sweden,https://se.indeed.com/rc/clk?jk=65bfa8c313624f...,Data+Analyst,Stockholm,"Juniora Analysts till Operations, Engineering,...",Deloitte,113 57 Stockholm,Heltid,Företagsbeskrivning \n \n \n Deloitte är en g...


In [28]:
# Preview the first five rows of the French listings (head() defaults to n=5).
df_FRA.head()

Unnamed: 0,page,country,job_link,search_keyword,search_location,job_title,company_name,company_location,salary,job_description
0,1,France,https://www.indeed.fr/rc/clk?jk=3163cc1d8a51ee...,Data+Analyst,Paris (75),CDI - Data analyst KYC - Senior F/H,La Banque Postale,Paris (75),CDI,INTRODUCTION \n Au sein de la direction Data ...
1,1,France,https://www.indeed.fr/rc/clk?jk=71fa252714d41c...,Data+Analyst,Paris (75),Data Analyst F/H,IZI Solutions Durables,Télétravail partiel à La Défense (92),CDI,"À propos \n \n En 2019, le Groupe EDF veut re..."
2,1,France,https://www.indeed.fr/rc/clk?jk=21f25c16a32522...,Data+Analyst,Paris (75),Data analyst,Le Groupe La Poste,Issy-les-Moulineaux (92),CDI,"La Poste Groupe change, nos métiers évoluent. ..."
3,1,France,https://www.indeed.fr/pagead/clk?mo=r&ad=-6NYl...,Data+Analyst,Paris (75),Data Analyst H/F,GROUPE ID SEARCH,Paris (75),"À partir de 35 000 € par an - CDI, Temps plein","Tertialis, société du groupe ID SEARCH, est sp..."
4,1,France,https://www.indeed.fr/rc/clk?jk=8c5cc311877c09...,Data+Analyst,Paris (75),Consultant Débutant Data Analyst en CDI - Pari...,EY,92037 La Défense,CDI,Participer à la digitalisation de nos clients ...


In [29]:
# TODO: work in progress - resume the analysis from this cell.
# Preview the first five rows of the Italian listings (head() defaults to n=5).
df_ITA.head()

Unnamed: 0,page,country,job_link,search_keyword,search_location,job_title,company_name,company_location,salary,job_description
0,1,Italy,https://it.indeed.com/pagead/clk?mo=r&ad=-6NYl...,Data+Analyst,"Roma, Lazio",Business Intelligence (Power BI) - Python- Bus...,Area IT Group,"Roma, Lazio",25.000 € - 40.000 € all'anno,Ricerchiamo Sviluppatori in almeno uno dei seg...
1,1,Italy,https://it.indeed.com/pagead/clk?mo=r&ad=-6NYl...,Data+Analyst,"Roma, Lazio",Power Bi Analyst,Verisure Italy,00144 Roma,Not available,Ti piacerebbe far parte della multinazionale d...
2,1,Italy,https://it.indeed.com/pagead/clk?mo=r&ad=-6NYl...,Data+Analyst,"Roma, Lazio",IT Project Management Internship,Procter & Gamble,"Roma, Lazio",Not available,Job Location Rome Job Description INFORMATI...
3,1,Italy,https://it.indeed.com/pagead/clk?mo=r&ad=-6NYl...,Data+Analyst,"Roma, Lazio",Credit & Collection Data Analyst,Verisure Italy S R L,00144 Roma,Not available,Ti piacerebbe far parte della multinazionale d...
4,1,Italy,https://it.indeed.com/pagead/clk?mo=r&ad=-6NYl...,Data+Analyst,"Roma, Lazio",Aeronautical System Analyst,adecco,"Roma, Lazio",Not available,Per azienda leader a livello globale nella ges...


In [51]:
# Print a structural summary of the Swedish dataset: column names, non-null counts and dtypes.
# Notes for the upcoming cleaning steps:
# - most columns are read as dtype object; consider converting the non-mixed columns to strings
#   (the original note says this is to optimize performance - TODO confirm it has a measurable benefit)
# - company_location and salary contain both numbers and text (no need to convert?)
# - search_keyword needs cleaning (contains +)
# - job_description needs the \n sequences removed
# - TODO: also inspect shape and describe()
df_SWE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   page              320 non-null    int64 
 1   country           320 non-null    object
 2   job_link          320 non-null    object
 3   search_keyword    320 non-null    object
 4   search_location   320 non-null    object
 5   job_title         320 non-null    object
 6   company_name      320 non-null    object
 7   company_location  320 non-null    object
 8   salary            320 non-null    object
 9   job_description   320 non-null    object
dtypes: int64(1), object(9)
memory usage: 25.1+ KB


## Data reduction 
Handle missing and duplicate data entries. Remove unnecessary columns (if any). 

In [None]:
# handle missing values 

# check for potential duplicates 
# nunique() 
# unique()
# isnull().sum()
# also calculate percentage of missing values for each column 

# drop columns if not needed 

# Maybe add scraped date column? 

## Data Cleaning 
Clean and preprocess the data to handle anomalies and outliers. 

## Feature engineering 

In [None]:
# split column to separate numerical values for salary 

# keyword count? 

## Univariate analysis 


## Bivariate analysis 

In [12]:
## Visualization

## Conclusions 
TBA.