Job Post Analysis and Report

In [1]:
# Import General Libraries
import pandas as pd 
import numpy as np 

# Import Visualization Libraries
import matplotlib.pyplot as plt 
import seaborn as sns 

## Loading Data

In [2]:
# Load the job postings CSV; the relative path is resolved against the current working directory
df = pd.read_csv(r'transformed_jobs_data.csv')
# Preview the first three rows as a quick sanity check of the load
df.head(3)

Unnamed: 0,job_id,job_employment_type,job_title,job_apply_link,job_description,job_city,job_country,job_posted_at_timestamp,employer_website,employer_company_type
0,XX6946dvNO3187IkAAAAAA==,FULLTIME,Data Engineer,https://ca.linkedin.com/jobs/view/data-enginee...,• *Please note that our offices will be closed...,Montréal,CA,2023-12-19 20:22:04,,
1,6Wu7QcWfrhOR-THoAAAAAA==,FULLTIME,Data Engineer,https://ca.linkedin.com/jobs/view/data-enginee...,We are seeking an experienced and passionate D...,,CA,2023-12-19 15:55:49,,
2,Oiu186OT5E6bkZReAAAAAA==,FULLTIME,"2024 RBCIS, Summer Co-op Data Engineer / Full ...",https://ca.linkedin.com/jobs/view/2024-rbcis-s...,Job Summary\n\nJob Description\n\nWhat is the ...,Toronto,CA,2023-12-19 22:30:06,http://www.rbc.com,Finance


In [3]:
# df['job_city'].unique()

In [4]:
# Preview the last rows to check that the end of the file was read as expected
df.tail()

Unnamed: 0,job_id,job_employment_type,job_title,job_apply_link,job_description,job_city,job_country,job_posted_at_timestamp,employer_website,employer_company_type
161,0JxLam0cJtXsmjQZAAAAAA==,FULLTIME,Senior Cloud Data Engineer,https://www.linkedin.com/jobs/view/senior-clou...,Job Description\n\nJob Summary:\n\nThis positi...,Las Vegas,US,2023-12-19 12:11:14,http://www.bdo.com,Consulting
162,EGvoxwF7K6W0LOMtAAAAAA==,FULLTIME,Senior Cloud Data Engineer,https://www.linkedin.com/jobs/view/senior-clou...,Job Description\n\nJob Summary:\n\nThis positi...,Richmond,US,2023-12-19 10:42:36,http://www.bdo.com,Consulting
163,TvhAbNPDfD24fyHVAAAAAA==,FULLTIME,Senior Cloud Data Engineer,https://www.linkedin.com/jobs/view/senior-clou...,Job Description\n\nJob Summary:\n\nThis positi...,Fort Worth,US,2023-12-19 11:52:55,http://www.bdo.com,Consulting
164,6N_3eAHv_8HXYM0cAAAAAA==,FULLTIME,Senior Cloud Data Engineer,https://www.linkedin.com/jobs/view/senior-clou...,Job Description\n\nJob Summary:\n\nThis positi...,Columbus,US,2023-12-19 12:15:04,http://www.bdo.com,Consulting
165,iQu2KNPFHbJstSZuAAAAAA==,FULLTIME,Senior Cloud Data Engineer,https://www.linkedin.com/jobs/view/senior-clou...,Job Description\n\nJob Summary:\n\nThis positi...,Fort Lauderdale,US,2023-12-19 12:11:14,http://www.bdo.com,Consulting


In [5]:
# (number of rows, number of columns) of the loaded frame
df.shape

(166, 10)

In [6]:
# Number of missing values in each column
df.isnull().sum()

job_id                      0
job_employment_type         0
job_title                   0
job_apply_link              0
job_description             0
job_city                   38
job_country                 0
job_posted_at_timestamp     0
employer_website           53
employer_company_type      90
dtype: int64

In [7]:
# Helper: render a numeric percentage with two decimal places
def formatt(x):
    """Return ``x`` formatted to two decimal places followed by a ``%`` sign."""
    return f'{x:.2f}%'

# Share of missing values per column as a percentage of all rows, rendered as text via formatt
((df.isnull().sum() / len(df)) * 100).apply(formatt)

job_id                      0.00%
job_employment_type         0.00%
job_title                   0.00%
job_apply_link              0.00%
job_description             0.00%
job_city                   22.89%
job_country                 0.00%
job_posted_at_timestamp     0.00%
employer_website           31.93%
employer_company_type      54.22%
dtype: object

- Missing data is present in **three** columns:
    - job_city [22.89%]
    - employer_website [31.93%]
    - employer_company_type [54.22%]

In [8]:
# Number of distinct values per column (missing values are not counted by default)
df.nunique()

job_id                     166
job_employment_type          4
job_title                   99
job_apply_link             166
job_description            153
job_city                    70
job_country                  3
job_posted_at_timestamp    154
employer_website            88
employer_company_type       13
dtype: int64

In [9]:
# Tally rows flagged as exact repeats of an earlier row (all columns are compared)
df.duplicated().value_counts()

False    166
Name: count, dtype: int64

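- No fully duplicated rows (note that `job_description` has only 153 unique values across the 166 postings, so some postings share a description).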

In [10]:
# Summary of column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   job_id                   166 non-null    object
 1   job_employment_type      166 non-null    object
 2   job_title                166 non-null    object
 3   job_apply_link           166 non-null    object
 4   job_description          166 non-null    object
 5   job_city                 128 non-null    object
 6   job_country              166 non-null    object
 7   job_posted_at_timestamp  166 non-null    object
 8   employer_website         113 non-null    object
 9   employer_company_type    76 non-null     object
dtypes: object(10)
memory usage: 13.1+ KB


In [11]:
# Count the non-null job_city entries for each job_country.
# NOTE(review): despite the name `modeCity`, this holds counts, not the most frequent
# city per country, and nothing is imputed here — confirm the intended logic before
# using it to fill missing job_city values.
modeCity = df.groupby('job_country')['job_city'].count().reset_index()
modeCity.columns = ['job_country', 'job_city']
modeCity

    

Unnamed: 0,job_country,job_city
0,CA,26
1,GB,48
2,US,54


In [30]:
# Inspect the dtype of each column before converting job_city
df.dtypes

job_id                     object
job_employment_type        object
job_title                  object
job_apply_link             object
job_description            object
job_city                   object
job_country                object
job_posted_at_timestamp    object
employer_website           object
employer_company_type      object
dtype: object

In [31]:
# Convert job_city to pandas' categorical dtype so that the `.cat` accessor
# (used below to read integer category codes) is available. Mutates df in place.
df['job_city'] = df['job_city'].astype('category')

In [35]:
# Label encoding with pandas: store each city's integer category code in a new column.
# Missing cities receive the code -1 (see row 1 in the preview below).
df['Encoding_with_pandas'] = df['job_city'].cat.codes
df.head(3)

Unnamed: 0,job_id,job_employment_type,job_title,job_apply_link,job_description,job_city,job_country,job_posted_at_timestamp,employer_website,employer_company_type,Encoding_with_pandas
0,XX6946dvNO3187IkAAAAAA==,FULLTIME,Data Engineer,https://ca.linkedin.com/jobs/view/data-enginee...,• *Please note that our offices will be closed...,Montréal,CA,2023-12-19 20:22:04,,,44
1,6Wu7QcWfrhOR-THoAAAAAA==,FULLTIME,Data Engineer,https://ca.linkedin.com/jobs/view/data-enginee...,We are seeking an experienced and passionate D...,,CA,2023-12-19 15:55:49,,,-1
2,Oiu186OT5E6bkZReAAAAAA==,FULLTIME,"2024 RBCIS, Summer Co-op Data Engineer / Full ...",https://ca.linkedin.com/jobs/view/2024-rbcis-s...,Job Summary\n\nJob Description\n\nWhat is the ...,Toronto,CA,2023-12-19 22:30:06,http://www.rbc.com,Finance,64


In [37]:
# Label encoding using Scikit-Learn
from sklearn.preprocessing import LabelEncoder

# Learn the city classes and encode the column in a single pass.
# Unlike pandas' category codes (-1 for a missing city), the missing city ends up
# with a class of its own here (70 in the preview shown by the following cell).
label_encoder = LabelEncoder()
df['encoded_with_sklearn'] = label_encoder.fit_transform(df['job_city'])

In [38]:
# Compare the pandas and scikit-learn encodings side by side on the first rows
df.head()

Unnamed: 0,job_id,job_employment_type,job_title,job_apply_link,job_description,job_city,job_country,job_posted_at_timestamp,employer_website,employer_company_type,Encoding_with_pandas,encoded_with_sklearn
0,XX6946dvNO3187IkAAAAAA==,FULLTIME,Data Engineer,https://ca.linkedin.com/jobs/view/data-enginee...,• *Please note that our offices will be closed...,Montréal,CA,2023-12-19 20:22:04,,,44,44
1,6Wu7QcWfrhOR-THoAAAAAA==,FULLTIME,Data Engineer,https://ca.linkedin.com/jobs/view/data-enginee...,We are seeking an experienced and passionate D...,,CA,2023-12-19 15:55:49,,,-1,70
2,Oiu186OT5E6bkZReAAAAAA==,FULLTIME,"2024 RBCIS, Summer Co-op Data Engineer / Full ...",https://ca.linkedin.com/jobs/view/2024-rbcis-s...,Job Summary\n\nJob Description\n\nWhat is the ...,Toronto,CA,2023-12-19 22:30:06,http://www.rbc.com,Finance,64,64
3,5y7azm6gLeu1ab_9AAAAAA==,FULLTIME,Lead Data Engineer,https://ca.linkedin.com/jobs/view/lead-data-en...,"Overview\n\nAt Nortal, we are driven by a gran...",,CA,2023-12-19 17:06:15,http://www.nortal.com,Computer Services,-1,70
4,CJUJsgmqVdDmJmHgAAAAAA==,FULLTIME,2024 Investor Services - Business Data Analyst...,https://jobs.rbc.com/ca/en/job/R-0000074144/20...,Job Summary\n\nWhat is the opportunity?\n\nAs ...,Toronto,CA,2023-12-20 00:00:00,http://www.rbc.com,Finance,64,64


In [44]:
# One Hot Encoding
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder()

# Double brackets pass a one-column DataFrame (2-D input) to the encoder
transformed_one_hot_encoding = one_hot_encoder.fit_transform(df[['job_city']])

# .toarray() densifies the encoder output. The resulting columns are labelled by
# position (0..70 in the output below), not by city name; the last column (70) is
# set for rows whose job_city is missing (rows 1 and 3 below).
ohe_df = pd.DataFrame(transformed_one_hot_encoding.toarray(), dtype='int')
ohe_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [45]:
# Attach the one-hot columns to the original frame; join aligns the two frames on their index
ohe_final_df = df.join(ohe_df)
ohe_final_df

Unnamed: 0,job_id,job_employment_type,job_title,job_apply_link,job_description,job_city,job_country,job_posted_at_timestamp,employer_website,employer_company_type,...,61,62,63,64,65,66,67,68,69,70
0,XX6946dvNO3187IkAAAAAA==,FULLTIME,Data Engineer,https://ca.linkedin.com/jobs/view/data-enginee...,• *Please note that our offices will be closed...,Montréal,CA,2023-12-19 20:22:04,,,...,0,0,0,0,0,0,0,0,0,0
1,6Wu7QcWfrhOR-THoAAAAAA==,FULLTIME,Data Engineer,https://ca.linkedin.com/jobs/view/data-enginee...,We are seeking an experienced and passionate D...,,CA,2023-12-19 15:55:49,,,...,0,0,0,0,0,0,0,0,0,1
2,Oiu186OT5E6bkZReAAAAAA==,FULLTIME,"2024 RBCIS, Summer Co-op Data Engineer / Full ...",https://ca.linkedin.com/jobs/view/2024-rbcis-s...,Job Summary\n\nJob Description\n\nWhat is the ...,Toronto,CA,2023-12-19 22:30:06,http://www.rbc.com,Finance,...,0,0,0,1,0,0,0,0,0,0
3,5y7azm6gLeu1ab_9AAAAAA==,FULLTIME,Lead Data Engineer,https://ca.linkedin.com/jobs/view/lead-data-en...,"Overview\n\nAt Nortal, we are driven by a gran...",,CA,2023-12-19 17:06:15,http://www.nortal.com,Computer Services,...,0,0,0,0,0,0,0,0,0,1
4,CJUJsgmqVdDmJmHgAAAAAA==,FULLTIME,2024 Investor Services - Business Data Analyst...,https://jobs.rbc.com/ca/en/job/R-0000074144/20...,Job Summary\n\nWhat is the opportunity?\n\nAs ...,Toronto,CA,2023-12-20 00:00:00,http://www.rbc.com,Finance,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,0JxLam0cJtXsmjQZAAAAAA==,FULLTIME,Senior Cloud Data Engineer,https://www.linkedin.com/jobs/view/senior-clou...,Job Description\n\nJob Summary:\n\nThis positi...,Las Vegas,US,2023-12-19 12:11:14,http://www.bdo.com,Consulting,...,0,0,0,0,0,0,0,0,0,0
162,EGvoxwF7K6W0LOMtAAAAAA==,FULLTIME,Senior Cloud Data Engineer,https://www.linkedin.com/jobs/view/senior-clou...,Job Description\n\nJob Summary:\n\nThis positi...,Richmond,US,2023-12-19 10:42:36,http://www.bdo.com,Consulting,...,0,0,0,0,0,0,0,0,0,0
163,TvhAbNPDfD24fyHVAAAAAA==,FULLTIME,Senior Cloud Data Engineer,https://www.linkedin.com/jobs/view/senior-clou...,Job Description\n\nJob Summary:\n\nThis positi...,Fort Worth,US,2023-12-19 11:52:55,http://www.bdo.com,Consulting,...,0,0,0,0,0,0,0,0,0,0
164,6N_3eAHv_8HXYM0cAAAAAA==,FULLTIME,Senior Cloud Data Engineer,https://www.linkedin.com/jobs/view/senior-clou...,Job Description\n\nJob Summary:\n\nThis positi...,Columbus,US,2023-12-19 12:15:04,http://www.bdo.com,Consulting,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Generate OHE using pandas dummy
# Columns are named job_city_<city>. Rows with a missing city get False in every
# column (e.g. row 1 below), so there is no separate indicator for missing values.
dummy_df = pd.get_dummies(df['job_city'], prefix='job_city')
dummy_df

Unnamed: 0,job_city_Ajax,job_city_Atlanta,job_city_Auburn Hills,job_city_Austin,job_city_Baltimore,job_city_Belfast,job_city_Birmingham,job_city_Boston,job_city_Bristol,job_city_Cambridge,...,job_city_Rolling Meadows,job_city_San Diego,job_city_Seattle,job_city_Springdale,job_city_Toronto,job_city_Vancouver,job_city_Warwick,job_city_Washington,job_city_West Palm Beach,job_city_Wokingham
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
162,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
163,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
164,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [49]:
# Colour every indicator cell: green where the flag is set, yellow otherwise.
# NOTE(review): newer pandas releases deprecate Styler.applymap in favour of
# Styler.map — confirm against the installed version before switching.
def _flag_colour(flag):
    """Return the CSS background rule for a single one-hot cell."""
    if flag > 0:
        return 'background-color: green'
    return 'background-color: yellow'

dummy_df.style.applymap(_flag_colour)

Unnamed: 0,job_city_Ajax,job_city_Atlanta,job_city_Auburn Hills,job_city_Austin,job_city_Baltimore,job_city_Belfast,job_city_Birmingham,job_city_Boston,job_city_Bristol,job_city_Cambridge,job_city_Camden,job_city_Charlotte,job_city_Chicago,job_city_Cincinnati,job_city_Claymont,job_city_Cleveland,job_city_Columbia,job_city_Columbus,job_city_Costa Mesa,job_city_Dallas,job_city_Denver,job_city_Edinburgh,job_city_Edmonton,job_city_Fareham,job_city_Fort Lauderdale,job_city_Fort Worth,job_city_Fremont,job_city_Grand Rapids,job_city_Halifax,job_city_Houston,job_city_Irving,job_city_Jersey City,job_city_Kingston,job_city_Knutsford,job_city_Las Vegas,job_city_Leeds,job_city_London,job_city_Los Angeles,job_city_Malmesbury,job_city_Manchester,job_city_Markham,job_city_Miami,job_city_Milton Keynes,job_city_Milwaukee,job_city_Montréal,job_city_Mountain View,job_city_New York,job_city_Newcastle upon Tyne,job_city_Niles,job_city_North Vancouver,job_city_Oakville,job_city_Ogden,job_city_Ottawa,job_city_Palo Alto,job_city_Philadelphia,job_city_Plano,job_city_Porton Down,job_city_Reading,job_city_Richmond,job_city_Rockville,job_city_Rolling Meadows,job_city_San Diego,job_city_Seattle,job_city_Springdale,job_city_Toronto,job_city_Vancouver,job_city_Warwick,job_city_Washington,job_city_West Palm Beach,job_city_Wokingham
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [12]:
# Function to fill missing job_city values with the most frequent city of the row's country
# def fill_missing_city(df):
#     if np.isnan(df['job_city']):
#         return modeCity[modeCity['job_country'] == df['job_country']]['mode']
    
#     return df['job_city']

# df['job_city'] = df.apply(fill_missing_city, axis=1)

In [13]:
# List the column labels.
# NOTE(review): the recorded output shows only the ten original columns even though
# earlier cells add Encoding_with_pandas and encoded_with_sklearn — it appears to come
# from an earlier, out-of-order run (execution count 13); a fresh top-to-bottom run
# would list the extra columns too.
df.columns

Index(['job_id', 'job_employment_type', 'job_title', 'job_apply_link',
       'job_description', 'job_city', 'job_country', 'job_posted_at_timestamp',
       'employer_website', 'employer_company_type'],
      dtype='object')

In [16]:
# Number of distinct job_city values, counting missing (NaN) as one value —
# hence one more than the 70 reported for job_city by df.nunique().
df['job_city'].nunique(dropna=False)

71

In [None]:
# imputer = SimpleImputer(strategy='most_frequent')
# imputer.fit_transform(df['job_city'])

In [None]:
# Impute job_city using the most frequent value
# from sklearn.impute import SimpleImputer
# impute_mode = SimpleImputer(strategy='most_frequent')
# impute_mode.fit(df[['job_city']])

# df[['job_city']] = impute_mode.fit_transform(df[['job_city']].values.reshape(-1,1))
# df['job_city'].isna().sum()


0

In [17]:
# Re-check dtypes and non-null counts.
# NOTE(review): the recorded output still shows job_city as object and only ten columns,
# which predates the categorical conversion and encoding columns added above — looks
# like an out-of-order run; re-run the notebook top to bottom to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   job_id                   166 non-null    object
 1   job_employment_type      166 non-null    object
 2   job_title                166 non-null    object
 3   job_apply_link           166 non-null    object
 4   job_description          166 non-null    object
 5   job_city                 128 non-null    object
 6   job_country              166 non-null    object
 7   job_posted_at_timestamp  166 non-null    object
 8   employer_website         113 non-null    object
 9   employer_company_type    76 non-null     object
dtypes: object(10)
memory usage: 13.1+ KB


In [None]:
# Drop missing rows
# df_copy = df.copy()

# df_copy.dropna(inplace=True)

# df_copy.isna().sum()

### Exploratory Data Analysis