In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): with no category/module filter this silences every warning for
# the rest of the session, including library deprecation notices — consider
# narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

In [2]:
# Render every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Load Dataset + Merge

In [3]:
# Read the raw postings table. The path is relative to the working directory;
# the recorded output of this cell shows the file was not found there.
postings = pd.read_csv(filepath_or_buffer='postings.csv')
print(postings.shape)  # (n_rows, n_columns)
postings.head(5)

FileNotFoundError: [Errno 2] No such file or directory: 'postings.csv'

In [None]:
# Print, per column, the share of missing values rounded to two decimals.
for col, na_vals in (
    postings.isnull().sum().div(postings.shape[0]).round(2).items()
):
    print(col, na_vals)

In [None]:
# Keep only the postings that have a value in 'normalized_salary'.
postings = postings.dropna(subset=['normalized_salary'])

In [None]:
# Number of distinct job ids left after the salary filter (NaN is not counted).
postings['job_id'].nunique(dropna=True)

In [None]:
# Load the company/industry table; it is joined to the postings on
# 'company_id' in the next step.
industries = pd.read_csv(filepath_or_buffer='companies/company_industries.csv')
industries.head(5)

In [None]:
# Inner join (the pandas default): postings whose company_id has no match in
# `industries` are dropped.
# NOTE(review): if a company appears on several rows of `industries`, each of
# its postings is repeated once per row — confirm this fan-out is intended.
postings = pd.merge(postings, industries, on='company_id')
print(postings.shape)
postings.head(5)

In [None]:
# Load the job/skill table (one row per job_id and skill abbreviation, judging
# by how it is joined and grouped below — TODO confirm against the file).
skills = pd.read_csv(filepath_or_buffer='jobs/job_skills.csv')
skills.head(5)

In [None]:
# Load the skill lookup table and expose its 'skill_name' column as 'field'.
skills_mapping = pd.read_csv('mappings/skills.csv')
skills_mapping = skills_mapping.rename(columns={'skill_name': 'field'})
skills_mapping.head(5)

In [None]:
# Attach the readable 'field' to every job/skill row, then discard the
# abbreviation that was only needed as the join key.
skills = pd.merge(skills, skills_mapping, on='skill_abr').drop(columns=['skill_abr'])

In [None]:
# Collapse to one row per job_id, with its fields joined into a single
# comma-separated string.
grouped_skills = (
    skills.groupby('job_id')['field']
    .agg(', '.join)
    .reset_index()
)
grouped_skills

In [None]:
# Inner join on job_id: postings without any row in `grouped_skills` are
# dropped, and the combined 'field' string is added as a column.
postings = pd.merge(postings, grouped_skills, on='job_id')
print(postings.shape)
postings.head(5)

# Data Cleaning

In [None]:
# Work on an independent (deep) copy so the merged `postings` frame is left
# untouched by the cleaning steps.
postings_cleaned = postings.copy(deep=True)

In [None]:
# Remove the columns that are not used in the rest of the analysis.
postings_cleaned = postings_cleaned.drop(
    columns=[
        'max_salary', 'pay_period', 'company_id',
        'med_salary', 'original_listed_time', 'job_posting_url',
        'application_url', 'expiry', 'min_salary', 'closed_time',
        'listed_time', 'work_type', 'currency', 'fips', 'sponsored',
        'remote_allowed', 'title', 'skills_desc', 'applies',
    ]
)

In [None]:
# Shorten column names. 'formatted_work_type' can take the name 'work_type'
# because the raw 'work_type' column was dropped in the previous step.
postings_cleaned = postings_cleaned.rename(
    columns=dict(
        company_name='company',
        formatted_work_type='work_type',
        formatted_experience_level='experience',
        compensation_type='compensation',
        normalized_salary='average_salary',
    )
)

In [None]:
# Normalise compensation labels: lower-case, underscores to spaces, Title Case.
postings_cleaned['compensation'] = (
    postings_cleaned['compensation']
    .str.lower()
    .str.replace('_', ' ')
    .str.title()
)

In [None]:
# Normalise work-type labels: hyphens to spaces, then Title Case.
postings_cleaned['work_type'] = (
    postings_cleaned['work_type']
    .str.replace('-', ' ')
    .str.title()
)

In [None]:
# Display the cleaned frame to inspect the result of the renames and label
# clean-up above.
postings_cleaned

In [None]:
# Print, per column of the cleaned frame, the share of missing values rounded
# to two decimals (the value is a fraction of rows, not an absolute count).
for col, na_count in (
    postings_cleaned.isnull().sum().div(postings_cleaned.shape[0]).round(2).items()
):
    print(col, na_count)

In [None]:
# Outlier removal with the 1.5 * IQR rule on 'average_salary'.
Q1, Q3 = (postings_cleaned['average_salary'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() is inclusive on both ends by default, i.e. lower <= x <= upper.
postings_cleaned = postings_cleaned[
    postings_cleaned['average_salary'].between(lower_bound, upper_bound)
]
print(postings_cleaned.shape)
postings_cleaned

# Exploratory Data Analysis

In [None]:
# Configure the theme in one call. Setting context="talk" and then calling
# set_context("notebook", font_scale=1.2) leaves only the latter in effect,
# so the effective settings are stated directly here.
sns.set_theme(style="whitegrid", context="notebook", font_scale=1.2)

# stat='probability' normalises the bars so that their heights sum to 1;
# kde=True overlays a smoothed density estimate.
sns.histplot(data=postings_cleaned, x="average_salary", bins=30, kde=True, color="skyblue", stat='probability')

plt.xlabel("Salary", fontsize=16)
# The y-axis shows normalised probabilities, not raw counts, so label it as such.
plt.ylabel("Probability", fontsize=16)
plt.title("Distribution of Salaries", fontsize=20)
plt.show()

In [None]:
# One row per (posting, skill). Both frames carry a 'field' column, so the
# default merge suffixes apply: 'field_x' is the comma-joined string from the
# postings side and 'field_y' is the single field from `skills`.
ungrouped_skills = pd.merge(postings_cleaned, skills, on='job_id')

In [None]:
# Display the per-skill frame produced by merging the cleaned postings with
# `skills` on job_id.
ungrouped_skills

In [None]:
plt.figure(figsize=(15, 10))

sns.set_theme(style="whitegrid", context="talk")

# 'field_y' is the per-skill field column; the suffix comes from the merge
# that built `ungrouped_skills`, where both inputs had a 'field' column.
ax = sns.boxplot(data=ungrouped_skills, x='field_y', y='average_salary', color="skyblue")

# Tilt the category labels so the field names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)

ax.set_xlabel('Fields', fontsize=24)
ax.set_ylabel('Average Salary', fontsize=24)
ax.set_title('Distribution of Normalized Salary Within Each Field', fontsize=30)

plt.show()