In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

import pandas as pd
import numpy as np

# Skill Set Plots

This notebook contains the code that generates the skill set plots. All data sets are generated in a separate notebook.

## Loading the data

First, let's load the web-scraped data:
1. `programming_skills` loads the programming skills for all jobs
2. `python_skills` loads the Python libraries found for the inspected positions
3. `tech_skills` loads the other skills (e.g. AWS, Hadoop, Tableau, etc.) for all the jobs

Afterwards, we label each table with its skill type (a new `type` column) and concatenate them. Thus, `skills` will contain all the relevant information.

In [5]:
# Read the three scraped summary tables in one pass. Every file is parsed the
# same way: its first CSV column becomes the row index.
programming_skills, python_skills, tech_skills = (
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in (
        "../data/raw/programming_summary_2.csv",
        "../data/raw/python_summary_2.csv",
        "../data/raw/skills_summary_2.csv",
    )
)

In [7]:
# Tag every row of each table with the skill type it came from, so the origin
# is still known once the tables are concatenated.
for skills_table, skill_type in (
    (programming_skills, "programming"),
    (python_skills, "python"),
    (tech_skills, "tech"),
):
    skills_table["type"] = skill_type

In [25]:
# Stack the three labelled tables row-wise into one long table. The row labels
# of each input are kept as-is (ignore_index is not set), so they may repeat
# across skill types.
labelled_tables = [programming_skills, python_skills, tech_skills]
skills = pd.concat(labelled_tables)
skills.head(n=3)

Unnamed: 0,skill,count,type
0,python,2208,programming
1,scala,121,programming
2,powershell,115,programming


## Loading the skills by job category

The `df_all` data frame contains the set of skills found for the `data scientist`, `data analyst`, and `data engineer` positions. This data frame is generated in a separate notebook.

In [2]:
# Skill counts broken down by job category; the first CSV column is the index.
skills_by_category_csv = "../data/raw/skills_by_category.csv"
df_all = pd.read_csv(skills_by_category_csv, index_col=[0])
df_all.head(n=5)

Unnamed: 0,skill,count,type,keyword
0,sql,300,programming,data scientist
1,python,200,programming,data scientist
2,r,150,programming,data scientist
3,javascript,50,programming,data scientist
4,php,25,programming,data scientist


## Plotting by group

1. `fig1` displays the programming skills per job category. To build it, `df_all` is first filtered down to the rows of type `programming`.
2. `fig2` displays the python skills per job category
3. `fig3` displays the tech skills per job category

These are the plots shown in the slideshow.

### Figure 1 (slide 11)

In [3]:
# Keep only the rows describing programming-language skills.
df_programming = df_all[df_all["type"] == "programming"]

# Stacked bar chart: one trace per job category ("keyword"), so the bars of the
# different categories pile up on top of each other for every skill.
fig1 = go.Figure(layout={"barmode": "stack"})

for category, category_rows in df_programming.groupby(by="keyword"):
    fig1.add_trace(
        go.Bar(x=category_rows["skill"], y=category_rows["count"], name=category)
    )

fig1.update_layout(
    title="Required Programming Skills",
    title_x=0.5,  # horizontally centre the title
    font_size=16,
    height=800,
    width=1500,
    xaxis=dict(title="Skill"),
    yaxis=dict(title="Number of occurrences in Job Postings"),
)
fig1.show()

### Figure 2 (slide 12)

In [4]:
# Keep only the rows describing Python-library skills.
df_python = df_all[df_all["type"] == "python"]

# Stacked bar chart: one trace per job category ("keyword"), so the bars of the
# different categories pile up on top of each other for every skill.
fig2 = go.Figure(layout={"barmode": "stack"})

for category, category_rows in df_python.groupby(by="keyword"):
    fig2.add_trace(
        go.Bar(x=category_rows["skill"], y=category_rows["count"], name=category)
    )

fig2.update_layout(
    title="Required Python Skills",
    title_x=0.5,  # horizontally centre the title
    font_size=16,
    height=800,
    width=1500,
    xaxis=dict(title="Skill"),
    yaxis=dict(title="Number of occurrences in Job Postings"),
)
fig2.show()

### Figure 3 (slide 12)

In [5]:
# Keep only the rows describing the remaining tech skills.
df_tech = df_all[df_all["type"] == "tech"]

# Stacked bar chart: one trace per job category ("keyword"), so the bars of the
# different categories pile up on top of each other for every skill.
fig3 = go.Figure(layout={"barmode": "stack"})

for category, category_rows in df_tech.groupby(by="keyword"):
    fig3.add_trace(
        go.Bar(x=category_rows["skill"], y=category_rows["count"], name=category)
    )

fig3.update_layout(
    title="Required Tech Skills",
    title_x=0.5,  # horizontally centre the title
    font_size=16,
    height=800,
    width=1500,
    xaxis=dict(title="Skill"),
    yaxis=dict(title="Number of occurrences in Job Postings"),
)
fig3.show()