In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
import plotly.io as pio
pio.renderers.default = 'iframe'

In [3]:
import plotly.express as px
from jupyter_dash import JupyterDash
from dash import dcc, html
from dash.dependencies import Input, Output

import logging
log = logging.getLogger('werkzeug')
log.setLevel(logging.ERROR)

import random

In [4]:
skills_df = pd.read_csv('../data/final_cleaned_files/all_skills_info.csv')
edu_df = pd.read_csv('../data/final_cleaned_files/all_education_info.csv')
exp_df = pd.read_csv('../data/final_cleaned_files/all_name_and_experience_info.csv')

In [5]:
skills_df.head()

Unnamed: 0,profile_id_dummy,all_skills_link,skills_list,.net,8051 microcontroller,a/b testing,abaqus,accounting,adobe design programs,advertising,...,vietnamese,visual analytics,water,water treatment,wellness,workforce planning,working capital management,writing,written communication,xslt
0,DataScience_0,https://www.linkedin.com/in/david-benham-4582b...,"['SQL', 'Business Intelligence', 'Data Warehou...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DataScience_1,https://www.linkedin.com/in/siddharthmahapatra...,"['Analytics', 'Statistical Modeling', 'Statist...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DataScience_2,https://www.linkedin.com/in/vamsi-nellutla/det...,"['Software Project Management', 'SDLC', 'Proje...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DataScience_3,https://www.linkedin.com/in/jing-xu-39447146/d...,"['Python (Programming Language)', 'SQL', 'R', ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DataScience_4,https://www.linkedin.com/in/shaunakbangale/det...,"['Business Analytics', 'Python', 'Machine Lear...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
exp_df.head()

Unnamed: 0,profile_id_dummy,company,positions,durations_cleaned,start_date,end_date,names,profile_url,profile_heading,profile_category
0,DataScience_0,Jasper County Collector of Revenue,Deputy Collector of Revenue,2006 - 2010,2006-11-03,2010-11-03,David Benham,https://www.linkedin.com/in/david-benham-4582b...,Principal Data Scientist & Manager of Innovati...,DataScience
1,DataScience_0,Leggett & Platt,Advanced Application Developer Analyst,May 2010 - May 2012,2010-05-03,2012-05-03,David Benham,https://www.linkedin.com/in/david-benham-4582b...,Principal Data Scientist & Manager of Innovati...,DataScience
2,DataScience_0,Chesapeake Energy,Senior Software Developer,May 2012 - Oct 2017,2012-05-03,2017-10-03,David Benham,https://www.linkedin.com/in/david-benham-4582b...,Principal Data Scientist & Manager of Innovati...,DataScience
3,DataScience_0,Chesapeake Energy,Data Scientist,Oct 2017 - Feb 2019,2017-10-03,2019-02-03,David Benham,https://www.linkedin.com/in/david-benham-4582b...,Principal Data Scientist & Manager of Innovati...,DataScience
4,DataScience_0,Chesapeake Energy,Senior Data Scientist,Feb 2019 - Nov 2019,2019-02-03,2019-11-03,David Benham,https://www.linkedin.com/in/david-benham-4582b...,Principal Data Scientist & Manager of Innovati...,DataScience


In [7]:
edu_df.head()

Unnamed: 0,profile_id_dummy,education_institute,degree_name,start_year_degree,end_year_degree,profile_category
0,DataScience_0,Missouri Southern State University,"Bachelor's degree, Mathematics",2005,2010,DataScience
1,DataScience_1,Delhi University,"Bachelor of Arts (B.A.), Economics",2002,2005,DataScience
2,DataScience_1,Anna University,"Master of Science (M.Sc.), Economics",2006,2008,DataScience
3,DataScience_2,Northcentral University,"Doctor of Philosophy - PhD, Data Science Speci...",1900,1900,DataScience
4,DataScience_2,The University of Iowa Tippie College of Business,"MBA, Business Analytics",2007,2007,DataScience


In [8]:
edu_df['profile_category'].unique()

array(['DataScience', 'CTO', 'Consultant'], dtype=object)

# Dash APP

In [28]:
############# Initialize Dash App #############

app = JupyterDash(__name__)

############# Define Plot Functions #############

@app.callback(
    Output('skills-graph', 'figure'),
    [Input("profile-type-dropdown", "value")]
)
def skills_barplot(profile_type):
    single_profile_type_skills = skills_df[skills_df['profile_category'] == profile_type].copy()
    skills_cols = single_profile_type_skills.columns.difference(
        ['profile_id_dummy', 'all_skills_link', 'skills_list', 'profile_category']
    )

    top_skills = single_profile_type_skills[skills_cols].sum(axis=0).sort_values(ascending=False).reset_index()
    top_skills.columns = ['skill_name', 'count_people']
    top_skills['percent_people'] = round((top_skills['count_people']/single_profile_type_skills.shape[0])*100,0)

    plt_df = top_skills.head(30)
    
    fig = px.bar(
        plt_df, 
        x = 'skill_name',
        y = 'percent_people',
        text = 'percent_people',
        hover_data = ['percent_people'],
        height = 600,
        width = 1100,
        labels = dict(
            skill_name = "Skill Name", 
            percent_people = "Percent of People %"
        ),
        title = 'Top 30 Skills for ' + profile_type,
        template = 'simple_white'
    )
    fig.update_xaxes(tickangle=270)
    fig.update_layout(
        title={
            'text' : 'Top 30 Skills for ' + profile_type,
            'y' : 0.9,
            'x' : 0.5,
            'font' : dict(
                size = 18
            )
        }
    )
    return fig


def highest_education_barplot():
    fig = px.bar(
        highest_education_level, 
        x = 'highest_edu_level',
        y = 'percent_people',
        text = 'percent_people',
        facet_col = 'profile_category',
        color = 'profile_category',
        hover_data = ['percent_people'],
        height = 600,
        width = 1100,
        labels = dict(
            highest_edu_level = "Highest Education Level", 
            percent_people = "Percent of People %",
            profile_category = 'Profile Category'
        ),
        title = 'Higest Education Level',
        template = 'simple_white'
    )
    fig.update_xaxes(tickangle=270)
    fig.update_layout(
        title = {
            'text' : 'Higest Education Level for All Profile Categories',
            'y' : 0.95,
            'x' : 0.5,
            'font' : dict(
                size = 18
            )
        },
        barmode = 'stack'
    )
    return fig


############# Define Dash Components #############

skills_dash_component = [
    html.H1("Analyzing the Top Skills for Each Profile Type"),
    html.Div(
        [
            dcc.Dropdown(
                id = 'profile-type-dropdown',
                clearable=False,
                value = 'DataScience',
                options = [
                    {'label': item, 'value': item}
                    for item in ['DataScience', 'CTO', 'Consultant']
                ]
            )
        ],
        style = {
            'width': '30%', 'display': 'inline-block'
        }
    ),
    html.Br(),
    html.Div(
        [dcc.Graph(id = 'skills-graph')], 
        style = {
            'width': '100%'
        }
    ),
]

intermediate_component = [
    html.Br(),
    html.Hr(
        style = {
            'width': '90%', 'height': '1px', 'color': 'black', 'background': 'black'
        }
    ),
    html.Br()
]

highest_edu_dash_component = [
    html.H1("Analyzing the Highest Education Level for Each Profile Type"),
    html.Br(),
    html.Div(
        [dcc.Graph(id = 'highest-edu-graph', figure = highest_education_barplot())],
        style = {
            'width': '100%'
        }
    ),
]

############# Bind Dash App Components Together #############

app.layout = html.Center(
    skills_dash_component + intermediate_component + highest_edu_dash_component,
    style = {
        'justify' : 'center', 'align' : 'center'
    }
)

In [29]:
app.run_server(mode='external', port = random.randrange(3000, 4000) , debug=True)

Dash app running on http://127.0.0.1:3303/


# Skills Analysis

# Education Level

In [9]:
edu_df['degree_name'] = edu_df['degree_name'].str.lower()

**Bachelor**

In [10]:
bachelor_pattern = (
    r'^(b\.{0,}\s{0,1}tech\.{0,}|b\.{0,}s\.{0,}|bachelor\’{0,}\'{0,}s{0,}|bachellor|'
    r'b\.{0,}sc\.{0,}|b\.{0,}a\.{0,}|b\.{0,}b\.{0,}a\.{0,}|b\.{0,}e\.{0,})(?:,|\)|\(|\s)'
)
print(bachelor_pattern)

edu_df['degree_bachelor'] = edu_df['degree_name'].apply(
    lambda x: 'bachelor' if re.match(bachelor_pattern, str(x)) is not None else None
)
edu_df.head()

^(b\.{0,}\s{0,1}tech\.{0,}|b\.{0,}s\.{0,}|bachelor\’{0,}\'{0,}s{0,}|bachellor|b\.{0,}sc\.{0,}|b\.{0,}a\.{0,}|b\.{0,}b\.{0,}a\.{0,}|b\.{0,}e\.{0,})(?:,|\)|\(|\s)


Unnamed: 0,profile_id_dummy,education_institute,degree_name,start_year_degree,end_year_degree,profile_category,degree_bachelor
0,DataScience_0,Missouri Southern State University,"bachelor's degree, mathematics",2005,2010,DataScience,bachelor
1,DataScience_1,Delhi University,"bachelor of arts (b.a.), economics",2002,2005,DataScience,bachelor
2,DataScience_1,Anna University,"master of science (m.sc.), economics",2006,2008,DataScience,
3,DataScience_2,Northcentral University,"doctor of philosophy - phd, data science speci...",1900,1900,DataScience,
4,DataScience_2,The University of Iowa Tippie College of Business,"mba, business analytics",2007,2007,DataScience,


**Master**

In [11]:
master_pattern = (
    r"^(master\'{0,1}\’{0,1}s{0,1}|m\.{0,1}s\.{0,1}|m\.{0,1}a\.{0,1}|m\.{0,1}sc\.{0,1}|"
    r"m\.{0,1}eng\.{0,1}|mca|post graduate)(?:,|\)|\(|\s)"
)
edu_df['degree_master'] = edu_df['degree_name'].apply(
    lambda x: 'master' if re.match(master_pattern, str(x)) is not None else None
)
edu_df.head()

Unnamed: 0,profile_id_dummy,education_institute,degree_name,start_year_degree,end_year_degree,profile_category,degree_bachelor,degree_master
0,DataScience_0,Missouri Southern State University,"bachelor's degree, mathematics",2005,2010,DataScience,bachelor,
1,DataScience_1,Delhi University,"bachelor of arts (b.a.), economics",2002,2005,DataScience,bachelor,
2,DataScience_1,Anna University,"master of science (m.sc.), economics",2006,2008,DataScience,,master
3,DataScience_2,Northcentral University,"doctor of philosophy - phd, data science speci...",1900,1900,DataScience,,
4,DataScience_2,The University of Iowa Tippie College of Business,"mba, business analytics",2007,2007,DataScience,,


**PhD**

In [12]:
phd_pattern = (
    r"^(p\.{0,1}h\.{0,1}d\.{0,1}|doctor of philosophy|doctorate)(?:,|\)|\(|\s)"
)
edu_df['degree_doctorate'] = edu_df['degree_name'].apply(
    lambda x: 'phd' if re.match(phd_pattern, str(x)) is not None else None
)
edu_df.head()

Unnamed: 0,profile_id_dummy,education_institute,degree_name,start_year_degree,end_year_degree,profile_category,degree_bachelor,degree_master,degree_doctorate
0,DataScience_0,Missouri Southern State University,"bachelor's degree, mathematics",2005,2010,DataScience,bachelor,,
1,DataScience_1,Delhi University,"bachelor of arts (b.a.), economics",2002,2005,DataScience,bachelor,,
2,DataScience_1,Anna University,"master of science (m.sc.), economics",2006,2008,DataScience,,master,
3,DataScience_2,Northcentral University,"doctor of philosophy - phd, data science speci...",1900,1900,DataScience,,,phd
4,DataScience_2,The University of Iowa Tippie College of Business,"mba, business analytics",2007,2007,DataScience,,,


**MBA**

In [13]:
mba_pattern = (
    r"(m\.{0,1}b\.{0,1}a\.{0,1}|master of business|p\.{0,1}g\.{0,1}d\.{0,1}m\.{0,1}|post graduate diploma in management|"
    r"post graduate programm{0,1}e{0,1} in management |pgp|pgpx)(?:,|\)|\(|\s)"
)
edu_df['degree_mba'] = edu_df['degree_name'].apply(
    lambda x: 'mba' if re.match(mba_pattern, str(x)) is not None else None
)
edu_df.head()

Unnamed: 0,profile_id_dummy,education_institute,degree_name,start_year_degree,end_year_degree,profile_category,degree_bachelor,degree_master,degree_doctorate,degree_mba
0,DataScience_0,Missouri Southern State University,"bachelor's degree, mathematics",2005,2010,DataScience,bachelor,,,
1,DataScience_1,Delhi University,"bachelor of arts (b.a.), economics",2002,2005,DataScience,bachelor,,,
2,DataScience_1,Anna University,"master of science (m.sc.), economics",2006,2008,DataScience,,master,,
3,DataScience_2,Northcentral University,"doctor of philosophy - phd, data science speci...",1900,1900,DataScience,,,phd,
4,DataScience_2,The University of Iowa Tippie College of Business,"mba, business analytics",2007,2007,DataScience,,,,mba


### Education Analysis

In [14]:
edu_df['education_degree_all'] = edu_df['degree_bachelor'].astype(str) + '#' + edu_df['degree_master'].astype(str) + '#' +\
        edu_df['degree_mba'].astype(str) + '#' + edu_df['degree_doctorate'].astype(str)

In [15]:
profile_level_education = edu_df.groupby(['profile_category', 'profile_id_dummy'])['education_degree_all'].agg(
    lambda x: list(dict.fromkeys(('#'.join(x)).split('#')))
).reset_index()

In [16]:
def get_highest_level_of_edu(x):
    if 'phd' in x:
        edu_level = 'phd'
    elif 'mba' in x:
        edu_level = 'mba'
    elif 'master' in x:
        edu_level = 'master'
    elif 'bachelor' in x:
        edu_level = 'bachelor'
    else:
        edu_level = 'other'
    
    return edu_level

profile_level_education['highest_edu_level'] = profile_level_education['education_degree_all'].apply(get_highest_level_of_edu)

In [17]:
highest_education_level = profile_level_education.groupby(
    ['profile_category', 'highest_edu_level']
)['profile_id_dummy'].nunique().reset_index()

highest_education_level['percent_people'] = highest_education_level['profile_id_dummy']/highest_education_level.groupby(
    'profile_category'
)['profile_id_dummy'].transform('sum')
highest_education_level['percent_people'] = (highest_education_level['percent_people']*100).astype(int)
highest_education_level = highest_education_level.sort_values(
    ['percent_people'], ascending = [False]
).reset_index(drop=True)
highest_education_level.head()

Unnamed: 0,profile_category,highest_edu_level,profile_id_dummy,percent_people
0,Consultant,mba,91,61
1,DataScience,master,50,42
2,DataScience,phd,49,41
3,CTO,bachelor,31,30
4,CTO,master,20,19


In [18]:
highest_education_level

Unnamed: 0,profile_category,highest_edu_level,profile_id_dummy,percent_people
0,Consultant,mba,91,61
1,DataScience,master,50,42
2,DataScience,phd,49,41
3,CTO,bachelor,31,30
4,CTO,master,20,19
5,Consultant,master,28,19
6,CTO,phd,18,17
7,CTO,mba,17,16
8,CTO,other,16,15
9,Consultant,bachelor,18,12
