# HR IPM Data

This notebook describes how the HR data is downloaded and transformed into a form suitable for further analysis.

There is also a little light characterisation, in terms of some of the major feature fields.

## Setup

In [None]:
import pandas
from bokeh.layouts import gridplot
from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.palettes import Category20

In [None]:
# Configure Bokeh so that subsequent `show()` calls render inline in this notebook.
output_notebook()

In [None]:
import json

In [None]:
# Load the Minio credentials from the local secrets file. The context manager
# guarantees the file handle is closed as soon as the JSON has been parsed.
with open("./secrets/secrets.json") as secrets_file:
    secrets = json.load(secrets_file)

In [None]:
from db_utils import minio_utils

## Downloading Datafiles
The data file has been provided as an Excel spreadsheet, placed in our Edge Minio. It needs to be downloaded and put into the `tempdata` directory.

In [None]:
# Fetch the KPA spreadsheet from the "writeonly" bucket (Edge classification)
# into the local tempdata/ directory.
edge_credentials = secrets["minio"]["edge"]
minio_utils.minio_to_file(
    "tempdata/Full KPA File.xlsx",
    minio_bucket="writeonly",
    minio_key=edge_credentials["access"],
    minio_secret=edge_credentials["secret"],
    data_classification=minio_utils.DataClassification.EDGE,
)

In [None]:
# Fetch the data-influencer name list from the "writeonly" bucket (Edge
# classification) into the local tempdata/ directory.
edge_credentials = secrets["minio"]["edge"]
minio_utils.minio_to_file(
    "tempdata/data_influencer_list.txt",
    minio_bucket="writeonly",
    minio_key=edge_credentials["access"],
    minio_secret=edge_credentials["secret"],
    data_classification=minio_utils.DataClassification.EDGE,
)

In [None]:
# Load the Data Strategy team membership table straight into a DataFrame.
# NOTE(review): the first positional argument is presumably the bucket name —
# confirm against minio_utils.minio_to_dataframe.
edge_credentials = secrets["minio"]["edge"]
data_strategy_team_members_df = minio_utils.minio_to_dataframe(
    "data-strategy-team-members",
    minio_key=edge_credentials["access"],
    minio_secret=edge_credentials["secret"],
    data_classification=minio_utils.DataClassification.EDGE,
)

In [None]:
# Load the Data Strategy DCC membership table straight into a DataFrame.
# NOTE(review): the first positional argument is presumably the bucket name —
# confirm against minio_utils.minio_to_dataframe.
edge_credentials = secrets["minio"]["edge"]
data_strategy_dcc_members_df = minio_utils.minio_to_dataframe(
    "data-strategy-dcc-members",
    minio_key=edge_credentials["access"],
    minio_secret=edge_credentials["secret"],
    data_classification=minio_utils.DataClassification.EDGE,
)

In [None]:
# Eyeball 10 randomly chosen rows as a sanity check on the download.
# NOTE(review): the frame has a `Name` column, so the rendered output likely
# contains personal data — clear this cell's output before sharing the notebook.
data_strategy_dcc_members_df.sample(10)

## Constructing list of Data Strategy Associated Names

In [None]:
# Read one influencer name per line. Each line is stripped and blank lines are
# skipped, so a trailing newline (or a stray "\r" from Windows line endings)
# does not produce an empty or mismatching "name" downstream.
with open('tempdata/data_influencer_list.txt') as data_influencer_file:
    data_influencer_list = [
        line.strip() for line in data_influencer_file if line.strip()
    ]

# Lower-cased copy used for case-insensitive matching against the other lists.
data_influencer_lower_list = pandas.Series(
    [name.lower() for name in data_influencer_list]
)

In [None]:
# Union of every lower-cased name associated with the Data Strategy: the
# influencer list plus the team and DCC membership tables, de-duplicated.
# `pandas.concat` replaces `Series.append`, which was removed in pandas 2.0.
data_strategy_affliated = pandas.DataFrame(
    pandas.concat(
        [
            data_influencer_lower_list,
            data_strategy_team_members_df.Name.str.lower(),
            data_strategy_dcc_members_df.Name.str.lower(),
        ],
        ignore_index=True,
    ).unique(),
    columns=["Name"],
)

In [None]:
# Publish the combined name list to the "data-strategy-affliated" bucket under
# the Confidential classification, as a gzip-compressed pickle.
confidential_credentials = secrets["minio"]["confidential"]
minio_utils.dataframe_to_minio(
    data_strategy_affliated,
    minio_bucket="data-strategy-affliated",
    minio_key=confidential_credentials["access"],
    minio_secret=confidential_credentials["secret"],
    data_classification=minio_utils.DataClassification.CONFIDENTIAL,
    file_format="pickle.gz",
)

## Converting HR Data to Dataframe and uploading

In [None]:
# Parse the KPA spreadsheet (downloaded into tempdata/ earlier in this notebook)
# into a raw DataFrame; clean-up happens in the following cells.
kpa_df = pandas.read_excel('./tempdata/Full KPA File.xlsx')

### Data Munging

In [None]:
# Inspect the raw values of the second row (position 1) of the sheet.
# NOTE(review): presumably this row holds the real column headings, since the
# first two rows are dropped next — confirm against the spreadsheet.
print(kpa_df.iloc[1].values)

Removing some unnecessary rows at the beginning of the spreadsheet:

In [None]:
# Discard the first two rows of the raw sheet by their index labels; the raw
# `kpa_df` is left untouched so this cell can be re-run safely.
leading_row_labels = kpa_df.index[:2]
formatted_kpa_df = kpa_df.drop(index=leading_row_labels)

Setting column names, without spaces:

In [None]:
# Positional rename: the order below must match the spreadsheet's column order.
formatted_kpa_df.columns = [
    # Organisational unit
    'Directorate', 'Department',
    # Employee
    'EmployeeNumber', 'EmployeeName',
    # Position
    'PositionNumber', 'PositionName', 'PayScaleGroup',
    # Appraisal template and criteria
    'Template', 'CriteriaGroup', 'Criterion', 'Row', 'AppraisalScoreWeight',
]

In the `Row` field, spaces are often inserted between the `L` and the number, e.g. `L 1`. Removing those spaces:

In [None]:
# Collapse any whitespace between "L" and its number ("L 1" -> "L1").
# `regex=True` must be explicit: a callable replacement is only valid for regex
# patterns, and pandas >= 2.0 defaults to `regex=False` (raising ValueError).
formatted_kpa_df['Row'] = formatted_kpa_df['Row'].str.replace(
    r'L\s*(?P<level>\d+)',
    lambda m: 'L' + m.group('level'),
    regex=True,
)

Extracting the T Level into a separate, standalone field:

In [None]:
# Capture the digits that follow the first "T" in PayScaleGroup. The result is
# stored as float so that rows without a match can hold NaN.
t_level_digits = formatted_kpa_df['PayScaleGroup'].str.extract(r'T(\d+).*')
formatted_kpa_df['TLevel'] = t_level_digits.astype(float)

### Uploading

In [None]:
# Publish the cleaned KPA frame to the "hr-ipm-data" bucket under the
# Confidential classification, as a gzip-compressed pickle.
confidential_credentials = secrets["minio"]["confidential"]
minio_utils.dataframe_to_minio(
    formatted_kpa_df,
    minio_bucket="hr-ipm-data",
    minio_key=confidential_credentials["access"],
    minio_secret=confidential_credentials["secret"],
    data_classification=minio_utils.DataClassification.CONFIDENTIAL,
    file_format="pickle.gz",
)

## HR Data Characterisation

### Tabular Form

In [None]:
# Write an HTML table of the number of distinct position numbers for each
# (Directorate, Department, PositionName) combination. After the reset, the
# `PositionNumber` column holds that count, and it is the final sort key.
(
    formatted_kpa_df
    .groupby(['Directorate', 'Department', 'PositionName'])['PositionNumber']
    .nunique()
    .reset_index()
    .sort_values(
        by=['Directorate', 'Department', 'PositionNumber'],
        ascending=False,
    )
    .to_html('./report/hr_data_summary_table.html', index=False)
)

### Plotting

In [None]:
# Distinct employees per directorate, largest first.
directorate_counts = (
    formatted_kpa_df
    .groupby(['Directorate'])['EmployeeNumber']
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Distinct employees per department, largest first.
department_counts = (
    formatted_kpa_df
    .groupby(['Department'])['EmployeeNumber']
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Distinct employees per T-level. Deliberately left in groupby order (ascending
# T-level) rather than sorted by count.
tlevel_counts = (
    formatted_kpa_df
    .groupby(['TLevel'])['EmployeeNumber']
    .nunique()
)

In [None]:
# Distinct employees per position name, largest first.
position_counts = (
    formatted_kpa_df
    .groupby(['PositionName'])['EmployeeNumber']
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# In addition to any inline rendering, write the summary grid to a standalone
# HTML report that loads BokehJS from the CDN.
output_file("report/hr_data_summary.html", mode="cdn")

TOOLS = ["save"]


def _bar_palette(n_bars):
    """Return `n_bars` colours taken from Category20, cycling beyond 20.

    `Category20` only has keys 3..20, so indexing it directly with the bar
    count raises KeyError for fewer than 3 or more than 20 bars. Building the
    list from the 20-colour palette always yields exactly one colour per bar.
    """
    base_colours = Category20[20]
    return [base_colours[i % len(base_colours)] for i in range(n_bars)]


def _breakdown_figure(counts, title, axis_label, top_n=None):
    """Build a vertical bar chart from a Series of counts.

    Parameters:
        counts: pandas Series; the index supplies the bar labels and the
            values supply the bar heights.
        title: figure title.
        axis_label: label for the x axis.
        top_n: if given, only the first `top_n` entries of `counts` are drawn.

    Returns:
        The configured Bokeh figure.
    """
    if top_n is not None:
        counts = counts.iloc[:top_n]
    # Categorical axes need string factors (T-levels are floats).
    labels = [str(label) for label in counts.index.values]

    # NOTE(review): `plot_height` was removed in Bokeh 3.0; switch to `height=`
    # when upgrading — confirm against the installed Bokeh version.
    fig = figure(width=400, plot_height=600, title=title, x_range=labels, tools=TOOLS)
    fig.vbar(
        labels, top=counts.values,
        width=0.9, color=_bar_palette(len(labels))
    )
    fig.xaxis.major_label_orientation = "vertical"
    fig.xaxis.axis_label = axis_label
    return fig


# Directorate Plot
directorate_figure = _breakdown_figure(
    directorate_counts, "Directorate Breakdown", "Directorate"
)

# Department Plot
depts = 15
department_figure = _breakdown_figure(
    department_counts, f"Department Breakdown (top {depts})", "Department", top_n=depts
)

# TLevel Plot
tlevel_figure = _breakdown_figure(tlevel_counts, "T-Level Breakdown", "T-Level")

# Position Plot (coloured by its own bar count, not by `depts`)
positions = 15
positions_figure = _breakdown_figure(
    position_counts, f"Position Breakdown (top {positions})", "Position Names", top_n=positions
)

# show the results
show(gridplot([
    [tlevel_figure, positions_figure],
    [directorate_figure, department_figure],
]))