# IPM Data Analysis

This notebook describes the analysis of the HR IPM data with respect to answering questions around different types of data-related work happening at the City.

## Setup

In [None]:
import spacy
import pandas
import numpy
import sklearn.decomposition
import sklearn.metrics.pairwise
import sklearn.preprocessing

from tqdm import tqdm
from tqdm.auto import tqdm

from bokeh.layouts import gridplot
from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.palettes import Category20, Category20b, Category20c
from bokeh.models import ColumnDataSource, LabelSet, Legend, BoxAnnotation, LinearAxis, Range1d, Arrow
from bokeh.transform import dodge
from bokeh.core.properties import value

In [None]:
output_notebook()
tqdm.pandas()

In [None]:
import multiprocessing
import json
from collections import Counter
import pprint

In [None]:
# Load the MinIO credentials. The context manager closes the file handle as
# soon as the JSON has been parsed instead of leaving it open for the kernel's
# lifetime.
with open("./secrets/secrets.json") as secrets_file:
    secrets = json.load(secrets_file)

In [None]:
from db_utils import minio_utils

## Getting Data

In [None]:
hr_na_df = minio_utils.minio_to_dataframe(
    minio_bucket="hr-ipm-data-non-admin",
    minio_key=secrets["minio"]["confidential"]["access"],
    minio_secret=secrets["minio"]["confidential"]["secret"],
    data_classification=minio_utils.DataClassification.CONFIDENTIAL,
)

In [None]:
hr_admin_df = minio_utils.minio_to_dataframe(
    minio_bucket="hr-ipm-data-admin",
    minio_key=secrets["minio"]["confidential"]["access"],
    minio_secret=secrets["minio"]["confidential"]["secret"],
    data_classification=minio_utils.DataClassification.CONFIDENTIAL,
)

In [None]:
data_strategy_affliated = minio_utils.minio_to_dataframe(
    minio_bucket="data-strategy-affliated",
    minio_key=secrets["minio"]["confidential"]["access"],
    minio_secret=secrets["minio"]["confidential"]["secret"],
    data_classification=minio_utils.DataClassification.CONFIDENTIAL,
)

### For Example?

Creating an example df that will be used to illustrate the mapping process, using yours truly:

In [None]:
employee_name = 'Gordon Inggs'

example_hr_source_df = hr_na_df.query(
    "EmployeeName == @employee_name"
)[["Directorate", "Department", "PositionName", "CriteriaGroup", "Row", "AppraisalScoreWeight"]]

example_hr_source_df

In [None]:
example_hr_source_df.to_html(
    './report/gordon_source_df.html', index=False
)

## Vector Space Embedding

Using [spaCy's English large model](https://github.com/explosion/spacy-models/releases/tag/en_core_web_lg-2.1.0).

In [None]:
nlp = spacy.load('en_core_web_lg')

### Parallel Helper

Many of the operations below are embarrassingly parallel, so it makes sense to define a parallel helper function.

In [None]:
N_PROCS = 16
N_CHUNKS = N_PROCS*4
MIN_CHUNKSIZE = 10000

def parallel_helper(data_df, df_apply, unordered=True):
    """Apply ``df_apply`` to row-wise chunks of ``data_df`` and concatenate the results.

    The frame is cut into at most ``N_CHUNKS`` contiguous slices of at least
    ``MIN_CHUNKSIZE`` rows each. Only slices that actually contain rows are
    produced, so small frames no longer send dozens of empty chunks to the
    workers.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame to process; it is sliced positionally with ``iloc``.
    df_apply : callable
        Called with one chunk and expected to return something
        ``pandas.concat`` accepts. It is sent to worker processes whenever
        more than one chunk is needed, so it must be picklable in that case.
    unordered : bool, default True
        If True, chunk results are concatenated in completion order
        (``imap_unordered``); if False, in chunk order (``imap``).

    Returns
    -------
    The ``pandas.concat`` of the per-chunk results.
    """
    n_rows = data_df.shape[0]
    chunk_size = max(n_rows // N_CHUNKS + 1, MIN_CHUNKSIZE)
    chunk_starts = range(0, n_rows, chunk_size)

    # Zero or one chunk: a process pool would be pure overhead, so run in-process.
    if len(chunk_starts) <= 1:
        return pandas.concat([df_apply(data_df)])

    # Chunk up the dataframe (lazily, so slices are only built as they are dispatched)
    df_chunks = (
        data_df.iloc[start:start + chunk_size]
        for start in chunk_starts
    )

    # Never start more workers than there are chunks to hand out
    with multiprocessing.Pool(min(N_PROCS, len(chunk_starts))) as pool:
        imap_func = pool.imap_unordered if unordered else pool.imap

        # Pull it all back together while the pool is still alive
        results = pandas.concat(imap_func(df_apply, df_chunks))

    return results

### Stop Words

Before we do any sort of NLP, we need to define our domain specific stop words.

In [None]:
%%time

def get_non_stop_words(data_df):
    """Tokenise every ``Row`` value of ``data_df`` with the module-level ``nlp`` pipeline.

    Returns the result of ``data_df.Row.apply``: for each row, a list with the
    lower-cased text of the tokens that are neither punctuation nor stop words.
    """
    def keep_tokens(text):
        kept = []
        for token in nlp(text):
            if token.is_punct or token.is_stop:
                continue
            kept.append(token.text.lower())
        return kept

    return data_df.Row.apply(keep_tokens)

non_stop_words = parallel_helper(
    pandas.concat([hr_admin_df, hr_na_df]),
    get_non_stop_words,
)

In [None]:
non_stop_word_counter = Counter([
    word
    for row in non_stop_words
    for word in row
])

In [None]:
non_stop_word_counter.most_common()[:30]

In [None]:
# Domain-specific stop words, chosen from the most common tokens counted above.
stop_words = {
    "service", "delivery", # All of us are doing this
    "function", "functions", # City-dialect for job
    "orientation", "orientations", # City-dialect for skill
    "problem", "solving", # Again, all of us should be doing this
    "cfadm", "cfpro", "cfuni", "cfsup", "cfart", "cfman", "cftec", # Competency framework codes
    "kpaa", "kpan", # KPA classifications
    "l1",  "l2", "l3", "l4", "l5" # Level descriptions - captured in T-level
}
nlp.Defaults.stop_words |= stop_words

# Lexemes that already exist in the loaded vocabulary keep the `is_stop` flag
# they were created with, so extending the defaults alone may not be reflected
# in `token.is_stop`. Set the flag explicitly for every custom stop word.
for stop_word in stop_words:
    nlp.vocab[stop_word].is_stop = True

### Embedding

Now, to actually map the `Row` column values into the vector space.

In [None]:
%%time

def get_vectors(data_df):
    """Embed every ``Row`` value of ``data_df`` as a single word vector.

    ``Doc.vector`` averages the vectors of *all* tokens, so on its own it
    ignores the stop-word list entirely. Here only tokens that are neither
    stop words nor punctuation contribute to the mean. When nothing survives
    the filter (e.g. a row made up solely of stop words) the plain
    ``Doc.vector`` is used so that every row still gets a vector of the same
    length.

    Returns the result of ``data_df.Row.apply``: one numpy vector per row.
    """
    def embed(row):
        doc = nlp(row.lower())
        token_vectors = [
            token.vector
            for token in doc
            if not (token.is_stop or token.is_punct)
        ]
        if not token_vectors:
            return doc.vector
        return numpy.mean(token_vectors, axis=0)

    return data_df.Row.apply(embed)

# Embed both HR frames in place. Results are requested in chunk order
# (unordered=False) before being stored in the new "RowVector" column.
for hr_df in (hr_na_df, hr_admin_df):
    row_vectors = parallel_helper(data_df=hr_df, df_apply=get_vectors, unordered=False)
    hr_df["RowVector"] = row_vectors

In [None]:
example_hr_df = hr_na_df.query(
    "EmployeeName == @employee_name"
)

In [None]:
example_hr_df[
    ["Directorate", "Department", "PositionName", "CriteriaGroup", "Row", "RowVector", "AppraisalScoreWeight"]
].to_html(
    './report/gordon_source_wv_df.html', index=False
)

In [None]:
numpy.vstack(
    example_hr_df["RowVector"].values
).shape

### Reducing from Criteria -> Position

Using centre of mass formula:

$$C = \frac{\sum_{i=1}^{N}{W_i X_i}}{\sum_{i=1}^{N}{W_i}}$$

* $C$ - the resulting (weighted mean) vector
* $N$ - number of rows being combined
* $W_i$ - row $i$'s weight
* $X_i$ - row $i$'s vector

In [None]:
def reduce_to_cg(hr_df):
    """Collapse the row vectors of ``hr_df`` into one vector per criteria group.

    Rows are grouped by employee, position, criteria group and criterion; each
    group's ``RowVector`` values are combined as a mean weighted by
    ``AppraisalScoreWeight``.

    Returns a frame with the grouping columns followed by a
    ``CriteriaGroupVector`` column.
    """
    group_keys = [
        'Directorate', 'Department', 'EmployeeNumber', 'EmployeeName',
        'PositionNumber', 'PositionName', 'PayScaleGroup', 'Template',
        'CriteriaGroup', 'Criterion', 'TLevel'
    ]

    def weighted_centre(position_cg_df):
        weights = position_cg_df.AppraisalScoreWeight
        return (weights*position_cg_df.RowVector).sum()/(weights.sum())

    criteria_group_df = hr_df.groupby(group_keys).apply(weighted_centre).reset_index()

    # reset_index leaves the unnamed apply result in column 0; move it to a named column
    criteria_group_df["CriteriaGroupVector"] = criteria_group_df.pop(0)

    return criteria_group_df

In [None]:
criteria_group_admin_df = reduce_to_cg(hr_admin_df)
criteria_group_na_df = reduce_to_cg(hr_na_df)

In [None]:
def get_criteria_group_weights(criteria, index_string):
    """Return one weight per entry of ``criteria``, chosen by which criteria are present.

    Three combinations of criteria have a fixed weighting. Any other
    combination prints a warning that includes ``index_string`` and falls back
    to a weight of 1 for every entry.

    Returns a numpy array with the same length and order as ``criteria``.
    """
    known_weightings = {
        frozenset({'KPA', 'Competency objective'}):
            {'KPA': 70, 'Competency objective': 30},
        frozenset({'KPA', 'Competency objective', 'CMC'}):
            {'KPA': 70, 'Competency objective': 20, 'CMC': 10},
        frozenset({'CMC', 'Competency objective', 'KID objective', 'KPA'}):
            {'KPA': 20, 'Competency objective': 20, 'CMC': 30, 'KID objective': 30},
    }

    criteria_set = set(criteria.values)
    criteria_weights_dict = known_weightings.get(frozenset(criteria_set))

    if criteria_weights_dict is None:
        print(f"Irregular criteria set: '{criteria_set}' at index '{index_string}'. Falling back to equal weighting")
        return numpy.array([1 for _ in criteria])

    return numpy.array([criteria_weights_dict[name] for name in criteria])

In [None]:
def reduce_to_position(criteria_group_df):
    """Collapse criteria-group vectors into one vector per employee position.

    Rows are grouped by employee and position; each group's
    ``CriteriaGroupVector`` values are combined as a weighted mean, using the
    weights ``get_criteria_group_weights`` returns for the group's
    ``Criterion`` values.

    Returns a frame with the grouping columns followed by a ``PositionVector``
    column.
    """
    def weighted_centre(position_cg_df):
        # Compute the weights once per group: calling the helper a second time
        # repeated the lookup and, for irregular groups, printed the fallback
        # warning twice (the second time with an empty index).
        weights = get_criteria_group_weights(
            position_cg_df.Criterion, position_cg_df.index.values[0]
        )
        return (position_cg_df.CriteriaGroupVector*weights).sum() / weights.sum()

    position_df = criteria_group_df.groupby([
        'Directorate', 'Department', 'EmployeeNumber',
        'EmployeeName', 'PositionNumber', 'PositionName',
        'PayScaleGroup', 'Template', 'TLevel'
    ]).apply(weighted_centre).reset_index()

    # reset_index leaves the unnamed apply result in column 0; move it to a named column
    position_df["PositionVector"] = position_df.pop(0)

    return position_df

In [None]:
position_admin_df = reduce_to_position(criteria_group_admin_df)
position_na_df = reduce_to_position(criteria_group_na_df)

### Example plots

#### Assembling Example Data

In [None]:
example_criteria_group_df = criteria_group_na_df.query(
    "EmployeeName == @employee_name"
)[["Directorate", "Department", "PositionName", "CriteriaGroup", "CriteriaGroupVector"]]
example_criteria_group_df

In [None]:
example_criteria_group_df.to_html(
    './report/gordon_cg_df.html', index=False
)

In [None]:
example_position_df = position_na_df.query(
    "EmployeeName == @employee_name"
)[["Directorate", "Department", "PositionName", "PositionVector"]]
example_position_df

In [None]:
example_position_df.to_html(
    './report/gordon_position_df.html', index=False
)

In [None]:
example_df = example_hr_df.merge(
    example_criteria_group_df[["CriteriaGroup", "CriteriaGroupVector"]],
).merge(
    example_position_df[["PositionName", "PositionVector"]]
)[["Row", "RowVector", "CriteriaGroup", "CriteriaGroupVector", "PositionName", "PositionVector"]]

In [None]:
example_df

#### Reducing Vectors for Plotting

In [None]:
# Stack every distinct row, criteria-group and position vector for the PCA fit.
# pandas.concat replaces Series.append, which was removed in pandas 2.0.
# Duplicates are dropped on the stacked 2-D frame rather than on the Series,
# because the Series elements are numpy arrays and cannot be hashed.
all_vectors = pandas.DataFrame(
    numpy.vstack(
        pandas.concat([
            example_df.RowVector,
            example_df.CriteriaGroupVector,
            example_df.PositionVector,
        ]).values
    )
).drop_duplicates().values

In [None]:
pca = sklearn.decomposition.PCA(n_components=2).fit(all_vectors)

In [None]:
# Project each vector column through the fitted PCA. Every cell of the new
# "<column>Reduced" columns holds the array returned by pca.transform for that
# single vector.
for vector_column in ("RowVector", "CriteriaGroupVector", "PositionVector"):
    example_df[f"{vector_column}Reduced"] = example_df[vector_column].apply(
        lambda vector: pca.transform([vector])
    )

In [None]:
def produce_flat_vector(pd_series):
    """Stack the arrays held in ``pd_series`` into one 2-D array without repeated rows.

    The elements are stacked vertically, duplicate rows are removed (the first
    occurrence is kept, in its original position) and the result is returned
    as a numpy array.
    """
    stacked = numpy.vstack(pd_series.values)
    unique_rows = pandas.DataFrame(stacked).drop_duplicates()
    return unique_rows.values

row_source = ColumnDataSource(data={
    "pca1": produce_flat_vector(example_df.RowVectorReduced)[:,0],
    "pca2": produce_flat_vector(example_df.RowVectorReduced)[:,1],
    "labels": example_df.Row
})
cg_source = ColumnDataSource(data={
    "pca1": produce_flat_vector(example_df.CriteriaGroupVectorReduced)[:,0],
    "pca2": produce_flat_vector(example_df.CriteriaGroupVectorReduced)[:,1],
    "labels": example_df.CriteriaGroup.drop_duplicates()
})
position_source = ColumnDataSource(data={
    "pca1": produce_flat_vector(example_df.PositionVectorReduced)[:,0],
    "pca2": produce_flat_vector(example_df.PositionVectorReduced)[:,1],
    "labels": ["Gordon"]   
})

#### Plotting

In [None]:
output_file('./report/hr_translation_I.html')

fig = figure(
    width=600, height=600, title="Mapping Gordon's Criteria into his City Position", 
    x_range=(-2.1, 4.5), background_fill_color="#fafafa"
)

colour_dict = {
    "Criteria": "Blue",
    "Criteria Group": "Red",
    "Position": "Green"
}
scatter_size_dict = {
    "Criteria": 10,
    "Criteria Group": 20,
    "Position": 50
}
font_size_dict = {
    "Criteria": "10px",
    "Criteria Group": "15px",
    "Position": "20px"
}
    

def add_line(start_coord, end_coord, colour_key):
    """Draw a head-less connector on the module-level ``fig`` from ``start_coord`` towards ``end_coord``.

    Both coordinates are indexed as ``[0, 0]`` (x) and ``[0, 1]`` (y), i.e.
    they are expected to be 2-D arrays whose first row is the point. The line
    stops 1% short of ``end_coord`` and is coloured with
    ``colour_dict[colour_key]``.
    """
    start_x, start_y = start_coord[0, 0], start_coord[0, 1]
    # Pull the end point back by 1% of the start -> end displacement
    shortfall = (end_coord[0] - start_coord[0])*0.01
    connector = Arrow(
        x_start=start_x, y_start=start_y,
        x_end=end_coord[0, 0] - shortfall[0], y_end=end_coord[0, 1] - shortfall[1],
        start=None, end=None, line_color=colour_dict[colour_key],
    )
    fig.add_layout(connector)

# Row -> CG Arrows
example_df.groupby(["CriteriaGroup"], as_index=False).apply(
    lambda sub_df: sub_df.RowVectorReduced.apply(
        add_line,
        end_coord=sub_df.CriteriaGroupVectorReduced.values[0],
        colour_key="Criteria"
    )
)
# CG -> Position Arrows
example_df.groupby(["PositionName"], as_index=False).apply(
    lambda sub_df: sub_df.CriteriaGroupVectorReduced.apply(
        add_line,
        end_coord=sub_df.PositionVectorReduced.values[0],
        colour_key="Criteria Group"
    )
)

# Scatters
for (name, source) in zip(["Criteria", "Criteria Group", "Position"], 
                          [row_source, cg_source, position_source]):
    fig.scatter(
        x="pca1", y="pca2", source=source, 
        size=scatter_size_dict[name], color=colour_dict[name], legend=name,
    )
    labels = LabelSet(
        x="pca1", y="pca2", text="labels", source=source, 
        x_offset=scatter_size_dict[name]/1.5, 
        y_offset=-scatter_size_dict[name]/1.5, 
        text_font_size=font_size_dict[name]
    )
    fig.add_layout(labels)

fig.xaxis.visible = False
fig.yaxis.visible = False

show(fig)

### Example plot II - Plot Harder

In [None]:
# Plot a random subset of the non-admin positions. The fixed random_state makes
# the sample (and therefore the PCA fit and the plot) reproducible on re-run.
# sample() already returns a new frame, so no prior copy of the full frame is needed.
position_plot_na_df = position_na_df.sample(5000, random_state=42)
position_plot_admin_df = position_admin_df.copy()

In [None]:
pca = sklearn.decomposition.PCA(n_components=2).fit(
    numpy.vstack(
        pandas.concat((position_plot_na_df.PositionVector, position_plot_admin_df.PositionVector)).values
    )
)

for position_plot_df in (position_plot_na_df, position_plot_admin_df):
    position_plot_df["PositionVectorReduced"] = [
        vector 
        for vector in pca.transform(
            numpy.vstack(position_plot_df.PositionVector.values)
        )
    ]

In [None]:
pca.explained_variance_ratio_

In [None]:
def get_position_source_dict(position_plot_df):
    """Build one bokeh ``ColumnDataSource`` per directorate from ``position_plot_df``.

    Each source holds the first two components of ``PositionVectorReduced``
    (``pca1``/``pca2``) together with the title-cased directorate and
    department and the position name of the rows in that directorate.

    Returns a dict mapping directorate name to its ``ColumnDataSource``.
    """
    # NOTE(review): the keys come from the module-level position_plot_na_df
    # rather than from the argument. This is kept because generate_position_plot
    # looks the sources up with that same list - confirm that it is intended.
    directorates = position_plot_na_df.Directorate.unique()

    all_position_source = {}
    for directorate in directorates:
        # Select the directorate's rows once instead of once per column
        directorate_df = position_plot_df[position_plot_df.Directorate == directorate]

        if directorate_df.empty:
            # numpy.vstack raises on an empty sequence; a directorate with no
            # rows in this frame simply gets an empty source.
            reduced = numpy.empty((0, 2))
        else:
            reduced = numpy.vstack(directorate_df.PositionVectorReduced.values)

        all_position_source[directorate] = ColumnDataSource(data={
            "pca1": reduced[:,0],
            "pca2": reduced[:,1],
            "directorate": directorate_df.Directorate.str.title(),
            "department": directorate_df.Department.str.title(),
            "position": directorate_df.PositionName,
        })

    return all_position_source

In [None]:
def generate_position_plot(position_plot_df, output_path, plot_title):
    """Write and show a scatter plot of reduced position vectors, one series per directorate.

    The figure is saved to ``output_path`` with ``plot_title`` as its title.
    Directorates are taken, sorted, from the module-level
    ``position_plot_na_df``; each gets a ``Category20c`` colour and a legend
    entry made of the initials of its title-cased name.
    """
    sources_by_directorate = get_position_source_dict(position_plot_df)
    directorate_names = sorted(position_plot_na_df.Directorate.unique())

    output_file(output_path)

    hover_fields = [
        ("Department", "@department"),
        ("Position", "@position"),
    ]
    fig = figure(
        width=600, height=600, title=plot_title,
        tooltips=hover_fields,
        background_fill_color="#fafafa"
    )

    for position, directorate in enumerate(directorate_names):
        initials = "".join(word[0] for word in directorate.title().split())
        fig.scatter(
            x="pca1", y="pca2", source=sources_by_directorate[directorate],
            size=5, color=Category20c[len(directorate_names)][position],
            alpha=0.8, muted_alpha=0.1,
            legend=initials
        )

    legend = fig.legend
    legend.location = "bottom_right"
    legend.click_policy = "hide"
    legend.visible = True

    # The PCA axes have no interpretable units, so hide them
    for axis in (fig.xaxis, fig.yaxis):
        axis.visible = False

    show(fig)

In [None]:
generate_position_plot(position_plot_na_df, './report/hr_translation_II_na.html', "Mapping Non-Admin Positions")

In [None]:
generate_position_plot(position_plot_admin_df, './report/hr_translation_II_admin.html', "Mapping Admin Positions")

## PCA

In [None]:
pca = sklearn.decomposition.PCA().fit(
    numpy.vstack(
        pandas.concat((position_plot_na_df.PositionVector, position_plot_admin_df.PositionVector)).values
    )
)

for position_plot_df in (position_plot_na_df, position_plot_admin_df):
    position_plot_df["PositionVectorReduced"] = [
        vector 
        for vector in pca.transform(
            numpy.vstack(position_plot_df.PositionVector.values)
        )
    ]

In [None]:
pca.explained_variance_ratio_[:30].sum()

## Data Word Scoring

In [None]:
data_words = [
    "data",
    "gathering",
#     "collection",
#     "acquisition",
#     "accumulation",
    "processing",
#     "transformation",
    "analysis",
#     "research",
#     "interpretation",
#     "understanding",
    "dissemination",
#     "formal communication"
#     "communication",
#     "distribution",
]

### Computing the Scores

In [None]:
data_word_vectors = {
    word: nlp(word.lower()).vector
    for word in data_words
}

In [None]:
score_na_df = position_na_df.copy()
score_admin_df = position_admin_df.copy()

In [None]:
for score_df in (score_na_df, score_admin_df):
    for word, word_vector in data_word_vectors.items():
        score_df[f"{word.title()}Score"] = sklearn.metrics.pairwise.cosine_similarity(
            numpy.vstack(score_df.PositionVector.values),
            numpy.array([word_vector])
        )
        #score_df[f"{word.title()}Score"] = numpy.linalg.norm(
        #    (numpy.vstack(score_df.PositionVector.values) - numpy.array([word_vector])), 
        #    axis=1
        #)
        #score_df[f"{word.title()}Score"] = score_df[f"{word.title()}Score"]/score_df[f"{word.title()}Score"].max()

In [None]:
data_score_example_df = score_na_df.query(
    "EmployeeName == @employee_name"
)[["Directorate", "Department", "PositionName"] + [f"{word.title()}Score" for word in data_words]]
data_score_example_df

In [None]:
data_score_example_df.to_html(
    "./report/data_score_df.html"
)

### Inspecting the Scores

In [None]:
def generate_score_source(score_df):
    """Build one bokeh ``ColumnDataSource`` per data word from ``score_df``.

    For every word in the module-level ``data_words`` the frame is sorted by
    that word's ``<Word>Score`` column, highest first. The source holds the
    rank (``count``), the score, and the directorate, department and position
    of each row in that order.

    Returns a dict mapping data word to its ``ColumnDataSource``.
    """
    data_score_sources = {}
    for word in data_words:
        score_column = f"{word.title()}Score"
        # Sort once per word; every column below reads from the same ordering
        ranked_df = score_df.sort_values(by=score_column, ascending=False)

        data_score_sources[word] = ColumnDataSource(data={
            "count": range(ranked_df.shape[0]),
            "score": ranked_df[score_column],
            "directorate": ranked_df.Directorate.str.title(),
            "department": ranked_df.Department.str.title(),
            "position": ranked_df.PositionName,
        })

    return data_score_sources

In [None]:
data_word_colour_dict = {
    "data": "blue",
    "gathering": "orange",
    "processing": "green",
    "analysis": "red",
    "dissemination": "purple",
    #"formal communication": "purple",
}
# data_word_colour_dict = {
#     word: Category20[len(data_words)][i]
#     for i,word in enumerate(data_words)
# }

In [None]:
def generate_score_plot(score_df, output_path, plot_title, green_stop, red_start):
    """Write and show a line plot of the data-word scores with three shaded bands.

    One line per word in ``data_words`` is drawn from the sources built by
    ``generate_score_source``. The x range is shaded green from 0 to
    ``green_stop``, grey from ``green_stop`` to ``red_start`` and red from
    ``red_start`` onwards. The figure is saved to ``output_path``.
    """
    data_score_sources = generate_score_source(score_df)
    output_file(output_path)

    hover_fields = [
        ("Directorate", "@directorate"),
        ("Department", "@department"),
        ("Position", "@position"),
    ]
    fig = figure(
        width=600, height=600, title=plot_title,
        tooltips=hover_fields,
        background_fill_color="#fafafa"
    )

    # One score curve per data word
    for word in data_words:
        fig.line(
            x="count", y="score", source=data_score_sources[word],
            legend=word.title(), line_color=data_word_colour_dict[word], line_width=2
        )

    # High, middle and low intensity bands, in that order; the low band is
    # open-ended on the right.
    intensity_bands = (
        {"left": 0, "right": green_stop, "fill_color": 'green'},
        {"left": green_stop, "right": red_start, "fill_color": 'grey'},
        {"left": red_start, "fill_color": 'red'},
    )
    for band in intensity_bands:
        fig.add_layout(BoxAnnotation(fill_alpha=0.2, **band))

    legend = fig.legend
    legend.location = "bottom_center"
    legend.click_policy = "hide"
    legend.visible = True

    fig.xaxis.axis_label = 'Cumulative number of employees'
    fig.yaxis.axis_label = 'Word Similarity'

    show(fig)

In [None]:
generate_score_plot(score_na_df, './report/na_data_scoring.html', "Distribution of Non-Administrative Data Word Scoring", 500, 10000)

In [None]:
generate_score_plot(score_admin_df, './report/admin_data_scoring.html', "Distribution of Administrative Data Word Scoring", 100, 4900)

### Characterising City Employee Segments

In [None]:
data_score_df = score_na_df.sort_values(by="DataScore", ascending=False).head(500)

In [None]:
directorate_counts = data_score_df.groupby(['Directorate']).EmployeeNumber.nunique().sort_values(ascending=False)

In [None]:
department_counts = data_score_df.groupby(['Department']).EmployeeNumber.nunique().sort_values(ascending=False)

In [None]:
tlevel_counts = data_score_df.groupby(['TLevel']).EmployeeNumber.nunique()

In [None]:
position_counts = data_score_df.groupby(['PositionName']).EmployeeNumber.nunique().sort_values(ascending=False)

In [None]:
output_file("report/hr_top_data_summary.html", mode="cdn")

TOOLS = ["save"]

# Directorate Plot
directorate_figure = figure(width=400, plot_height=600, title="Directorate Breakdown", x_range=directorate_counts.index.values, tools=TOOLS)
directorate_figure.vbar(
    directorate_counts.index.values, top=directorate_counts.values, 
    width=0.9, color=Category20[len(directorate_counts.values)]
)
directorate_figure.xaxis.major_label_orientation = "vertical"
directorate_figure.xaxis.axis_label = "Directorate"

# Department Plot
depts = 15
department_figure = figure(width=400, plot_height=600, title=f"Department Breakdown (top {depts})", tools=TOOLS, 
                           x_range=department_counts.index.values[:depts])
department_figure.vbar(
    department_counts.index.values[:depts], top=department_counts.values[:depts], 
    width=0.9, color=Category20[depts]
)
department_figure.xaxis.major_label_orientation = "vertical"
department_figure.xaxis.axis_label = "Department"

# T-Level Plot
tlevel_figure = figure(width=400, plot_height=600, title=f"T-Level Breakdown", x_range=list(map(str,tlevel_counts.index.values[:])), tools=TOOLS)
tlevel_figure.vbar(
    list(map(str,tlevel_counts.index.values[:])), top=tlevel_counts.values[:], 
    width=0.9, color=Category20[len(tlevel_counts)]
)
tlevel_figure.xaxis.major_label_orientation = "vertical"
tlevel_figure.xaxis.axis_label = "T-Level"

# Position Plot
positions = 15
positions_figure = figure(
    width=400, plot_height=600, 
    title=f"Position Breakdown (top {positions})", 
    x_range=position_counts.index.values[:positions], 
    tools=TOOLS,
)

# The palette size must follow `positions` (it previously reused `depts` from
# the department plot, which only worked because both happened to be 15).
positions_figure.vbar(
    position_counts.index.values[:positions], top=position_counts.values[:positions], 
    width=0.9, color=Category20[positions]
)
positions_figure.xaxis.major_label_orientation = "vertical"
positions_figure.xaxis.axis_label = "Position Names"

# show the results
show(
    gridplot([
        [tlevel_figure, positions_figure],
        [directorate_figure, department_figure],
    ])
)

## Validating the Scores

In [None]:
# Flag employees whose name appears in the data-strategy list. Both sides are
# lower-cased so the match does not depend on how the names were capitalised
# in either source.
score_na_df["DataStrategyAffliliated"] = score_na_df.EmployeeName.str.lower().isin(
    data_strategy_affliated.Name.str.lower()
)

In [None]:
score_na_df.sort_values(by="DataScore", ascending=False).head(5)

In [None]:
data_score_comparison_source = ColumnDataSource({
    "data_words": list(map(lambda x: x.title(), data_words)),
    "not_affliated": [
        score_na_df[~score_na_df.DataStrategyAffliliated][f"{word.title()}Score"].median()
        for word in data_words
    ],
    "affliated": [
        score_na_df[score_na_df.DataStrategyAffliliated][f"{word.title()}Score"].median()
        for word in data_words     
    ]
})

In [None]:
output_file('./report/data_scoring_comparison_test.html')

fig = figure(
    x_range=list(map(lambda x: x.title(), data_words)),
    y_range=(0, 0.6),
    width=800, height=600, title="Median Data Word Score by Data Strategy Affliation",
    background_fill_color="#fafafa"
)

fig.vbar(
    x=dodge("data_words", -0.1, range=fig.x_range), top='affliated', source=data_score_comparison_source,
    width=0.2, color='blue', legend=value("Affliated",)
)

fig.vbar(
    x=dodge("data_words", 0.1, range=fig.x_range), top='not_affliated', source=data_score_comparison_source,
    width=0.2, color='grey', legend=value("Not Affliated",)
)

fig.yaxis.axis_label = 'Median Data Word Score'
fig.xaxis.axis_label = 'Data Word'

show(fig)

## Data Mining

In [None]:
score_na_df.columns

In [None]:
pca = sklearn.decomposition.PCA(n_components=2).fit(
    score_na_df[[
        f"{word.title()}Score"
        for word in data_words
    ]].values
)

In [None]:
data_score_vector_reduced = pca.transform(
    score_na_df[[
        f"{word.title()}Score"
        for word in data_words
    ]].values
)
score_na_df["PCA1"] = data_score_vector_reduced[:,0]
score_na_df["PCA2"] = data_score_vector_reduced[:,1]

In [None]:
# Down-sample the unaffiliated positions for plotting. The fixed random_state
# keeps the plotted points the same on every run.
non_ds_sample = score_na_df[~score_na_df.DataStrategyAffliliated].sample(2000, random_state=42)
ds_sample = score_na_df[score_na_df.DataStrategyAffliliated]

In [None]:
pca_source = ColumnDataSource({
    "pca1": non_ds_sample.PCA1,
    "pca2": non_ds_sample.PCA2,
    "directorate": non_ds_sample.Directorate,
    "department": non_ds_sample.Department,
    "position": non_ds_sample.PositionName,
})
pca_source_ds = ColumnDataSource({
    "pca1": ds_sample.PCA1,
    "pca2": ds_sample.PCA2,
    "directorate": ds_sample.Directorate,
    "department": ds_sample.Department,
    "position": ds_sample.PositionName,
})

In [None]:
vector_loadings = pca.components_.T * numpy.sqrt(pca.explained_variance_)

In [None]:
pca.explained_variance_ratio_

In [None]:
output_file('./report/data_scoring_pca_test.html')

fig = figure(
    width=600, height=600, title="Data Scoring Component Analysis",
    tooltips=[
        ("Directorate", "@directorate"),
        ("Department", "@department"),
        ("Position", "@position"),
    ],
    x_range=(-0.6, 0.6), y_range=(-0.5, 0.5),
    x_axis_label="PCA Dimension 1", y_axis_label="PCA Dimension 2",
    background_fill_color="#fafafa"
)

# Secondary axes for the loading vectors, which live on a much smaller scale
# than the projected positions.
fig.extra_y_ranges = {"vector_y_axis": Range1d(start=-0.05, end=0.05)}
fig.extra_x_ranges = {"vector_x_axis": Range1d(start=-0.06, end=0.06)}

# The right-hand axis carries the second component, the top axis the first
fig.add_layout(LinearAxis(y_range_name="vector_y_axis", axis_label='Component Dimension 2'), 
               'right')
fig.add_layout(LinearAxis(x_range_name="vector_x_axis", axis_label='Component Dimension 1'), 
               'above')

#City Positions
fig.scatter(
    x="pca1", y="pca2", source=pca_source, 
    size=5, color='grey', alpha=0.8,
    legend="Not DS Affliated"
)
fig.scatter(
    x="pca1", y="pca2", source=pca_source_ds, 
    size=5, color='blue', alpha=0.8,
    legend="DS Affliated"
)

for word, vector_position in zip(data_words, vector_loadings):
    fig.line(
        x=(0, vector_position[0]), y=(0, vector_position[1]),
        x_range_name="vector_x_axis", y_range_name="vector_y_axis",
        line_width=2, color=data_word_colour_dict[word], legend=word.title()
    )

fig.legend.location = "top_right"
fig.legend.click_policy = "hide"
fig.legend.visible = True

show(fig)