##### At the end of this **Advanced EDA** chapter, I created a tool for custom grouping based on **selected groups of skills**.

##### You can adjust:

- the number of people in these groups (between 2 and 10),
- the minimum number of 4.0 ("Expert") responses that each created group must contain,
- the minimum level of interest in selected skills.
  
You can also manually select particular skills or whole skill categories, and choose the clustering method used to form the groups.

In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.colors as mcolors
from matplotlib.patches import Patch
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

# Read the cleaned dataset used by the grouping tool below
data = pd.read_csv(
    r'C:\Users\rejen\OneDrive\Documents\GitHub\ProjektAnalitiks\data\03_Processed\cleaned_data.csv'
)

# Skill categories shown in the tool; every entry is the original column
# name in `data`, so the strings must match the CSV header exactly.
categories_updated = {
    'Cloud & Databases': [
        'Cloud: AWS',
        'Cloud: Azure',
        'Cloud: GPC',
        'Databases: NoSQL',
        'Databases: SQL',
    ],
    'Business Intelligence (BI)': [
        'BI: PowerBI',
        'BI: Tableau',
    ],
    'Programming': [
        'Programming: R',
        'Programming: Python',
        'Programming: Bash',
        'CLI: (np. Bash, PowerShell, CMD)',
        'Version Control: GIT',
        'Containers: Docker',
        'Front End: (HTML, JavaScript, CSS)',
    ],
    'Data Science': [
        'Area: Time Series',
        'Area: Classical ML (Clustering, Regression, Classification)',
        'Area: NLP',
        'Area: Computer Vision',
    ],
    'Industry knowledge': [
        'FinTech',
        'HealthTech',
        'FashionTech',
        'E-commerce',
        'SportTech',
        'Non-profit',
        'PropTech (nieruchomości)',
        'Cybersecurity',
        'HR',
    ],
    'Soft skills': [
        'Project Management',
        'Promocja w Social Media',
        'Ux/Ui',
        'Projektowanie graficzne',
        'Nawiązywanie Relacji z Biznesem',
        'Nawiązywanie Relacji z naukowcami',
        'Pozyskiwanie finansowania',
        'Współpraca z administracją UEW',
    ],
}

def _build_clusterer(clustering_option, min_group_size, n_clusters, metric, linkage, distance_threshold):
    """Return an unfitted scikit-learn estimator for the requested clustering option."""
    if clustering_option == 'Agglomerative':
        # AgglomerativeClustering accepts exactly one of n_clusters /
        # distance_threshold; an explicit cluster count takes precedence.
        if n_clusters is not None:
            distance_threshold = None
        return AgglomerativeClustering(n_clusters=n_clusters, metric=metric, linkage=linkage,
                                       distance_threshold=distance_threshold)
    if clustering_option == 'K-means':
        return KMeans(n_clusters=n_clusters)
    if clustering_option == 'DB-SCAN':
        return DBSCAN(eps=0.5, min_samples=min_group_size)
    raise ValueError(f"Unknown clustering option: {clustering_option!r}")


def _split_oversized_group(group_data, columns, max_group_size, min_4_0_answers):
    """Cut chunks of ``max_group_size`` rows out of ``group_data``.

    Rows with at least ``min_4_0_answers`` answers equal to 4.0 in ``columns``
    are placed first in every chunk; the chunk is then topped up with other
    rows. Returns ``(chunks, remainder)`` where ``remainder`` holds the rows
    that were not assigned to any chunk, in their original order.
    """
    chunks = []
    while len(group_data) > max_group_size:
        high_competency = (group_data[columns] == 4.0).sum(axis=1)
        # Nobody left has a 4.0 answer, so no further chunk can satisfy a
        # positive minimum; leave the rest for the caller to judge.
        if min_4_0_answers > 0 and high_competency.sum() == 0:
            break

        is_expert = (high_competency >= min_4_0_answers).to_numpy()
        expert_positions = np.flatnonzero(is_expert)
        other_positions = np.flatnonzero(~is_expert)

        if len(expert_positions) >= max_group_size:
            chosen = expert_positions[:max_group_size]
        else:
            missing = max_group_size - len(expert_positions)
            chosen = np.concatenate([expert_positions, other_positions[:missing]])

        chunks.append(group_data.iloc[chosen])

        # Remove exactly the rows that were just assigned, so nobody is
        # duplicated across groups or silently dropped.
        keep = np.ones(len(group_data), dtype=bool)
        keep[chosen] = False
        group_data = group_data.iloc[keep]

    return chunks, group_data


def _plot_group_heatmap(heatmap_values, group_ids, group_counter):
    """Draw one heatmap (skills as rows, people as columns) for a group."""
    heatmap_data_renamed = heatmap_values.set_index(group_ids)
    group_id_list = ', '.join(map(str, group_ids))
    palette = sns.color_palette("YlGnBu", 3)

    plt.figure(figsize=(14, 8))
    sns.heatmap(heatmap_data_renamed.T, cmap='YlGnBu', cbar=False, annot=True, fmt=".1f", linewidths=.5, vmin=2, vmax=4)
    plt.title(f'ID: {group_id_list} - group {group_counter} - top competencies', fontsize=20)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
    plt.xlabel('ID', fontsize=15)
    plt.ylabel('Competencies', fontsize=15)
    legend_patches_custom = [
        Patch(color=palette[0], label='2.0 - Interested'),
        Patch(color=palette[1], label='3.0 - Competent'),
        Patch(color=palette[2], label='4.0 - Expert')
    ]
    plt.legend(handles=legend_patches_custom, loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
    plt.show()


def process_category(category_name, columns, min_group_size, max_group_size, min_4_0_answers, min_threshold, clustering_option, max_generated_groups, n_clusters=None, metric='euclidean', linkage='average', distance_threshold=1.5):
    """Cluster the rows of ``data`` on the selected skills and plot one heatmap per group.

    Args:
        category_name: Label of the selection; currently not used by the function.
        columns: Column names of ``data`` to cluster on.
        min_group_size: Smallest cluster / leftover group that is kept
            (also used as ``min_samples`` for DBSCAN).
        max_group_size: Larger clusters are split into chunks of this size.
        min_4_0_answers: Minimum number of 4.0 answers a group must contain
            to be plotted; also the per-person count used to prioritise
            people when splitting.
        min_threshold: Answers below this value are blanked out before
            clustering and in the heatmaps.
        clustering_option: ``'Agglomerative'``, ``'K-means'`` or ``'DB-SCAN'``.
        max_generated_groups: Maximum number of heatmaps to draw.
        n_clusters: Number of clusters (K-means, or Agglomerative when given).
        metric, linkage, distance_threshold: Passed to AgglomerativeClustering.

    Returns:
        None. Results and errors are rendered with ``display`` / ``plt.show``.

    Side effects:
        Writes the cluster labels to ``data['Group']``.
    """
    from html import escape

    try:
        if max_group_size < 1:
            raise ValueError("max_group_size must be at least 1")

        columns = list(columns)
        competencies_data = data[columns]

        # Blank out answers below the threshold and drop skills nobody reaches
        filtered_data = competencies_data.where(competencies_data >= min_threshold).dropna(how='all', axis=1)
        if filtered_data.shape[1] == 0:
            display(widgets.HTML("<p style='color:red;'><b>No selected skill has answers at or above the minimum threshold.</b></p>"))
            return

        # Normalize the data
        competencies_scaled = StandardScaler().fit_transform(filtered_data.fillna(0))

        # Calculate similarity matrix
        similarity_matrix = cosine_similarity(competencies_scaled)

        # Perform clustering
        clustering = _build_clusterer(clustering_option, min_group_size, n_clusters, metric, linkage, distance_threshold)
        if clustering_option == 'K-means':
            features = competencies_scaled
        elif clustering_option == 'Agglomerative' and metric == 'precomputed':
            # 'precomputed' expects distances, not similarities
            features = np.clip(1.0 - similarity_matrix, 0.0, None)
            np.fill_diagonal(features, 0.0)
        else:
            features = similarity_matrix
        clustering.fit(features)

        # Assign groups based on clustering (kept on the shared frame, as before)
        data['Group'] = clustering.labels_

        # Keep clusters with at least min_group_size people; label -1 is
        # DBSCAN noise, not a group.
        group_sizes = data['Group'].value_counts()
        valid_groups = [label for label, size in group_sizes.items()
                        if size >= min_group_size and label != -1]

        # Split larger groups into smaller ones where necessary
        new_groups = []
        for group in valid_groups:
            members = data[data['Group'] == group]
            chunks, remainder = _split_oversized_group(members, columns, max_group_size, min_4_0_answers)
            new_groups.extend(chunks)
            if len(remainder) >= min_group_size and (remainder[columns] == 4.0).sum().sum() >= min_4_0_answers:
                new_groups.append(remainder)

        if not new_groups:
            display(widgets.HTML("<p style='color:red;'><b>No groups meet the specified criteria.</b></p>"))
            return

        # Plot heatmaps for each new group, showing only the best skills
        group_counter = 1
        for group_data in new_groups:
            if group_counter > max_generated_groups:
                break
            group_competencies = group_data[columns]

            # Keep only answers at or above the threshold, then drop people
            # and skills that have nothing left to show.
            masked = group_competencies.where(group_competencies >= min_threshold)
            has_answer = masked.notna().any(axis=1).to_numpy()
            best_competencies = masked.loc[has_answer].dropna(axis=1, how='all')

            if (best_competencies == 4.0).sum().sum() < min_4_0_answers:
                continue
            if best_competencies.empty:
                continue

            # Take the IDs from the very same rows that are plotted so the
            # labels cannot drift from the values.
            group_ids = group_data.loc[has_answer, 'ID']
            _plot_group_heatmap(best_competencies, group_ids, group_counter)
            group_counter += 1
    except Exception as e:
        # UI boundary: show the problem in the output area instead of a
        # traceback; escape it because messages may contain '<' or '&'.
        display(widgets.HTML(f"<p style='color:red;'><b>Error: {escape(str(e))}</b></p>"))

# Function to create "Select all" checkboxes and dropdowns for category selection
def choose_skill_categories():
    """Build and display the interactive skill grouping tool.

    The tool shows, for every entry of ``categories_updated``, a "Select all"
    checkbox and a multi-select of its skills, plus sliders and dropdowns for
    the grouping parameters. "Generate heatmaps" forwards the current
    settings to ``process_category``; "Clear all selections" restores every
    control to the value it had when the tool was built.

    Returns:
        None. The widgets are rendered with ``display``.
    """
    # Embed the CSS for hover effects and other styles
    display(HTML("""
    <style>
        .hover-bold:hover { font-weight: bold; }
        .title { font-size: 24px; font-weight: bold; margin: 10px 0px 20px 0px; }
        .description { font-size: 16px; margin: 0px 0px 20px 0px; }
        .section { border: solid 1px #ccc; margin: 10px; padding: 10px; background-color: #ADDFFF; }
        .slider { width: 80%; margin: 10px; }
        .button { width: 200px; margin: 20px 0px 10px 0px; background-color: #4CAF50; color: white; }
        .button:hover { background-color: #45a049; }
        .clear-button { width: 200px; margin: 20px 0px 10px 10px; background-color: #F94449; color: white; }
        .clear-button:hover { background-color: #d23c3c; }
        .output { border: solid 1px #ccc; margin: 20px; padding: 10px; background-color: #ADDFFF; }
        .checkbox-label { cursor: pointer; }
        .checkbox-label:hover { font-weight: bold; }

        /* Custom styles for sliders */
        .noUi-handle {
            background: #4CAF50 !important;
            border: none !important;
        }
    </style>
    """))

    title = widgets.HTML(value="<div class='title'>Skill grouping tool 🔧</div>")
    description = widgets.HTML(value="<div class='description'>Select desired skills and adjust the parameters for the grouping process:</div>")

    # Create "Select all" checkboxes and dropdowns for skill selection
    select_all_checkboxes = {category: widgets.Checkbox(value=False, description='Select all',
                                                        style={'description_width': 'initial'},
                                                        layout=widgets.Layout(margin='5px 0px'))
                             for category in categories_updated}

    category_dropdowns = {category: widgets.SelectMultiple(options=skills, description='',
                                                           style={'description_width': 'initial'},
                                                           layout=widgets.Layout(width='90%', height='50px', overflow='auto'))
                          for category, skills in categories_updated.items()}

    # Function to handle "Select all" checkbox
    def on_select_all_change(change, category):
        dropdown = category_dropdowns[category]
        all_skills = categories_updated[category]
        if change['new']:
            dropdown.value = tuple(all_skills)
        elif set(dropdown.value) == set(all_skills):
            # Only clear when the box was unticked while everything was
            # selected. If it was unticked by on_skill_select because the
            # user narrowed the selection, that selection must survive.
            dropdown.value = ()

    # Function to handle individual skill selection
    def on_skill_select(change, category):
        selected_skills = category_dropdowns[category].value
        select_all_checkboxes[category].value = set(selected_skills) == set(categories_updated[category])

    # Attach the handler to the "Select all" checkboxes and skill selection
    for category in categories_updated:
        select_all_checkboxes[category].observe(lambda change, category=category: on_select_all_change(change, category), names='value')
        category_dropdowns[category].observe(lambda change, category=category: on_skill_select(change, category), names='value')

    # Create "Technical skills" and "Other skills" checkboxes
    technical_skills_checkbox = widgets.Checkbox(value=False, description='Technical skills',
                                                 style={'description_width': 'initial'},
                                                 layout=widgets.Layout(margin='5px 10px'))
    other_skills_checkbox = widgets.Checkbox(value=False, description='Other skills',
                                             style={'description_width': 'initial'},
                                             layout=widgets.Layout(margin='5px 10px'))

    # Function to handle "Technical skills" checkbox
    def on_technical_skills_change(change):
        categories = ['Cloud & Databases', 'Business Intelligence (BI)', 'Programming', 'Data Science']
        for category in categories:
            select_all_checkboxes[category].value = change['new']

    # Function to handle "Other skills" checkbox
    def on_other_skills_change(change):
        categories = ['Industry knowledge', 'Soft skills']
        for category in categories:
            select_all_checkboxes[category].value = change['new']

    # Attach the handler to the "Technical skills" and "Other skills" checkboxes
    technical_skills_checkbox.observe(on_technical_skills_change, names='value')
    other_skills_checkbox.observe(on_other_skills_change, names='value')

    # Create a VBox with "Select all" checkboxes and dropdowns
    checkbox_dropdowns = [widgets.VBox([
                            widgets.HTML(value=f"<b>{category}</b>"),
                            widgets.HBox([select_all_checkboxes[category]]),
                            category_dropdowns[category]
                          ])
                          for category in categories_updated]

    # NOTE: widgets.Layout has no background/text colour traits, so the
    # former background_color= / color= keyword arguments were ignored and
    # have been removed from every Layout below.
    checkboxes_dropdowns_vbox = widgets.VBox(checkbox_dropdowns,
                                             layout=widgets.Layout(padding='10px', border='solid 1px #ccc', margin='10px', width='50%'))

    min_group_slider = widgets.IntSlider(value=2, min=2, max=10, step=1, description='Min. group size:',
                                         style={'description_width': 'initial'}, layout=widgets.Layout(width='80%', margin='10px'))
    max_group_slider = widgets.IntSlider(value=4, min=2, max=10, step=1, description='Max. group size:',
                                         style={'description_width': 'initial'}, layout=widgets.Layout(width='80%', margin='10px'))
    min_4_0_slider = widgets.IntSlider(value=2, min=0, max=25, step=1, description='Min. 4.0 answers:',
                                       style={'description_width': 'initial'}, layout=widgets.Layout(width='80%', margin='10px'))
    # (label, value) pairs are the supported option format for selection widgets
    min_threshold_slider = widgets.SelectionSlider(options=[('Not aware', 0.0), ('Not interested', 1.0), ('Interested', 2.0)],
                                                   value=1.0, description='Min. threshold:',
                                                   style={'description_width': 'initial'}, layout=widgets.Layout(width='90%', margin='10px'))

    clustering_radio_label = widgets.HTML(value="<b>Clustering:</b>")
    clustering_radio = widgets.RadioButtons(
        options=['Agglomerative', 'K-means', 'DB-SCAN'],
        disabled=False
    )

    kmeans_clusters_slider = widgets.IntSlider(value=4, min=1, max=10, step=1, description='Number of clusters (k-means):',
                                               style={'description_width': 'initial'}, layout=widgets.Layout(width='80%', margin='10px'))
    kmeans_clusters_slider.layout.display = 'none'  # Initially hide the slider

    metric_dropdown = widgets.Dropdown(
        options=['euclidean', 'l1', 'l2', 'manhattan', 'cosine', 'precomputed'],
        value='euclidean',
        description='Metric:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='80%', margin='10px')
    )

    linkage_dropdown = widgets.Dropdown(
        options=['ward', 'complete', 'average', 'single'],
        value='average',
        description='Linkage:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='80%', margin='10px')
    )

    distance_threshold_slider = widgets.FloatSlider(value=1.5, min=0.0, max=5.0, step=0.25, description='Distance threshold:',
                                                    style={'description_width': 'initial'}, layout=widgets.Layout(width='80%', margin='10px'))

    def on_clustering_option_change(change):
        # Show only the controls that apply to the chosen algorithm
        show_kmeans = change['new'] == 'K-means'
        show_agglomerative = change['new'] == 'Agglomerative'
        kmeans_clusters_slider.layout.display = 'block' if show_kmeans else 'none'
        for control in (metric_dropdown, linkage_dropdown, distance_threshold_slider):
            control.layout.display = 'block' if show_agglomerative else 'none'

    clustering_radio.observe(on_clustering_option_change, names='value')

    max_generated_groups_slider = widgets.IntSlider(value=3, min=1, max=10, step=1, description='Max. generated groups:',
                                                    style={'description_width': 'initial'}, layout=widgets.Layout(width='80%', margin='10px'))

    # Remember the initial value of every parameter control so that "Clear"
    # restores exactly what the tool started with.
    initial_values = [(control, control.value) for control in (
        min_group_slider, max_group_slider, min_4_0_slider, min_threshold_slider,
        clustering_radio, kmeans_clusters_slider, metric_dropdown, linkage_dropdown,
        distance_threshold_slider, max_generated_groups_slider,
    )]

    sliders = widgets.VBox([
        min_group_slider, max_group_slider, min_4_0_slider, min_threshold_slider, max_generated_groups_slider,
        clustering_radio_label, clustering_radio, kmeans_clusters_slider, metric_dropdown, linkage_dropdown, distance_threshold_slider
    ], layout=widgets.Layout(margin='20px', padding='10px', border='solid 1px #ccc', width='50%'))

    # Button colours are set through the widget style, which is the trait
    # ipywidgets actually applies.
    button = widgets.Button(description="Generate heatmaps", button_style='success',
                            style={'button_color': '#4CAF50'},
                            layout=widgets.Layout(width='200px', margin='20px 0px 10px 0px'))

    clear_button = widgets.Button(description="Clear all selections",
                                  style={'button_color': '#F94449'},
                                  layout=widgets.Layout(width='200px', margin='20px 0px 10px 10px'))

    output = widgets.Output(layout=widgets.Layout(border='solid 1px #ccc', margin='20px', padding='10px'))

    def on_button_click(b):
        # Flatten the per-category selections, keeping category order
        selected_skills = [skill
                           for category in categories_updated
                           for skill in category_dropdowns[category].value]
        min_group_size = min_group_slider.value
        max_group_size = max_group_slider.value
        min_4_0_answers = min_4_0_slider.value
        min_threshold = min_threshold_slider.value
        clustering_option = clustering_radio.value
        k_clusters = kmeans_clusters_slider.value if clustering_option == 'K-means' else None
        metric = metric_dropdown.value
        linkage = linkage_dropdown.value
        distance_threshold = distance_threshold_slider.value
        max_generated_groups = max_generated_groups_slider.value
        with output:
            clear_output(wait=True)
            if selected_skills:
                process_category('Custom selection', selected_skills, min_group_size, max_group_size, min_4_0_answers, min_threshold, clustering_option, max_generated_groups, n_clusters=k_clusters, metric=metric, linkage=linkage, distance_threshold=distance_threshold)
            else:
                display(widgets.HTML("<p style='color:red;'><b>Please select at least one category.</b></p>"))

    def on_clear_button_click(b):
        # The two master checkboxes are part of the selection as well
        technical_skills_checkbox.value = False
        other_skills_checkbox.value = False
        for category in categories_updated:
            select_all_checkboxes[category].value = False
            category_dropdowns[category].value = ()
        for control, initial in initial_values:
            control.value = initial
        with output:
            clear_output(wait=True)
            display(widgets.HTML("<p style='color:green;'><b>Selections cleared.</b></p>"))

    button.on_click(on_button_click)
    clear_button.on_click(on_clear_button_click)

    # Event handlers for sliders: keep min. group size <= max. group size
    def on_min_group_slider_change(change):
        if change['new'] > max_group_slider.value:
            max_group_slider.value = change['new']

    def on_max_group_slider_change(change):
        if change['new'] < min_group_slider.value:
            min_group_slider.value = change['new']

    min_group_slider.observe(on_min_group_slider_change, names='value')
    max_group_slider.observe(on_max_group_slider_change, names='value')

    display(widgets.VBox([title, description, widgets.HBox([checkboxes_dropdowns_vbox, sliders]), widgets.HBox([technical_skills_checkbox, other_skills_checkbox]), widgets.HBox([button, clear_button]), output],
                         layout=widgets.Layout(margin='20px', padding='10px', border='solid 1px #ccc')))

# Build and display the grouping tool when this cell is run
choose_skill_categories()


VBox(children=(HTML(value="<div class='title'>Skill grouping tool 🔧</div>"), HTML(value="<div class='descripti…