In [1]:
import pandas as pd
import os
import jsonpickle

from typing import Tuple, Dict, Union, List
import pandas as pd
import numpy as np
import pickle
from sklearn.manifold import TSNE
import plotly.express as px
import plotly
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
import implicit
from implicit.nearest_neighbours import bm25_weight, tfidf_weight
from scipy.spatial.distance import cdist

import umap
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.sparse.linalg import svds
from sklearn.decomposition import PCA


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
RANDOM_STATE = 42

In [4]:
def load(config_path : str) -> object:
    """load config from file
    exsabple of using: 
        self.config : ParserConfig = config.load(config_path) or ParserConfig()
        """

    jsonpickle.set_preferred_backend('json')
    jsonpickle.set_encoder_options('json', ensure_ascii=False)

    if config_path is not None and os.path.isfile(config_path):
        with open(config_path, 'r', encoding='utf-8') as f:
            return jsonpickle.decode(f.read())
    else:
        return None


In [5]:
# load reference pairs
skill_pairs : List[List[str]] = load("../cnf/skill_pairs.json")

# remove duplicates
print('all pairs count =', len(skill_pairs))
skill_pairs = [x for x in skill_pairs if [x[1], x[0]] not in skill_pairs]
print('unique pairs count =', len(skill_pairs))

all pairs count = 215
unique pairs count = 165


In [6]:
## Load data from APP DATA folder

with open('../data/features/matrix.pkl', 'rb') as f:
#with open('../data/features/matrix_name_tfidf.pkl', 'rb') as f:
#with open('../data/features/matrix_description_name_tfidf.pkl', 'rb') as f:
    matrix = pickle.load(f)

with open('../data/features/prof_index_to_prof_name.pkl', 'rb') as f:
    prof_index_to_prof_name = pickle.load(f)

# with open('../data/features/quety_to_prof_index.pkl', 'rb') as f:
#     quety_to_prof_index = pickle.load(f)

with open('../data/features/skill_index_to_corrected.pkl', 'rb') as f:
    skill_index_to_corrected = pickle.load(f)
    
with open('../data/features/skill_original_to_index.pkl', 'rb') as f:
    skill_original_to_index = pickle.load(f)
    
skill_df = pd.read_csv('../data/features/skills.csv')
prof_df = pd.read_csv('../data/features/prof.csv')

In [7]:
def prepare_plot_df(skill_df: pd.DataFrame,
                    matrix: np.array,
                    skill_index_to_corrected: Dict[int, str],
                    prof_index_to_corrected: Dict[int, str], 
                    top_n_skill_per_profession: int = 200, 
                    salary: Union[Tuple[float, float], None] = None,
                    norm_type: str = 'none',
                    factor_alg_type: str = 'none',
                    perplexity: int = 30,
                    early_exaggeration=12,
                    learning_rate=200,
                    dim: int = 5,
                    use_tsne: bool = True
                    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Prepare datafame for plotting
    
        Parameters
        ----------
        skill_df: pd.DataFrame
            Data frame of skills
        matrix: np.array
            Skill to Profession matrix (skill in rows, professions in columns)
        skill_index_to_corrected: Dict[int, str]
            Dictionary from index to skill name
        prof_index_to_corrected: Dict[int, str]
            Dictionary from index to profession name
        top_n_skill_per_profession: int
            maximum skills for each profession
        salary: Tuple[float, float]:
            filter by salary, from and to, or None
            !!! do not set because too low vacancies whith salary
        norm_type: str
            How normalize skill-prof matrix ('none', 'skill', 'prof', 'bm', 'tfidf')
        factor_alg_type: str
            Type of algorithm for skill map ('tsne', 'als', 'svd', 'pca', 'sim_euclid', 'sim_cos')

        Returns
        -------
        Tuple[pd.DataFrame, pd.DataFrame] : 
            - DataFrame for skill map plot
            - DataFrame for profession map plot
    
    
    """

    def apply_filter(df, top_n_skill_per_profession, salary):
        if salary is not None:
            df = df[(df.salary_q75 > salary[0]) & (df.salary_q25 <= salary[1])]
        if top_n_skill_per_profession is not None:
            df = df.head(top_n_skill_per_profession)
        return df

    # skill_filter_indexes - skills for plot
    skill_filter_indexes = []
    for p in prof_index_to_corrected.values():
        skills_per_profession = skill_df.sort_values(p, ascending=False)
        skills_per_profession = apply_filter(skills_per_profession, top_n_skill_per_profession, salary)

        skill_filter_indexes = skill_filter_indexes + \
            [x for x in skills_per_profession.skill_id if x not in skill_filter_indexes]

    # Normalization
    m = matrix[skill_filter_indexes, :].T

    if norm_type == 'skill':
        # norm for earch skill
        m = m / np.sum(m, axis=0, keepdims=True)
    elif norm_type == 'prof':
        # norm for each prof
        m = m / np.sum(m, axis=1, keepdims=True)
    elif norm_type == 'bm':
        m = np.asarray(bm25_weight(m).todense())
    elif norm_type == 'tfidf':
        m = np.asarray(tfidf_weight(m).todense())
    elif norm_type != 'none':
        raise ValueError("norm_type parametr must be ('none', 'skill', 'prof')")

    # als is needed for professions because T-NSE doesn't work with few objects
    csr = csr_matrix(m)
    model = implicit.als.AlternatingLeastSquares(factors=2, regularization=0.0, 
        iterations=20, alpha=10, random_state=RANDOM_STATE)
    model.fit(csr, show_progress=False)
    profs_xy = model.user_factors

    use_tsne = False

    if factor_alg_type == 'svd':
        skills_xy, _, _ = svds(csr.T, k=dim, random_state=RANDOM_STATE)
    elif factor_alg_type == 'pca':
        pca = PCA(n_components=dim)
        skills_xy = pca.fit_transform(m.T)
    elif factor_alg_type == 'tsne':
        #skills_xy = TSNE(n_components=2, random_state=RANDOM_STATE).fit_transform(m.T)
        skills_xy = m.T.copy()
        use_tsne = True
    elif factor_alg_type == 'als':
        skills_xy = model.item_factors
        use_tsne = True
    elif factor_alg_type == 'sim_euclid':
        skills_xy = euclidean_distances(m.T)
        use_tsne = True
    elif factor_alg_type == 'sim_cos':     
        skills_xy = cosine_similarity(m.T)
        use_tsne = True
    elif factor_alg_type == 'umap':
#         skills_xy = cosine_similarity(m.T)
        skills_xy = umap.UMAP(n_neighbors=10,
                              transform_seed=RANDOM_STATE,
                              random_state=RANDOM_STATE).fit_transform(m.T)
    else:
        raise ValueError("skills_xy parametr must be ('tsne', 'als', 'svd', 'pca', 'sim_euclid', 'sim_cos')")
    
    if use_tsne:
        skills_xy = TSNE(n_components=2, 
                         perplexity=perplexity, 
                         early_exaggeration=early_exaggeration,
                         learning_rate=learning_rate,
                         random_state=RANDOM_STATE).fit_transform(skills_xy)


    df = pd.DataFrame(skills_xy).rename(columns={0:'x',1:'y'})
    df['skill_id'] = list(skill_filter_indexes)
    df['Навык'] = df.skill_id.apply(lambda x: skill_index_to_corrected[x])

    # be ensure that for every profession has skill point 
    # (one skill may be in severall professions)
    df_plot_skill = None
    for p in prof_index_to_corrected.values():

        skills_per_profession = skill_df.sort_values(p, ascending=False)
        skills_per_profession = apply_filter(skills_per_profession, top_n_skill_per_profession, salary)
        ids = skills_per_profession.skill_id.to_numpy()

        df_for_prof = df[df.skill_id.isin(ids)]
        df_for_prof['Профессия'] = p

        min_f = skills_per_profession[p].min()
        max_f = skills_per_profession[p].max()
        if max_f - min_f < 1e-10:
            max_f += 1
        size_series = skills_per_profession[p].apply(lambda x: (x - min_f) / (max_f - min_f))
        size_dict = pd.Series(size_series.values, index=skills_per_profession.skill_id).to_dict()
        df_for_prof['size'] = df_for_prof.skill_id.apply(lambda x: 15 * size_dict[x] + 0.8)

        if df_plot_skill is None:
            df_plot_skill = df_for_prof
        else:
            df_plot_skill = pd.concat([df_plot_skill, df_for_prof])

    if profs_xy.shape[0] < 1000:
        df_plot_prof = pd.DataFrame(profs_xy).rename(columns={0:'x',1:'y'})
        df_plot_prof['Профессия'] = pd.Series(df_plot_prof.index).apply(lambda x: prof_index_to_corrected[x])
    else:
        # это потроение по th-idf признакам, а не профессиям -> нельзя построить карту профессий
        df_plot_prof = None

    return df_plot_skill, df_plot_prof

In [8]:
prof_index_to_prof_name.values()

dict_values(['Бизнес-аналитик', 'Data Scientist', 'Аналитик', 'NLP', 'Computer Vision', 'Администратор баз данных', 'Аналитик BI', 'Big Data', 'Продуктовый аналитик', 'Аналитик данных', 'Системный аналитик', 'ML инженер', 'Инженер данных'])

In [9]:
def plot_skill_map(df: pd.DataFrame, width=1000, height=600) -> plotly.graph_objs.Figure:



    color_list = [
        '#F8A19F', '#AA0DFE', '#3283FE', '#1CBE4F', '#C4451C', '#F6222E', 
        '#FE00FA', '#325A9B', '#FEAF16', 
        '#90AD1C', '#2ED9FF', '#B10DA1',
         '#909090', '#FBE426',
        '#FA0087', '#C075A6', '#FC1CBF'
    ]


    
    # custom visualization order
    # for better reproducability
    prof_order = [
        'Data Scientist', 'ML инженер', 'Computer Vision', 'NLP',
        'Инженер данных', 'Big Data', 'Администратор баз данных', 'Аналитик данных',
        'Аналитик', 'Бизнес-аналитик', 'Продуктовый аналитик', 'Аналитик BI',
        'Системный аналитик' ]

    fig = px.scatter(df, x='x', y='y',
                    color='Профессия', hover_name='Навык', 
                    hover_data= {'x':False, 'y':False, 'size':False, 'Профессия': False},
                    size='size', category_orders={'Профессия': prof_order},
                    #color_discrete_sequence=px.colors.qualitative.Plotly,
                    color_discrete_sequence=color_list,
                    title = None, width=width, height=height)

    fig.update_traces(marker=dict(opacity=0.7, line=dict(width=0.5, color='DarkSlateGrey')), 
                  selector=dict(mode='markers'))

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)

    return fig

In [10]:
df_plot_skill, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
                                              prof_index_to_prof_name, 
                                              top_n_skill_per_profession=100,
                                              norm_type='none', factor_alg_type='tsne')



In [11]:
df_plot_skill.drop_duplicates(subset=['x', 'y', 'skill_id', 'Навык'])

Unnamed: 0,x,y,skill_id,Навык,Профессия,size
0,25.143127,-9.562544,5,Бизнес-анализ,Бизнес-аналитик,15.800000
1,25.103373,-9.756906,158,Моделирование бизнес процессов,Бизнес-аналитик,13.891518
2,25.284517,-16.467566,2,Аналитическое мышление,Бизнес-аналитик,12.050000
3,24.821108,-7.667171,265,BPMN,Бизнес-аналитик,10.141518
4,24.834440,-10.984385,19,1С: Предприятие 8,Бизнес-аналитик,9.505357
...,...,...,...,...,...,...
416,-9.495220,-1.461552,1420,AWS,Инженер данных,0.937300
417,-9.603197,-2.289730,1007,ELK,Инженер данных,0.834325
418,-10.436560,-1.282676,855,Azure,Инженер данных,0.834325
419,-10.610202,-0.625837,473,PL/pgSQL,Инженер данных,0.800000


In [12]:
df = df_plot_skill.drop_duplicates(subset=['x', 'y', 'skill_id'])
df.shape

(421, 6)

In [13]:
df.groupby(['x', 'y']).filter(lambda x: len(x) > 1)

Unnamed: 0,x,y,skill_id,Навык,Профессия,size
237,-15.683398,4.827402,1642,Генеративный Дизайн,Computer Vision,1.228571
238,-15.683398,4.827402,1548,Generative Design,Computer Vision,1.228571
239,-15.683398,4.827402,2274,GD,Computer Vision,1.228571
284,-3.740696,24.152159,1208,Telegraph,Администратор баз данных,0.864378
287,-4.383193,23.488447,1884,Tantor,Администратор баз данных,0.864378
288,-3.740696,24.152159,1273,barman,Администратор баз данных,0.864378
290,-4.383193,23.488447,241,Database Engine,Администратор баз данных,0.864378


Это очень странно (выше, что разные навыки получили одну и ту же точку)

In [14]:
matrix.shape

(2299, 13)

In [15]:
np.unique(matrix, axis=0).shape

(748, 13)

Думаю объяснение выше: есть множество навыков, которые имеют абсолютно одиниковое предстваление в профессиях

Mean reciprocal rank https://en.wikipedia.org/wiki/Mean_reciprocal_rank

In [16]:
# df_plot_skill, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
#                                               prof_index_to_prof_name, 
#                                               top_n_skill_per_profession=30,
#                                               norm_type='skill', alg_type='tnse')

In [17]:
def get_mean_reciprocal_rank(df: pd.DataFrame, pairs: List[List[str]]) -> Tuple[float, int]:
    """Mean reciprocal rank https://en.wikipedia.org/wiki/Mean_reciprocal_rank
        
        Parameters
        ----------
        df: pd.DataFrame
            Data frame with columns ['x', 'y', 'Навык']
        pairs: List[List[str]]
            Target nearest skills pairs
   
        Returns
        -------
        Tuple[float, int] : 
            - mean_reciprocal_rank
            - pairs count
    """

    df = df.drop_duplicates(subset=['x', 'y', 'Навык'])

    # берем только те строки, которые есть в тестовых парах
    # берем только те пары, навыки которых есть в данных

    skills_test_set = set([])
    for p in skill_pairs:
        skills_test_set = skills_test_set.union(set(p))

    df = df[df['Навык'].apply(lambda x: x in skills_test_set)]
    df = df.reset_index(drop=True)

    su = df['Навык'].unique()
    pairs = [x for x in pairs if x[0] in su and x[1] in su]

    # считаем расстояния в наших данных
    x = df[['x', 'y']].to_numpy()
    dist = cdist(x, x, 'sqeuclidean')

    cs = 0
    for p in pairs:
        ind_p0 = df[df['Навык'] == p[0]].index[0]
        ind_p1 = df[df['Навык'] == p[1]].index[0]

        nn = np.argsort(dist[ind_p0, :])
        rank = np.where(nn==ind_p1)[0][0]
        cs += 1/(rank+1)

        nn = np.argsort(dist[ind_p1, :])
        rank = np.where(nn==ind_p0)[0][0]
        cs += 1/(rank+1)

    mean_reciprocal_rank = cs / (2 * len(pairs))
    return mean_reciprocal_rank, len(pairs)

get_mean_reciprocal_rank(df_plot_skill, skill_pairs)

(0.08933945645323976, 108)

Метрика **mean_reciprocal_rank@pairs=50**. Стремимся увеличить метрику, но сохранив количество пар, которые встречаются в данных, на уровне не менее 50




In [18]:
# оптимизация
k = 50
best_norm_type = ''
best_alg_type = ''
best_top_n_skill_per_profession = -1
best_mean_reciprocal_rank = 0
best_dim = 0
best_use_tsne = False
stat = []
for norm_type in ['none', 'skill', 'prof', 'bm', 'tfidf']:
    for alg_type, dim, use_tsne in \
        [('tsne', 5, True),
         ('als', 5, True),
         ('svd', 5, True),
         ('svd', 2, False),
         ('pca', 5, True),
         ('pca', 2, False),
         ('sim_euclid', 5, True),
         ('sim_cos', 5, True),
         ('umap', 5, True),
         ('umap', 2, False), ]:

        for top_n_skill_per_profession in range(20, 51, 1):
            df, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
                                    prof_index_to_prof_name, 
                                    top_n_skill_per_profession=top_n_skill_per_profession,
                                    norm_type=norm_type,
                                    factor_alg_type=alg_type)
            mrr, pairs_count = get_mean_reciprocal_rank(df, skill_pairs)

            print()
            print(norm_type, alg_type, dim, use_tsne, top_n_skill_per_profession)
            print(f'mrr = {mrr}, pairs_count = {pairs_count}')
            stat.append((norm_type, alg_type, dim, use_tsne, top_n_skill_per_profession, mrr, pairs_count))

            if pairs_count > k and mrr > best_mean_reciprocal_rank:
                best_mean_reciprocal_rank = mrr
                best_norm_type = norm_type
                best_alg_type = alg_type
                best_top_n_skill_per_profession = top_n_skill_per_profession
                best_dim = dim
                best_use_tsne = use_tsne

print()
print('best_mean_reciprocal_rank =', best_mean_reciprocal_rank)
print('best_norm_type =', best_norm_type)
print('best_alg_type =', best_alg_type)
print('best_dim =', best_dim)
print('best_use_tsne =', best_use_tsne)
print('best_top_n_skill_per_profession =', best_top_n_skill_per_profession)

df, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
                                    prof_index_to_prof_name, 
                                    top_n_skill_per_profession=best_top_n_skill_per_profession,
                                    norm_type=best_norm_type,
                                    factor_alg_type=best_alg_type,
                                    dim=best_dim,
                                    use_tsne=best_use_tsne)

plot_skill_map(df).show()

##!!! Возможно нужно иметь топ-н алгоритмов, потому что есть алгоритмы с одинаковой метрикой или близкой...


none tsne 5 True 20
mrr = 0.1811193673129521, pairs_count = 29

none tsne 5 True 21
mrr = 0.1779311293172396, pairs_count = 29

none tsne 5 True 22
mrr = 0.1733003355315871, pairs_count = 29

none tsne 5 True 23
mrr = 0.13693458812819143, pairs_count = 35

none tsne 5 True 24
mrr = 0.1393971426158016, pairs_count = 35

none tsne 5 True 25
mrr = 0.14203689847158257, pairs_count = 36

none tsne 5 True 26
mrr = 0.14332632479005833, pairs_count = 38

none tsne 5 True 27
mrr = 0.13040463530580973, pairs_count = 38

none tsne 5 True 28
mrr = 0.14696055578389483, pairs_count = 38

none tsne 5 True 29
mrr = 0.1311307068283463, pairs_count = 38

none tsne 5 True 30
mrr = 0.13541687204158834, pairs_count = 39

none tsne 5 True 31
mrr = 0.13049669935203112, pairs_count = 42

none tsne 5 True 32
mrr = 0.13936881806373969, pairs_count = 42

none tsne 5 True 33
mrr = 0.12630290880415554, pairs_count = 45

none tsne 5 True 34
mrr = 0.12217412989080692, pairs_count = 46

none tsne 5 True 35
mrr = 0.1

In [19]:
# Сравнить построение графика

# norm_type skill это плохо?

# очень интересно про косинусное растояние

In [20]:
# plot_skill_map(df_plot_skill).show()

In [21]:
df, _ = prepare_plot_df(skill_df, matrix, skill_index_to_corrected,
                                    prof_index_to_prof_name, 
                                    top_n_skill_per_profession=best_top_n_skill_per_profession,
                                    norm_type=best_norm_type,
                                    factor_alg_type=best_alg_type,
                                    dim=best_dim,
                                    use_tsne=best_use_tsne)

plot_skill_map(df).show()

In [22]:
df.to_csv(f'../notebooks/docker/front/data/best.csv')