In [None]:
import pandas as pd
import os

import numpy as np

# Data Statistics

In [40]:
path = "../../data/annotated/processed/"
metrics = ['size', 'avg_sentence_length', 'avg_nb_skills', 'max_nb_skills', 'percent_without_skill', 'avg_span_length', 'total_unique_skills', 'skill_overlap']
res_df = pd.DataFrame(columns=['dataset'] + metrics)
for dataset_name in ['green', 'skillspan', 'fijo', 'sayfullina', 'kompetencer', 'gnehm']:
    print("###############", dataset_name, "###############")
    test = pd.read_json(os.path.join(path, dataset_name, 'test.json'))
    train = pd.read_json(os.path.join(path, dataset_name, 'train.json'))
    all = pd.concat([test, train])
    dataset_metrics = {}
    for split_name, split in {'train':train, 'test':test, 'all':all}.items():
        # avg sentence length
        split['sentence_length'] = split['tokens'].apply(lambda x: len(x))
        avg_sentence_length = split['sentence_length'].mean()
        # nb of skills per sentence
        split['nb_skills'] = split['list_extracted_skills'].apply(lambda x: len(x))
        avg_nb_skills = split['nb_skills'].mean()
        max_nb_skills = split['nb_skills'].max()
        nb_negative = len(split[split['nb_skills'] == 0]) / len(split) * 100
        # avg span length
        split['avg_span_length'] = split['skill_spans'].apply(lambda x: np.mean([span[1][1] - span[1][0] for span in x]) if len(x) > 0 else 0)
        avg_span_length = np.mean([length for length in split['avg_span_length'] if length != 0])

        dataset_metrics[split_name] = {'size': len(split), 'avg_sentence_length': avg_sentence_length, 'avg_nb_skills': avg_nb_skills, 'max_nb_skills':max_nb_skills, 'percent_without_skill':nb_negative, 'avg_span_length': avg_span_length}
    print(pd.DataFrame(dataset_metrics))
    # overall metrics:
    # nb of unique skills in the whole split
    all_unique_skills = all.explode('list_extracted_skills')['list_extracted_skills'].unique()
    # skill overlap
    test_unique_skills = test.explode('list_extracted_skills')['list_extracted_skills'].unique()
    train_unique_skills = train.explode('list_extracted_skills')['list_extracted_skills'].unique()
    skill_overlap = len(set(train_unique_skills).intersection(set(test_unique_skills))) / len(all_unique_skills) * 100
    dataset_res = pd.DataFrame({'dataset': dataset_name, **dataset_metrics['all'], 'total_unique_skills': len(all_unique_skills), 'skill_overlap': skill_overlap}, index=[len(res_df)])
    res_df = pd.concat([res_df, dataset_res])



############### green ###############
                             train        test          all
size                   8668.000000  335.000000  9003.000000
avg_sentence_length      23.265805   22.940299    23.253693
avg_nb_skills             1.330641    2.005970     1.355770
max_nb_skills            24.000000   17.000000    24.000000
percent_without_skill    40.759114   30.447761    40.375430
avg_span_length           5.647831    2.685889     5.525317
############### skillspan ###############
                             train         test          all
size                   4782.000000  3569.000000  8351.000000
avg_sentence_length      18.275826    11.988232    15.588672
avg_nb_skills             0.424509     0.305688     0.373728
max_nb_skills            15.000000    11.000000    15.000000
percent_without_skill    79.401924    83.636873    81.211831
avg_span_length           4.121293     3.561679     3.916752
############### fijo ###############
                            train   

In [41]:
res_df

Unnamed: 0,dataset,size,avg_sentence_length,avg_nb_skills,max_nb_skills,percent_without_skill,avg_span_length,total_unique_skills,skill_overlap
0,green,9003,23.253693,1.35577,24,40.37543,5.525317,9974,1.473832
1,skillspan,8351,15.588672,0.373728,15,81.211831,3.916752,2706,3.178123
2,fijo,449,22.309577,1.877506,17,5.122494,9.308204,623,2.568218
3,sayfullina,5556,14.338373,1.00324,3,0.107991,1.711552,983,40.895219
4,kompetencer,1040,14.155769,0.490385,14,79.134615,3.430034,457,2.625821
5,gnehm,22374,10.305265,0.381872,26,83.074104,1.301364,4661,5.942931


In [None]:
# NB of unique skill normalized by data size
# check max span length

# Data cleaning

In [None]:
# Rule-based detection of 'bad' sentences