In [1]:
import plotly.express as px
from datasets import load_dataset

In [2]:
import re

pl_char = 'żźćńółęąś'


def maper_text_function(row):
    text = row['text']

    n_of_sentences = count_sentences(text)
    n_of_words = count_words(text)
    len_text = count_characters(text)

    return {'number_of_sentences': n_of_sentences, 'number_of_words': n_of_words, 'number_of_characters': len_text}


def count_characters(text):
    return len(re.sub('\s+', "", text))


def count_words(text):
    pattern = f'[a-zA-z{pl_char}{pl_char.upper()}]+'
    return len(re.findall(pattern, text))


def count_sentences(text):
    pattern = f'(\.(\s*)[A-Z{pl_char.upper()}])|$'
    return len(re.findall(pattern, text))

# count_words('Na samym wejściu hotel śmierdzi. W pokojach jest pleśń na ścianach , brudny dywan')


In [19]:

def plot_hist(target: list, maper_values, title, label_name):

    target = list(map(lambda x: maper_values[x], target))

    fig = px.histogram(x=target, category_orders={'x': list(maper_values.values())}, histnorm='percent',
                       labels={'x': label_name},
                       title=title)

    fig.update_layout(
        title_x=0.5,
        title_font_size=25,
        autosize=True,
        # line_color='black',
        #  meanline_visible=True,
        font_color='white',
        hoverlabel_bordercolor='lightseagreen'
    )

    fig.update_xaxes(rangeselector_font_size=20)
    fig.show()


def plot_print_text_statistics(dataset, target_col, mapper_target_values, title):
    text_desc_col = ['number_of_sentences', 'number_of_words', 'number_of_characters']
    print(dataset.to_pandas()[text_desc_col + [target_col]].describe())

    # target = list(map(lambda x: maper_values[x], dataset[target_col]))

    fig = make_subplots(rows=1, cols=3,
                        # specs=[
                        #     [{"colspan": 3}, None, None],
                        #     [{}, {}, {}]
                        # ],
                        # subplot_titles=("Target","Second Subplot", "Third Subplot", 'a'),
                        )

    # fig.append_trace(
    #         go.Histogram(x=target,
    #         category_orders={'x': list(maper_values.values())}, histnorm='percent',
    #                    labels={'x': target_col}),
    #         row=1, col=1)

    for i, col in enumerate(text_desc_col):
        fig.append_trace(
            go.Violin(y=dataset[col], box_visible=True, line_color='black',
                      meanline_visible=True, fillcolor='lightseagreen', opacity=0.6,
                      x0=col.replace('_', " ")),
            row=1, col=i + 1)

    fig.update_layout(
        title_x=0.5,
        autosize=True,
        title_text="Distribution of text response",
        showlegend=False

    )

    plot_hist(dataset[target_col], mapper_target_values, title, target_col)

    fig.show()



# polemo2-official

In [20]:
dataset = load_dataset("clarin-pl/polemo2-official")
train_dataset = dataset['train'].map(maper_text_function)
dataset

No config specified, defaulting to: polemo2-official/all_text
Found cached dataset polemo2-official (/home/andrzej/.cache/huggingface/datasets/clarin-pl___polemo2-official/all_text/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/andrzej/.cache/huggingface/datasets/clarin-pl___polemo2-official/all_text/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-4dbc51a3e3b85af0.arrow


DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 6573
    })
    validation: Dataset({
        features: ['text', 'target'],
        num_rows: 823
    })
    test: Dataset({
        features: ['text', 'target'],
        num_rows: 820
    })
})

Output ('target' column): label for sentence sentiment ('zero': neutral, 'minus': negative, 'plus': positive, 'amb': ambiguous)
Around: 8000 objects

In [21]:
maper_values = {0: '0_neutral', 1: '1_negative', 2: '2_positive', 3: '3_ambiguous'}
title =  'Distribution of responses for the polemo2-official training dataset'
plot_print_text_statistics(train_dataset, 'target', maper_values,  title)

       number_of_sentences  number_of_words  number_of_characters       target
count          6573.000000      6573.000000           6573.000000  6573.000000
mean              7.162179       110.147878            625.412901     1.528069
std               4.494216        62.680246            349.882118     0.971135
min               1.000000         1.000000              1.000000     0.000000
25%               4.000000        68.000000            389.000000     1.000000
50%               6.000000        98.000000            560.000000     1.000000
75%               9.000000       131.000000            765.000000     2.000000
max              50.000000       765.000000           4405.000000     3.000000


Example

In [5]:
unique_target = set(dataset['train']['target'])

[dataset['train'][dataset['train']['target'].index(x)] for x in unique_target]

[{'text': 'Gdy Dominika Nowis zastanawia się , jak zniszczyć raka - zawsze sprawdza , czy jej metody walki nie zniszczą naszych zdrowych organów . To ona jako pierwsza w świecie zbadała , czy inhibitory proteasomu upośledzają funkcję serca . Prace , jakie prowadziła razem z dr . Gaetano Vattemim z Uniwersytetu w Weronie , miały pomoc lekarzom w lepszym doborze pacjentów do leczenia inhibitorami proteasomu , aby ograniczyć ryzyko pojawienia się nieoczekiwanych działań niepożądanych . Okazało się , że chociaż leki są toksyczne , to ich efekty mijają po pewnym czasie od odstawienia . Pacjenci nie muszą się więc bać , że wyleczą się z raka po to , żeby – zdrowi – umrzeć na serce . Badaczka od wielu lat szuka także sposobów na zabijanie raka światłem . Najpierw " uczula " komórki nowotworowe na światło , a potem stosuje zabójczą kombinację tlenu i światła laserowego . Rak jest jednak twardym przeciwnikiem . Kiedy okazało się , że niektóre nowotwory potrafią przeżyć tak morderczą terapię , b

# Allegro_reviews

In [6]:
dataset_allegro = load_dataset("allegro_reviews")
train_dataset = dataset_allegro['train'].map(maper_text_function)
dataset_allegro

Using custom data configuration default
Found cached dataset allegro_reviews (/home/andrzej/.cache/huggingface/datasets/allegro_reviews/default/1.1.0/d0fd6bf6f7f8f2c8faed7ecf9d295c991eac403ca516d3d3593061a913c0476b)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/9577 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'rating'],
        num_rows: 9577
    })
    test: Dataset({
        features: ['text', 'rating'],
        num_rows: 1006
    })
    validation: Dataset({
        features: ['text', 'rating'],
        num_rows: 1002
    })
})




Around: 10k object


In [7]:
maper_values = {1: '1_really_negative', 2: '2_negative', 3: '3_neutral', 4: '4_positive', 5: '5_really_positive'}

title = 'Distribution of responses for the allegro_reviews training dataset'
plot_print_text_statistics(train_dataset, 'rating', maper_values, title)

       number_of_sentences  number_of_words  number_of_characters       rating
count          9577.000000      9577.000000           9577.000000  9577.000000
mean              4.810275        71.056281            398.830323     3.517385
std               3.183478        46.133819            255.526464     1.542132
min               1.000000         0.000000              0.000000     1.000000
25%               3.000000        54.000000            299.000000     2.000000
50%               4.000000        63.000000            355.000000     4.000000
75%               6.000000        81.000000            461.000000     5.000000
max              65.000000      1667.000000           4801.000000     5.000000


Example

In [8]:
[dataset_allegro['train'][dataset_allegro['train']['rating'].index(x)] for x in set(dataset_allegro['train']['rating'])]


[{'text': 'Na słuchawkę czekałam spory czas a po zadzwonieniu okazało się ,że paczka im się zawieruszyła i w ten sam dzień mieli wysłać najszybszym kurierem i mimo to i tak czekałam znowu gdzie bardzo mi była potrzebna.  Do tego niby bateria trzyma długo gdzie tak nie jest i słuchawka się rozłącza . Dostałam ponownie wiadomość o tym ,że znowu dostanę te samą paczkę po wiadomości do nich ,że to pomyłka Pan odpisał mi,że nie bo to gratis no to się nastawiłam ,że gratis i ,że opłacona za to ,że tyle musiałam czekać .Tym razem przyjeżdża listonosz i mówi paczka do opłaty .Paczka poszła na pocztę i tam sobie leży ;) !  ',
  'rating': 1.0},
 {'text': 'Pomysł na produkt (uchwyt na kierownicę) bardzo dobry natomiast wykonanie uchwytu słabe. Po około 1-2 miesiącach sporadycznego użytkowania dolna częśc uchwytu po prostu się odrywa od reszty i uchwyt nadaje się do wyrzucenia. Dolna część uchwytu oderwała się w obu sztukach zakupionego przedmiotu. Wystarczy po prostu produkować uchwyt z lepszej j

# hate_speech_pl

In [9]:
dataset = load_dataset("hate_speech_pl")
train_dataset = dataset['train'].map(maper_text_function)
dataset


Using custom data configuration default
Found cached dataset hate_speech_pl (/home/andrzej/.cache/huggingface/datasets/hate_speech_pl/default/1.1.0/40101693880e807040a6b999faa7441cc231e95c81eefa5922ff4a76c8aa48fd)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/13887 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text_id', 'annotator_id', 'minority_id', 'negative_emotions', 'call_to_action', 'source_of_knowledge', 'irony_sarcasm', 'topic', 'text', 'rating'],
        num_rows: 13887
    })
})

Only training dataset

In [10]:

maper_values = {0: '0_neutral', 1: '1_negative', 2: '2_more_negative', 3: '3_really_negative', 4: '4_the_most_negative'}

title = 'Distribution of responses for the hate_speach_pl training dataset'
plot_print_text_statistics(train_dataset, 'rating', maper_values, title)

       number_of_sentences  number_of_words  number_of_characters  \
count         13887.000000     13887.000000          13887.000000   
mean              1.435515        34.509901            192.242241   
std               4.932966        89.318656            529.968622   
min               1.000000         2.000000             17.000000   
25%               1.000000        17.000000             94.000000   
50%               1.000000        24.000000            132.000000   
75%               1.000000        34.000000            188.000000   
max             214.000000      4144.000000          22470.000000   

             rating  
count  13887.000000  
mean       0.551739  
std        0.889979  
min        0.000000  
25%        0.000000  
50%        0.000000  
75%        1.000000  
max        4.000000  


Example


In [11]:
unique_target = set(dataset['train']['rating'])

[dataset['train'][dataset['train']['rating'].index(x)] for x in unique_target]


[{'id': 1,
  'text_id': 121713,
  'annotator_id': 1,
  'minority_id': 72,
  'negative_emotions': True,
  'call_to_action': True,
  'source_of_knowledge': 2,
  'irony_sarcasm': True,
  'topic': 18,
  'text': ' <font color="blue"> Niemiec</font> mówi co innego',
  'rating': 0},
 {'id': 7,
  'text_id': 99496,
  'annotator_id': 10,
  'minority_id': 51,
  'negative_emotions': True,
  'call_to_action': True,
  'source_of_knowledge': 2,
  'irony_sarcasm': True,
  'topic': 11,
  'text': ' <s> Podobno</s> <font color="blue"> homoseksualiści</font> mają normalny popęd seksualny tylko uległ on zaburzeniu często na skutek trudnych przeżyć urazów emocjonalnych',
  'rating': 1},
 {'id': 6,
  'text_id': 52072,
  'annotator_id': 1,
  'minority_id': 80,
  'negative_emotions': True,
  'call_to_action': True,
  'source_of_knowledge': 2,
  'irony_sarcasm': True,
  'topic': 7,
  'text': ' Tak <s> Chinczycy</s> jak i <font color="blue"> Wietnamczycy</font> rzadko siedza na zasilkach i rzadko kombinuja jak o


Data Fields

   id: unique identifier of the entry
    text_id: text identifier, useful when a single text is rated several times by different annotators
    annotator_id: identifier of the person who annotated the text
    minority_id: the internal identifier of the minority described in the text
    negative_emotions: boolean indicator of the presence of negative emotions in the text
    call_to_action: boolean indicator set to true, if the text calls the audience to perform any action, typically with a negative emotions
    source_of_knowledge: categorical variable, describing the source of knowledge for the post rating - 0, 1 or 2 (direct, lexical or contextual, but the description of the meaning for different values couldn't be found)
    irony_sarcasm: boolean indicator of the present of irony or sarcasm
    topic: internal identifier of the topic the text is about
    text: post text content
    rating: integer value, from 0 to 4 - the higher the value, the more negative the text content is


# poleval2019_cyberbullying


In [12]:
dataset = load_dataset("poleval2019_cyberbullying", 'task02')
train_dataset = dataset['train'].map(maper_text_function)
dataset


Found cached dataset poleval2019_cyberbullying (/home/andrzej/.cache/huggingface/datasets/poleval2019_cyberbullying/task02/1.0.0/ce6060c56dae43c469bab309a7573b86299b0bcc2484e85cfe0ae70b5f770450)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10041 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10041
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})

In [13]:
maper_values = {0: '0_neutral', 1: '1_negative', 2: '2_more_negative'}


title = 'Distribution of responses for the poleval2019_cyberbullying training dataset'

plot_print_text_statistics(train_dataset,'label',maper_values,title)

       number_of_sentences  number_of_words  number_of_characters  \
count         10041.000000     10041.000000          10041.000000   
mean              1.208844        12.176277             82.215018   
std               0.481434         4.790158             29.541680   
min               1.000000         1.000000              6.000000   
25%               1.000000         8.000000             59.000000   
50%               1.000000        11.000000             80.000000   
75%               1.000000        16.000000            105.000000   
max               5.000000        32.000000            200.000000   

              label  
count  10041.000000  
mean       0.144308  
std        0.492564  
min        0.000000  
25%        0.000000  
50%        0.000000  
75%        0.000000  
max        2.000000  


Example

In [14]:
unique_target = set(dataset['train']['label'])

[dataset['train'][dataset['train']['label'].index(x)] for x in unique_target]


[{'text': 'Dla mnie faworytem do tytułu będzie Cracovia. Zobaczymy, czy typ się sprawdzi.',
  'label': 0},
 {'text': '@anonymized_account @anonymized_account @anonymized_account Jak narazie to masz przywidzenia co nie zmienia faktu że cały czas jesteś idiotą.',
  'label': 1},
 {'text': '@anonymized_account @anonymized_account @anonymized_account Gdzie jest @anonymized_account . Brudziński jesteś kłamcą i marnym kutasem @anonymized_account',
  'label': 2}]

# Paul/hatecheck-polish


In [15]:
dataset = load_dataset("Paul/hatecheck-polish")
dataset = dataset['test'].rename_column('test_case','text')
train_dataset = dataset.map(maper_text_function)
dataset


Using custom data configuration Paul--hatecheck-polish-4a18ae0888047920
Found cached dataset csv (/home/andrzej/.cache/huggingface/datasets/Paul___csv/Paul--hatecheck-polish-4a18ae0888047920/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3815 [00:00<?, ?ex/s]

Dataset({
    features: ['mhc_case_id', 'functionality', 'text', 'label_gold', 'target_ident', 'ref_case_id', 'ref_templ_id', 'templ_id', 'case_templ', 'gender_male', 'gender_female', 'label_annotated', 'label_annotated_maj', 'disagreement_in_case', 'disagreement_in_template'],
    num_rows: 3815
})

In [16]:
maper_values = {'non-hateful': 'non-hateful', 'hateful': 'hateful'}

title = 'Distribution of responses for the Paul/hatecheck-polish training dataset'

plot_print_text_statistics(train_dataset,'label_annotated_maj',maper_values,title)

       number_of_sentences  number_of_words  number_of_characters
count          3815.000000      3815.000000           3815.000000
mean              1.036697         7.647444             43.062385
std               0.188042         3.227309             16.399678
min               1.000000         1.000000              8.000000
25%               1.000000         5.000000             30.000000
50%               1.000000         7.000000             41.000000
75%               1.000000         9.000000             53.000000
max               2.000000        26.000000            103.000000


Example

In [17]:
unique_target = set(dataset['label_annotated_maj'])

[dataset[dataset['label_annotated_maj'].index(x)] for x in unique_target]


[{'mhc_case_id': 'polish-1',
  'functionality': 'derog_neg_emote_h',
  'text': 'Nienawidzę gejów.',
  'label_gold': 'hateful',
  'target_ident': 'gay people',
  'ref_case_id': None,
  'ref_templ_id': None,
  'templ_id': 1,
  'case_templ': 'Nienawidzę [IDENT_P].',
  'gender_male': None,
  'gender_female': None,
  'label_annotated': "['hateful', 'hateful', 'hateful']",
  'label_annotated_maj': 'hateful',
  'disagreement_in_case': False,
  'disagreement_in_template': False},
 {'mhc_case_id': 'polish-852',
  'functionality': 'slur_h',
  'text': 'To naturalne dla mośka jak ty.',
  'label_gold': 'hateful',
  'target_ident': 'jewish people',
  'ref_case_id': None,
  'ref_templ_id': None,
  'templ_id': 121,
  'case_templ': 'To naturalne dla [SLR_S] jak ty.',
  'gender_male': None,
  'gender_female': None,
  'label_annotated': "['hateful', 'non-hateful', 'non-hateful']",
  'label_annotated_maj': 'non-hateful',
  'disagreement_in_case': True,
  'disagreement_in_template': False}]