In [1]:
import sys
sys.path.append('../')

import pandas as pd
import json
import numpy as np
import sys

from annotations.dataset_preprocessing import join_with_data, prepare_dataset



In [2]:
# Load the raw annotation export; lines=True tells pandas the file is JSON Lines
# (one JSON object per line).
data = pd.read_json('../annotations/data/annotations.json', lines=True)

In [3]:
# Row count right after loading, before the `annotations` column is exploded.
len(data)

5959

In [4]:
# `explode` turns each element of a list-like `annotations` cell into its own row,
# repeating the other columns (and the original index label) for every element.
data = data.explode('annotations')

In [5]:
# Row count after the explode: one row per element of the former `annotations` lists.
len(data)

17883

In [6]:
# Preview the exploded frame: the index labels repeat and each `annotations` cell
# is now a single record rather than a list.
data.head()

Unnamed: 0,id,text,annotations,meta,annotation_approver
0,1043,"@atteint: Ty no kurwa, że też ja na to nie wpa...","{'label': 1, 'user': 7, 'created_at': '2020-11...",{},
0,1043,"@atteint: Ty no kurwa, że też ja na to nie wpa...","{'label': 1, 'user': 1, 'created_at': '2020-12...",{},
0,1043,"@atteint: Ty no kurwa, że też ja na to nie wpa...","{'label': 1, 'user': 6, 'created_at': '2020-12...",{},
1,2377,@Kosciany: coś ponad 1 a mniej niż 2,"{'label': 1, 'user': 1, 'created_at': '2020-11...",{},
1,2377,@Kosciany: coś ponad 1 a mniej niż 2,"{'label': 1, 'user': 6, 'created_at': '2020-11...",{},


In [7]:
# Drop rows that ended up without an annotation (explode yields NaN for empty lists).
# Reassigning instead of using `inplace=True` keeps the step chainable and avoids
# mutating a frame that earlier cells have already displayed.
data = data.dropna(subset=['annotations'])
len(data)

17883

In [8]:
# Pull the annotator id ('user') and the assigned label ('label') out of each
# annotation record into flat columns of their own.
data = data.assign(
    annotator_id=data['annotations'].apply(lambda record: record['user']),
    annotation=data['annotations'].apply(lambda record: record['label']),
)

In [9]:
# Check the two new columns (`annotator_id`, `annotation`) next to the source records.
data.head()

Unnamed: 0,id,text,annotations,meta,annotation_approver,annotator_id,annotation
0,1043,"@atteint: Ty no kurwa, że też ja na to nie wpa...","{'label': 1, 'user': 7, 'created_at': '2020-11...",{},,7,1
0,1043,"@atteint: Ty no kurwa, że też ja na to nie wpa...","{'label': 1, 'user': 1, 'created_at': '2020-12...",{},,1,1
0,1043,"@atteint: Ty no kurwa, że też ja na to nie wpa...","{'label': 1, 'user': 6, 'created_at': '2020-12...",{},,6,1
1,2377,@Kosciany: coś ponad 1 a mniej niż 2,"{'label': 1, 'user': 1, 'created_at': '2020-11...",{},,1,1
1,2377,@Kosciany: coś ponad 1 a mniej niż 2,"{'label': 1, 'user': 6, 'created_at': '2020-11...",{},,6,1


In [10]:
# Sanity check: adding columns must not change the row count.
len(data)

17883

In [11]:
# For every (text id, label) pair, count the non-null values in each remaining column.
# After reset_index, the `annotations` column therefore holds how many annotations
# assigned that label to that text.
common = data.groupby(['id', 'annotation']).count().reset_index()

In [12]:
# Preview the per-(id, label) counts.
common.head()

Unnamed: 0,id,annotation,text,annotations,meta,annotation_approver,annotator_id
0,1006,2,2,2,2,2,2
1,1006,3,1,1,1,1,1
2,1007,1,2,2,2,0,2
3,1007,3,1,1,1,0,1
4,1008,1,2,2,2,2,2


In [13]:
# Keep only the (text id, label) pairs backed by exactly three annotations,
# then collect the ids of those texts and show how many there are.
common = common.loc[common['annotations'] == 3]
texts_id = common['id'].tolist()
len(texts_id)

5630

In [14]:
# Label distribution per annotator: for each (annotator_id, annotation) pair,
# the number of non-null values in every other column.
data.groupby(['annotator_id', 'annotation']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,text,annotations,meta,annotation_approver
annotator_id,annotation,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,5272,5272,5272,5272,28
1,2,646,646,646,646,9
1,3,41,41,41,41,3
6,1,5188,5188,5188,5188,31
6,2,705,705,705,705,9
6,3,66,66,66,66,0
7,1,5250,5250,5250,5250,29
7,2,662,662,662,662,10
7,3,53,53,53,53,2


In [15]:
# Split the annotations on label 3: rows carrying it go to `skipped`,
# every other row goes to `not_skipped`.
skipped = data.loc[data['annotation'].eq(3)]
not_skipped = data.loc[data['annotation'].ne(3)]

In [16]:
# Ids of the texts that have at least one row in `skipped`
# (one entry per such row, so an id can appear more than once).
ids = skipped['id']

In [17]:
def filter_by_number_of_annotations(df: pd.DataFrame, nbr: int) -> pd.DataFrame:
    '''
    Keeps only the rows of examples that have at least ``nbr`` rows in ``df``.
    The input frame is left untouched; the size of the result is printed.
    :param df: dataframe with (at least) the columns 'id' and 'annotation'
    :param nbr: minimal number of rows sharing an 'id' for that example to be kept
    :return: copy of the qualifying rows with an extra 'occurance' column holding
             the number of rows that share the row's 'id'
    '''
    # Group size broadcast back onto every row of the group.
    rows_per_example = df.groupby('id')['annotation'].transform('size')
    # The column name (including its spelling) is relied upon by the caller's groupby.
    counted = df.assign(occurance=rows_per_example)
    kept = counted.loc[counted['occurance'] >= nbr].copy()
    print(len(kept))
    return kept


def fleiss_kappa_from_array(annotations: np.ndarray):
    '''
    Calculates Fleiss kappa (inter-annotator agreement score). Equal number of annotations for every example assumed
    (the count is read from the first row only and is not validated against the other rows).
    :param annotations: 2-D array; each ij-th cell denotes the number of raters who assigned the i-th example to the j-th category
    :return: Fleiss kappa for given annotations; 1 when every annotation falls into a single category;
             None when kappa is undefined (no examples, or fewer than 2 annotations per example)
    '''
    annotations = np.asarray(annotations)
    if annotations.shape[0] == 0:
        return None  # no examples: nothing to measure agreement on
    n = annotations[0].sum()  # number of annotations per example
    if n < 2:
        return None  # n * (n - 1) would be 0, pairwise agreement is undefined
    # Share of all annotations that went to each category.
    p_j = annotations.sum(axis=0)
    p_j = p_j / p_j.sum()
    # Per-example agreement: fraction of rater pairs that chose the same category.
    P_i = ((annotations * annotations).sum(axis=1) - n) / (n * (n - 1))
    P_mean = P_i.mean()
    # Agreement expected by chance.
    P_e = (p_j * p_j).sum()
    if P_e == 1:
        # Only one category was ever used; the kappa ratio would be 0 / 0.
        return 1
    kappa = (P_mean - P_e) / (1 - P_e)
    return kappa


def calculate_weighted_kappa(annotations: pd.DataFrame) -> float:
    '''
    Calculates weighted Fleiss kappa for given annotations set.
    Examples are grouped by how many annotations they received, Fleiss kappa is computed
    per group, and the group kappas are averaged weighted by the number of examples in each group.
    :param annotations: a 3-column dataframe, where 1st column is example_id, 2nd column is annotator_id and 3rd column is annotation.
                        Columns are read by position here, but filter_by_number_of_annotations looks up
                        'id' and 'annotation' by name, so the frame must use those column names.
    :return: weighted Fleiss kappa, or None when no example has at least 2 annotations
    '''
    columns = annotations.columns
    example_id, worker_id, label = columns[0], columns[1], columns[2]
    filtered_df = filter_by_number_of_annotations(annotations, 2)  # filter out examples with less than 2 annotations
    if filtered_df.empty:
        return None
    weights, kappas = [], []
    # Fleiss kappa needs the same number of ratings per example, hence one kappa per 'occurance' value.
    for _, group in filtered_df.groupby('occurance'):
        # Rows: examples, columns: labels, cells: number of annotations with that label.
        pivoted = group.pivot_table(values=worker_id, index=[example_id], columns=[label], aggfunc=len, fill_value=0)
        if pivoted.shape[1] == 1:  # all one label
            kappa = 1
        else:
            kappa = fleiss_kappa_from_array(pivoted.to_numpy(copy=True))
        kappas.append(kappa)
        weights.append(pivoted.shape[0])  # weight = number of examples in the group
    return float(np.average(kappas, weights=weights))

In [18]:
# Per-annotator label distribution again, this time restricted to the non-skipped
# rows of the texts listed in `texts_id`.
not_skipped[not_skipped['id'].isin(texts_id)].groupby(['annotator_id', 'annotation']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,text,annotations,meta,annotation_approver
annotator_id,annotation,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,5056,5056,5056,5056,27
1,2,571,571,571,571,7
6,1,5056,5056,5056,5056,27
6,2,571,571,571,571,7
7,1,5056,5056,5056,5056,27
7,2,576,576,576,576,8


In [19]:
# Rows of texts listed in `texts_id` where annotator 6 or annotator 7 assigned label 2.
not_skipped[(not_skipped['id'].isin(texts_id)) & (not_skipped['annotator_id'].isin([6,7])) & (not_skipped['annotation']==2)]

Unnamed: 0,id,text,annotations,meta,annotation_approver,annotator_id,annotation
6,1009,@AgentGRU: kurwa człowieku no do kurwy nędzych...,"{'label': 2, 'user': 6, 'created_at': '2020-11...",{},,6,2
6,1009,@AgentGRU: kurwa człowieku no do kurwy nędzych...,"{'label': 2, 'user': 7, 'created_at': '2020-12...",{},,7,2
7,1010,@diogene: a ile razy mam pisac ze ja mam w dup...,"{'label': 2, 'user': 7, 'created_at': '2020-11...",{},,7,2
7,1010,@diogene: a ile razy mam pisac ze ja mam w dup...,"{'label': 2, 'user': 6, 'created_at': '2020-12...",{},,6,2
9,1045,@Sl_w_k_1: Zobacz ta ustawę i jak pierdoli to ...,"{'label': 2, 'user': 6, 'created_at': '2020-11...",{},,6,2
...,...,...,...,...,...,...,...
5949,1409,"@Opalka: Mnie tez, podobnie jak przy innych ka...","{'label': 2, 'user': 6, 'created_at': '2020-12...",{},alicja,6,2
5949,1409,"@Opalka: Mnie tez, podobnie jak przy innych ka...","{'label': 2, 'user': 7, 'created_at': '2020-12...",{},alicja,7,2
5956,1215,@muwieszeptem: tylko niech potem nikogo nie zd...,"{'label': 2, 'user': 7, 'created_at': '2020-12...",{},marcin,7,2
5957,2315,"@Gon70: a wiesz co jest najgorsze, ze jak Mich...","{'label': 2, 'user': 6, 'created_at': '2020-12...",{},marcin,6,2


In [20]:
# Agreement over the texts that have no row in `skipped`. The column order matters:
# calculate_weighted_kappa reads example id, annotator id and label by position.
calculate_weighted_kappa(
    not_skipped.loc[~not_skipped['id'].isin(ids), ['id', 'annotator_id', 'annotation']]
)

17468


0.8870202850665773

In [21]:
# join_with_data / prepare_dataset are project helpers imported from
# annotations.dataset_preprocessing; their behaviour is not visible in this notebook.
# NOTE(review): presumably the first merges the annotations with the scraped comments
# and the second derives the classification dataset — confirm against the module.
# `data` is the full annotation frame here, i.e. it still includes the label-3 rows.
general_data = join_with_data(data, src_filepath='../wykop_scraper/data/latest/filtered_comments.csv')
cleaned_data = prepare_dataset(general_data)

# Persist both frames as CSV (to_csv also writes the index unless index=False is passed).
general_data.to_csv('../annotations/data/comments_with_annotations.csv')
cleaned_data.to_csv('../annotations/data/classification_dataset.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
