# Analysis of Manual Labeling from Additional New Data 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [2]:
# Load the BBC news corpus as a plain comma-separated file.
# `sep` and `delimiter` are aliases in pandas.read_csv; passing both makes
# current pandas raise "Specified a sep and a delimiter; you can only specify
# one", so only the comma separator (the value that took effect before) is kept.
dfn = pd.read_csv('bbc_dataset_news.csv', sep=',')

# Text clean-up, applied as regex substitutions to every string cell:
dfn = dfn.replace({r'\r\n': ''}, regex=True)  # strip Windows line breaks
# NOTE(review): this pattern removes every "'s" sequence, not only possessives
# (a quoted word starting with "s" would also be trimmed) - confirm intended.
dfn = dfn.replace('\'s','', regex=True)
dfn = dfn.replace('\"','', regex=True)        # drop double quotes
dfn = dfn.replace('  ',' ', regex=True)       # collapse double spaces (single pass)

# Approximate word count: number of single spaces in the article plus one.
dfn['length'] =  dfn['news'].str.count(' ') + 1
dfn

Unnamed: 0,news,type,length
0,China had role in Yukos split-up China lent Ru...,business,443
1,Oil rebounds from weather effect Oil prices re...,business,233
2,Indonesia 'declines debt freeze' Indonesia no ...,business,349
3,$1m payoff for former Shell boss Shell is to p...,business,236
4,US bank in $515m SEC settlement Five Bank of A...,business,386
...,...,...,...
2220,Microsoft launches its own search Microsoft ha...,tech,426
2221,Warnings about junk mail deluge The amount of ...,tech,447
2222,Microsoft gets the blogging bug Software giant...,tech,233
2223,Gamers snap up new Sony PSP Gamers have bought...,tech,181


In [4]:
# Keep only the rows whose `length` value exceeds 350 and discard the `type`
# column from the result.
is_long_article = dfn['length'] > 350
dfn_filtered = dfn.loc[is_long_article].drop(columns=['type'])
dfn_filtered

Unnamed: 0,news,length
0,China had role in Yukos split-up China lent Ru...,443
4,US bank in $515m SEC settlement Five Bank of A...,386
7,US seeks new $280bn smoker ruling The US Justi...,369
11,UK bank seals South Korean deal UK-based bank ...,647
14,US adds more jobs than expected The US economy...,364
...,...,...
2217,'Friends fear' with lost mobiles People are be...,573
2218,Podcasts mark rise of DIY radio An Apple iPod ...,1028
2220,Microsoft launches its own search Microsoft ha...,426
2221,Warnings about junk mail deluge The amount of ...,447


In [5]:
# Write the filtered articles to disk; the DataFrame index is written as the
# first CSV column (pandas default, spelled out here).
dfn_filtered.to_csv('additional_data_to_be_labeled.csv', index=True)

In [6]:
# Load the labeled file and drop the bookkeeping columns that are not needed
# for the agreement analysis.
unused_columns = ['original_idx', 'type', 'length']
dfn_labeled = pd.read_csv('filtered_dataset_mlc.csv').drop(columns=unused_columns)
dfn_labeled

Unnamed: 0,news,category_mlc
0,China had role in Yukos split-up China lent Ru...,talk.politics.misc
1,US bank in $515m SEC settlement Five Bank of A...,talk.politics.misc
2,US seeks new $280bn smoker ruling The US Justi...,talk.politics.misc
3,UK bank seals South Korean deal UK-based bank ...,talk.politics.misc
4,US adds more jobs than expected The US economy...,talk.politics.misc
...,...,...
1023,'Friends fear' with lost mobiles People are be...,sci.electronics
1024,Podcasts mark rise of DIY radio An Apple iPod ...,comp.sys.mac.hardware
1025,Microsoft launches its own search Microsoft ha...,comp.os.ms-windows.misc
1026,Warnings about junk mail deluge The amount of ...,sci.electronics


In [7]:
# Read the two remaining label files in one pass and show both frames.
labeled_pcc, labeled_yhg = map(
    pd.read_csv,
    ('filtered_dataset_pcc.csv', 'filtered_dataset_yhg.csv'),
)
(labeled_pcc, labeled_yhg)

(                 category_pcc
 0          talk.politics.misc
 1          talk.politics.misc
 2          talk.politics.misc
 3          talk.politics.misc
 4          talk.politics.misc
 ...                       ...
 1023             misc.forsale
 1024    comp.sys.mac.hardware
 1025  comp.os.ms-windows.misc
 1026          sci.electronics
 1027    comp.sys.mac.hardware
 
 [1028 rows x 1 columns],
             category_yhg
 0     talk.politics.misc
 1     talk.politics.misc
 2           misc.forsale
 3     talk.politics.misc
 4     talk.politics.misc
 ...                  ...
 1023     sci.electronics
 1024     sci.electronics
 1025     sci.electronics
 1026     sci.electronics
 1027     sci.electronics
 
 [1028 rows x 1 columns])

In [8]:
# Place the three label sets side by side. Column-wise concat aligns rows on
# the index, so all three frames are expected to share the same row index.
label_frames = [dfn_labeled, labeled_pcc, labeled_yhg]
dfn_labeled = pd.concat(label_frames, axis='columns')
dfn_labeled

Unnamed: 0,news,category_mlc,category_pcc,category_yhg
0,China had role in Yukos split-up China lent Ru...,talk.politics.misc,talk.politics.misc,talk.politics.misc
1,US bank in $515m SEC settlement Five Bank of A...,talk.politics.misc,talk.politics.misc,talk.politics.misc
2,US seeks new $280bn smoker ruling The US Justi...,talk.politics.misc,talk.politics.misc,misc.forsale
3,UK bank seals South Korean deal UK-based bank ...,talk.politics.misc,talk.politics.misc,talk.politics.misc
4,US adds more jobs than expected The US economy...,talk.politics.misc,talk.politics.misc,talk.politics.misc
...,...,...,...,...
1023,'Friends fear' with lost mobiles People are be...,sci.electronics,misc.forsale,sci.electronics
1024,Podcasts mark rise of DIY radio An Apple iPod ...,comp.sys.mac.hardware,comp.sys.mac.hardware,sci.electronics
1025,Microsoft launches its own search Microsoft ha...,comp.os.ms-windows.misc,comp.os.ms-windows.misc,sci.electronics
1026,Warnings about junk mail deluge The amount of ...,sci.electronics,sci.electronics,sci.electronics


In [9]:
# For each pair of label columns, flag the rows where both labels match
# (1 = same label, 0 = different).
rater_pairs = {
    'mlc/pcc': ('category_mlc', 'category_pcc'),
    'pcc/yhg': ('category_yhg', 'category_pcc'),
    'yhg/mlc': ('category_mlc', 'category_yhg'),
}
for pair_column, (first_labels, second_labels) in rater_pairs.items():
    same_label = dfn_labeled[first_labels] == dfn_labeled[second_labels]
    dfn_labeled[pair_column] = same_label.astype(int)

# Per-row agreement: share of the three pairs that agree (0, 1/3, or 1;
# 2/3 cannot occur, since two agreeing pairs force the third to agree).
dfn_labeled['agreement'] = dfn_labeled[list(rater_pairs)].sum(axis=1) / 3
dfn_labeled

Unnamed: 0,news,category_mlc,category_pcc,category_yhg,mlc/pcc,pcc/yhg,yhg/mlc,agreement
0,China had role in Yukos split-up China lent Ru...,talk.politics.misc,talk.politics.misc,talk.politics.misc,1,1,1,1.000000
1,US bank in $515m SEC settlement Five Bank of A...,talk.politics.misc,talk.politics.misc,talk.politics.misc,1,1,1,1.000000
2,US seeks new $280bn smoker ruling The US Justi...,talk.politics.misc,talk.politics.misc,misc.forsale,1,0,0,0.333333
3,UK bank seals South Korean deal UK-based bank ...,talk.politics.misc,talk.politics.misc,talk.politics.misc,1,1,1,1.000000
4,US adds more jobs than expected The US economy...,talk.politics.misc,talk.politics.misc,talk.politics.misc,1,1,1,1.000000
...,...,...,...,...,...,...,...,...
1023,'Friends fear' with lost mobiles People are be...,sci.electronics,misc.forsale,sci.electronics,0,0,1,0.333333
1024,Podcasts mark rise of DIY radio An Apple iPod ...,comp.sys.mac.hardware,comp.sys.mac.hardware,sci.electronics,1,0,0,0.333333
1025,Microsoft launches its own search Microsoft ha...,comp.os.ms-windows.misc,comp.os.ms-windows.misc,sci.electronics,1,0,0,0.333333
1026,Warnings about junk mail deluge The amount of ...,sci.electronics,sci.electronics,sci.electronics,1,1,1,1.000000


In [10]:
labelers = ['MLC/PCC', 'PCC/YHG', 'YHG/MLC']

# Share of rows on which each pair gave the same label.
n_rows = len(dfn_labeled)
irr_mlc_pcc = dfn_labeled['mlc/pcc'].sum() / n_rows
irr_pcc_yhg = dfn_labeled['pcc/yhg'].sum() / n_rows
irr_yhg_mlc = dfn_labeled['yhg/mlc'].sum() / n_rows

irr = [irr_mlc_pcc, irr_pcc_yhg, irr_yhg_mlc]
for pair_name, share in zip(labelers, irr):
    print('Percent agreement between {}: {:.3f} %'.format(pair_name, share * 100))

Percent agreement between MLC/PCC: 65.856 %
Percent agreement between PCC/YHG: 89.397 %
Percent agreement between YHG/MLC: 63.230 %


In [11]:
# Average of the per-row agreement scores over all rows, shown as a percentage.
mean_row_agreement = dfn_labeled['agreement'].sum() / len(dfn_labeled)
print('Inter-Rater Reliability: {:.3f} %'.format(mean_row_agreement * 100))

Inter-Rater Reliability: 72.827 %


In [12]:
from sklearn.metrics import cohen_kappa_score

# Pull the three label columns once, then score every pair with Cohen's kappa.
labels_mlc = dfn_labeled['category_mlc']
labels_pcc = dfn_labeled['category_pcc']
labels_yhg = dfn_labeled['category_yhg']

cks_mlc_pcc = cohen_kappa_score(labels_mlc, labels_pcc)
cks_pcc_yhg = cohen_kappa_score(labels_yhg, labels_pcc)
cks_yhg_mlc = cohen_kappa_score(labels_mlc, labels_yhg)

cks = [cks_mlc_pcc, cks_pcc_yhg, cks_yhg_mlc]
for pair_name, kappa in zip(labelers, cks):
    print('Cohen Kappa Score between {}: {:.3f} %'.format(pair_name, kappa * 100))

Cohen Kappa Score between MLC/PCC: 52.316 %
Cohen Kappa Score between PCC/YHG: 83.341 %
Cohen Kappa Score between YHG/MLC: 48.760 %


In [13]:
# Unweighted mean of the three pairwise kappa scores, shown as a percentage.
mean_kappa = sum(cks) / len(cks)
print('Average Cohen Kappa Score between Labelers: {:.3f} %'.format(mean_kappa * 100))

Average Cohen Kappa Score between Labelers: 61.472 %
