In [1]:
import pandas as pd
import os
import sys
sys.path.append('../')

fact_checking_file_path = '../data/misinfome/fact_checking_labels.tsv'
misinfome_raw_file_path = '../data/misinfome/joined_tables.tsv'
data = pd.read_csv(fact_checking_file_path,delimiter = '\t') 

In [2]:
# Reference: https://github.com/amirziai/learning/blob/master/statistics/Inter-rater%20agreement%20kappas.ipynb
def fleiss_kappa(ratings, n):
    '''
    Computes the Fleiss' kappa measure for assessing the reliability of 
    agreement between a fixed number n of raters when assigning categorical
    ratings to a number of items.
    
    Args:
        ratings: a list of (item, category)-ratings
        n: number of raters
        k: number of categories
    Returns:
        the Fleiss' kappa score
    
    See also:
        http://en.wikipedia.org/wiki/Fleiss'_kappa
    '''
    items = set()
    categories = set()
    n_ij = {}
    
    for i, c in ratings:
        items.add(i)
        categories.add(c)
        n_ij[(i,c)] = n_ij.get((i,c), 0) + 1
    
    N = len(items)
    
    p_j = dict(((c, sum(n_ij.get((i, c), 0) for i in items) / (1.0 * n * N)) for c in categories))
    P_i = dict(((i, (sum(n_ij.get((i, c), 0) ** 2 for c in categories) - n) / (n * (n - 1.0))) for i in items))

    P_bar = sum(P_i.values()) / (1.0 * N)
    P_e_bar = sum(value ** 2 for value in p_j.values())
    
    kappa = (P_bar - P_e_bar) / (1 - P_e_bar)
    
    return kappa

def prepare_fless_kappa_input(data_frame):
    ratings_df = data_frame.iloc[:,1:-2]
    raters = data.iloc[:,1:-2].columns
    n = len(raters)
    
    ratings = []
    
    for index, row in ratings_df.iterrows():
        for rater in raters:
            ratings.append((index+1,row[rater]))
            
    return ratings, n

In [3]:
ratings, n = prepare_fless_kappa_input(data)
print(fleiss_kappa(ratings,n))

0.505160160216


In [4]:
import numpy as np
system_labels = ['credible','mostly_credible','credible_uncertain','mostly_not_credible','not_credible','not_verifiable']

ratings_df = data.iloc[:,1:-2]
print('Unique fact-checking labels {}'.format(ratings_df.shape[0]))

# drop None's values if len of columns are bigger than 3 
ratings_df.dropna(thresh=3, inplace = True)
print('After dropping None values {}'.format(ratings_df.shape[0]))

mode = ratings_df.mode(axis=1)
ratings_df['majority'] = np.where(mode.isna().any(1), mode[0], 'disagreed')

disagreed_df = ratings_df[ratings_df['majority'] == 'disagreed']
print('Disagreed samples {}'.format(disagreed_df.shape[0]))

# drop NaN values and substract from disagreed samples
ratings_df.dropna(subset=['majority'],inplace=True)
agreed_df = ratings_df.drop(disagreed_df.index)
print('Agreed samples {}'.format(agreed_df.shape[0]))

# fix the label noise due to the coders
agreed_df['majority'] = agreed_df['majority'].str.strip()
agreed_df['majority'] = agreed_df['majority'].str.replace(" ","_")
agreed_df['majority'] = agreed_df['majority'].str.replace("_credible","credible")
agreed_df['majority'] = agreed_df['majority'].str.replace("\xc2\xa0\xc2\xa0credible","credible")
agreed_df['majority'] = agreed_df['majority'].str.replace("notcredible","not_credible")
agreed_df['majority'] = agreed_df['majority'].str.replace("mostlycredible","mostly_credible")
agreed_df['majority'] = agreed_df['majority'].str.replace("mostlycredible","mostly_not_credible")
print(agreed_df.majority.unique())

Unique fact-checking labels 224
After dropping None values 169


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Disagreed samples 2
Agreed samples 167
['not_credible' 'credible_uncertain' 'mostly_credible'
 'mostly_not_credible' 'not_verifiable' 'credible']


In [5]:
fc_maps = dict(zip(data.loc[agreed_df.index]['fact_checking_label'], agreed_df['majority'].values))

In [6]:
def _map_label(fact_checker_label):
    '''
    map to fact-checker labels to system's labels
    '''
    return fc_maps[fact_checker_label] if fact_checker_label in fc_maps else None

In [7]:
from utils import parse_id

misinfome_data = pd.read_csv(misinfome_raw_file_path, sep='\t')
mask = (misinfome_data['lang'] == 'en') & (misinfome_data['source'].str.contains('twitter'))
misinfome_data = misinfome_data[mask]
print('Tweet Dataset before mapping to system labels: {}'.format(misinfome_data.shape[0]))
misinfome_data['normalized_url'] = misinfome_data.url.apply(parse_id)
misinfome_data['expected_credible'] = misinfome_data.factchecker_label.apply(_map_label)
# drop ids having NaN label
misinfome_data.dropna(inplace = True)
misinfome_data.reset_index(drop=True)

Tweet Dataset before mapping to system labels: 1335


Unnamed: 0,id,url,source,headline,lang,body,factchecker_label,normalised_score,normalised_confidence,normalized_url,expected_credible
0,4,https://twitter.com/realDonaldTrump/status/120...,twitter.com,Donald J. Trump on Twitter,en,‚ÄúBreaking News: The President of Ukraine has j...,Not what Zelensky said,0.000000,0.000,1201499577645449216,not_credible
1,51,https://twitter.com/bbcquestiontime/status/119...,twitter.com,BBC Question Time on Twitter,en,‚Äú‚ÄúI‚Äôd like to call out Labour as liars. I am o...,This is not correct,0.000000,0.000,1197651546940608514,not_credible
2,54,https://twitter.com/JeffJacksonNC/status/11964...,twitter.com,Sen. Jeff Jackson on Twitter,en,‚ÄúThe majority party just redrew the congressio...,Half True,-0.277778,1.000,1196439138486108162,credible_uncertain
3,269,https://twitter.com/VP/status/1197173285626105856,twitter.com,Vice President Mike Pence on Twitter,en,‚ÄúHeading to Wisconsin to visit Fincantieri Mar...,Mostly True,0.111111,1.000,1197173285626105856,mostly_credible
4,271,https://twitter.com/realDonaldTrump/status/119...,twitter.com,Donald J. Trump on Twitter,en,‚ÄúToday I opened a major Apple Manufacturing pl...,False.,0.000000,0.000,1197293250115014656,not_credible
5,272,https://twitter.com/realDonaldTrump/status/119...,twitter.com,Donald J. Trump on Twitter,en,‚ÄúToday I opened a major Apple Manufacturing pl...,Not True,0.000000,0.000,1197293250115014656,not_credible
6,273,https://twitter.com/RepMarkMeadows/status/1195...,twitter.com,Mark Meadows on Twitter,en,‚ÄúFor Washington Democrats and pundits who are ...,Mostly False,-0.666667,1.000,1195453661759123457,mostly_not_credible
7,344,https://twitter.com/RepDavidRLewis/status/1192...,twitter.com,Rep. David R. Lewis on Twitter,en,‚ÄúNEWS: Gov. Cooper just vetoed another pay rai...,Half True,-0.277778,1.000,1192821303440551936,credible_uncertain
8,349,https://twitter.com/VoteOjeda2020/status/11801...,twitter.com,"Richard N. Ojeda, II on Twitter",en,"‚ÄúOne in eight American adults, or 12.7 percent...",Half True,-0.277778,1.000,1180126902293540865,credible_uncertain
9,396,https://twitter.com/ShehabKhan/status/88035302...,twitter.com,Shehab Khan on Twitter,en,‚ÄúSome Tory and DUP MP's won't like being named...,"ACCURATE WITH CONSIDERATION: On 28 June 2017, ...",1.000000,1.000,880353020172783617,mostly_credible


In [8]:
# pairs of tweet ids and system's credible labels
system_dataset = misinfome_data[['normalized_url','expected_credible']]
print(system_dataset.groupby(['expected_credible'])['expected_credible'].agg(['count']))
# write system dataset with 6 labels
system_dataset_with6_file_path = '../data/misinfome/system_dataset_with6.csv'
system_dataset.to_csv(system_dataset_with6_file_path, sep='\t')

                     count
expected_credible         
credible                32
credible_uncertain      97
mostly_credible         86
mostly_not_credible    167
not_credible           173
not_verifiable          16


In [9]:
# map mostly not credible to credible uncertain
system_dataset['expected_credible'] = system_dataset['expected_credible'].str.replace("mostly_not_credible","credible_uncertain")
print(system_dataset.groupby(['expected_credible'])['expected_credible'].agg(['count']))
# write system dataset with 5 labels
system_dataset_with5_file_path = '../data/misinfome/system_dataset_with5.csv'
system_dataset.to_csv(system_dataset_with5_file_path, sep='\t')

                    count
expected_credible        
credible               32
credible_uncertain    264
mostly_credible        86
not_credible          173
not_verifiable         16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


NameError: name 'system_dataset_with5_file_path' is not defined