#### This notebook is used to create the excel sheet for annotators to annotate the tweets as accurate/inaccurate and to evaluate the annotations. 

In [3]:
import os
import sys
import pandas as pd
import networkx as nx
from tqdm import tqdm
import matplotlib.pyplot as plt
import math
import numpy as np
import random
# import tweepy
import datetime
from sklearn.metrics import roc_curve,auc
from sklearn import metrics

We read the data and perform the hawkeye algorithm on the data (we process the tweets using the hawkeye) algorithm. This allows us to get the accuracy metric valeus for all the tweets.

In [4]:
notes = pd.read_csv("../data/notes-00000-13-04-21.tsv", sep='\t')
ratings = pd.read_csv("../data/ratings-00000-13-04-21.tsv", sep='\t')
notes = notes[['noteId', 'participantId','tweetId','classification']]
ratings = ratings[['noteId', 'participantId','helpful']]

In [5]:
init_lambda = 0.1
lambda1 = init_lambda
lambda2 = init_lambda
lambda3 = init_lambda
alpha1 = 1
beta1 = 1
gamma1 = 1
delta1 = 1

#initialize goodness (assume every note has highest goodness)
init_goodness = 1
convergence_threshold = 0.001

In [6]:
#do initializations
ratings['goodness'] = [init_goodness]*len(ratings)
ratings['rating'] = ratings.apply(lambda x : 1 if x['helpful']==1 else -1,axis=1)
notes['goodness'] = [init_goodness]*len(notes)
notes['verdict'] = notes.apply(lambda x : 1 if x['classification']=='NOT_MISLEADING' else -1,axis=1)

In [7]:
no_of_rating_participants = len(set(ratings['participantId']))
no_of_writing_participants = len(set(notes['participantId']))
no_of_tweets = len(set(notes['tweetId']))
no_of_notes = len(set(notes['noteId']))
mu_r = 1*no_of_rating_participants/no_of_rating_participants
mu_w = 1*no_of_writing_participants/no_of_writing_participants
mu_t = 1*no_of_tweets/no_of_tweets                                       
mu_g = 1*no_of_notes/no_of_notes

In [8]:
print("at the beginning : ",datetime.datetime.now().strftime("%H:%M:%S"))

#Fairness of user in rating notes
ratings['score_goodness_difference_metric'] = 1-((ratings['rating']-ratings['goodness']).abs()/2)
ratings['rating_fairness'] = (ratings.groupby(['participantId'])['score_goodness_difference_metric'].transform("sum") + alpha1*mu_r)/(ratings.groupby(['participantId'])['participantId'].transform("count") + alpha1)

#Fairness of user in writing notes
notes['writing_fairness'] = (notes.groupby(['participantId'])['goodness'].transform("sum") + beta1*mu_w)/(notes.groupby(['participantId'])['participantId'].transform("count") + beta1)

#Accuracy of Tweet
notes['weighted_goodness'] = notes['goodness']*notes['verdict']
notes['tweet_accuracy'] = (notes.groupby(['tweetId'])['weighted_goodness'].transform("sum") + delta1*mu_t)/(notes.groupby(['tweetId'])['tweetId'].transform("count") + delta1)

#Goodness of notes
ratings['weighted_rating_fairness'] = ratings['rating_fairness']*ratings['rating']
ratings['goodness_term1'] = (ratings.groupby(['noteId'])['weighted_rating_fairness'].transform("sum") + gamma1*mu_g)/(ratings.groupby(['noteId'])['noteId'].transform("count") + gamma1)
notes['goodness_term1'] = lambda1*notes.apply(lambda x: 1 if len(ratings.loc[ratings['noteId'] == x['noteId']])==0 else ratings.loc[ratings['noteId'] == x['noteId']].iloc[0]['goodness_term1'],axis=1)
notes['goodness_term3'] = lambda3*(1-(notes['tweet_accuracy']-notes['verdict']).abs())
notes['goodness'] = 1/3 * (notes['goodness_term1'] + lambda2*notes['writing_fairness'] + notes['goodness_term3'])

#IMPORTANT : Update goodness ratings df
ratings['goodness'] = ratings.apply(lambda x: notes.loc[notes['noteId'] == x['noteId']].iloc[0]['goodness'],axis=1)

print("at the end : ",datetime.datetime.now().strftime("%H:%M:%S"))

at the beginning :  22:30:35
at the end :  22:31:08


In [9]:
times = []
errors = []
t = 1
error = math.inf

print("at the beginning : ",datetime.datetime.now().strftime("%H:%M:%S"))
while(error>convergence_threshold):
   
    old_rating_fairness_values = np.array(ratings['rating_fairness'])
    old_writing_fairness_values = np.array(notes['writing_fairness'])
    old_tweet_accuracy_values = np.array(notes['tweet_accuracy'])
    old_goodness_values = np.array(notes['goodness'])

    #Fairness of user in rating notes
    ratings['score_goodness_difference_metric'] = 1-((ratings['rating']-ratings['goodness']).abs()/2)
    ratings['rating_fairness'] = (ratings.groupby(['participantId'])['score_goodness_difference_metric'].transform("sum") + alpha1*mu_r)/(ratings.groupby(['participantId'])['participantId'].transform("count") + alpha1)
    
    #Fairness of user in writing notes
    notes['writing_fairness'] = (notes.groupby(['participantId'])['goodness'].transform("sum") + beta1*mu_w)/(notes.groupby(['participantId'])['participantId'].transform("count") + beta1)
    
    #Accuracy of Tweet
    notes['weighted_goodness'] = notes['goodness']*notes['verdict']
    notes['tweet_accuracy'] = (notes.groupby(['tweetId'])['weighted_goodness'].transform("sum") + delta1*mu_t)/(notes.groupby(['tweetId'])['tweetId'].transform("count") + delta1)
    
    #Goodness of notes
    ratings['weighted_rating_fairness'] = ratings['rating_fairness']*ratings['rating']
    ratings['goodness_term1'] = (ratings.groupby(['noteId'])['weighted_rating_fairness'].transform("sum") + gamma1*mu_g)/(ratings.groupby(['noteId'])['noteId'].transform("count") + gamma1)
    notes['goodness_term1'] = lambda1*notes.apply(lambda x: 1 if len(ratings.loc[ratings['noteId'] == x['noteId']])==0 else ratings.loc[ratings['noteId'] == x['noteId']].iloc[0]['goodness_term1'],axis=1)
    notes['goodness_term3'] = lambda3*(1-(notes['tweet_accuracy']-notes['verdict']).abs())
    notes['goodness'] = 1/3 * (notes['goodness_term1'] + lambda2*notes['writing_fairness'] + notes['goodness_term3'])
    
    #IMPORTANT : Update goodness ratings df
    ratings['goodness'] = ratings.apply(lambda x: notes.loc[notes['noteId'] == x['noteId']].iloc[0]['goodness'],axis=1)

    new_rating_fairness_values = np.array(ratings['rating_fairness'])
    new_writing_fairness_values = np.array(notes['writing_fairness'])
    new_tweet_accuracy_values = np.array(notes['tweet_accuracy'])
    new_goodness_values = np.array(notes['goodness'])

    rating_fairness_error = np.sum(np.absolute((np.subtract(old_rating_fairness_values,new_rating_fairness_values))))
    writing_fairness_error = np.sum(np.absolute(np.subtract(old_writing_fairness_values,new_writing_fairness_values)))
    tweet_accuracy_error = np.sum(np.absolute(np.subtract(old_tweet_accuracy_values,new_tweet_accuracy_values)))
    goodness_error = np.sum(np.absolute(np.subtract(old_goodness_values,new_goodness_values)))

    error = max(rating_fairness_error,writing_fairness_error,tweet_accuracy_error,goodness_error)
    print(t," : ",error)
    times.append(t)
    errors.append(error)
    t += 1
    
print("at the end : ",datetime.datetime.now().strftime("%H:%M:%S"))

at the beginning :  22:31:13
1  :  4942.80289433259
2  :  237.4269628442238
3  :  11.77233938098382
4  :  0.5936264540711889
5  :  0.031199678504140405
6  :  0.0017139285413503291
7  :  9.447260146477898e-05
at the end :  22:35:17


In [11]:
notes.sort_values(by=['tweet_accuracy'],ascending=True).head(10)

Unnamed: 0,noteId,participantId,tweetId,classification,goodness,verdict,writing_fairness,weighted_goodness,tweet_accuracy,goodness_term1,goodness_term3
894,1354875505246351360,7DCE02372757684773156C51BD8A856B,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.005638,-1,0.204779,-0.005638,0.005741,-0.002989,-0.000574
4657,1354862350009069568,52E7E8AC27398A29A5D8B54CD24439F9,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.015433,-1,0.275441,-0.015433,0.005741,0.01933,-0.000574
1609,1354893429977849861,8F404408FB172B6104E0D8003C9B129C,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.021244,-1,0.353497,-0.021244,0.005741,0.028956,-0.000574
4870,1354855483631423493,0CC2FBF56892C9911DFAFBA2C4FF2E4C,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.008414,-1,0.047435,-0.008414,0.005741,0.021074,-0.000574
1431,1357798998405447682,02C657744DA50BCE2A90C2D4AB0CDC6E,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.017171,-1,0.14303,-0.017171,0.005741,0.037783,-0.000574
981,1354875060885090309,DD591AE4F8045D93FFE2C45C394AC8DE,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.015138,-1,0.205696,-0.015138,0.005741,0.025419,-0.000574
1590,1354868783840595969,9EE481CF8EDD9F2F08B20C7ED2C3355D,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.016435,-1,0.211177,-0.016435,0.005741,0.028762,-0.000574
3021,1354878458753654787,0CBD8A826C40736F565B0332CD3FD3EB,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.006203,-1,0.213058,-0.006203,0.005741,-0.002123,-0.000574
2039,1354874966781657090,833DB7A8DD53BB58009275B560A225CA,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.008363,-1,0.07334,-0.008363,0.005741,0.01833,-0.000574
3152,1354863653435002881,A94B6B47281314FC0709C607970487B2,1354848253729234944,MISINFORMED_OR_POTENTIALLY_MISLEADING,0.011945,-1,0.136535,-0.011945,0.005741,0.022755,-0.000574


In [12]:
top = list(notes.sort_values(by=['tweet_accuracy'],ascending=False)[:100]['tweetId'])
bottom = list(notes.sort_values(by=['tweet_accuracy'],ascending=True)[:666]['tweetId'])
selected = set(top + bottom)

In [13]:
next_top = list(notes.sort_values(by=['tweet_accuracy'],ascending=False)[100:250]['tweetId'])
next_bottom = list(notes.sort_values(by=['tweet_accuracy'],ascending=True)[666:1145]['tweetId'])
next_selected = set(next_top+next_bottom)

In [14]:
from collections import OrderedDict
top_ordered = list(OrderedDict.fromkeys(top))
bottom_ordered = list(OrderedDict.fromkeys(bottom))

In [15]:
from collections import OrderedDict
next_top_ordered = list(OrderedDict.fromkeys(next_top))
next_bottom_ordered = list(OrderedDict.fromkeys(next_bottom))

### Annotation Instruction and explanations : 
    
#### Description of columns :

tweet_favorite_count : Indicates approximately how many times this Tweet has been liked by Twitter users..

tweet_possibly_sensitive : An indicator that the URL contained in the Tweet may contain content or media identified as sensitive content.

tweet_retweet_count : Number of times this Tweet has been retweeted.

tweet_text : The actual UTF-8 text of the status update. 

tweet_url : URL of the tweet.

tweet_user_description : The bio of the twitter user.

tweet_user_followers_count : The number of followers this account currently has.

tweet_user_name : The name of the user, as they’ve defined it..

tweet_user_protected : Indicates that this user has chosen to protect their Tweets

tweet_user_screen_name : The screen name, handle, or alias that this user identifies themselves with.

tweet_user_verified : Indicates that the user has a verified account.

tweet_user_withheld_in_countries : When present, indicates a list of uppercase two-letter country codes this content is withheld from.

#### What could each of these features say about the tweet being misleading/spreading misinformation/not being accurate?

tweet_favorite_count and tweet_retweet_count: A misleading tweet from a visibly non reliable source could have lesser likes/retweets. But at the same time a misleading tweet from a popular account can have many likes/retweets. Similarly, very accurate tweets can have a high number of likes/retweets.  

tweet_possibly_sensitive : At times tweets spreading misinfo can have sensitive media content. 

tweet_user_followers_count : Does a popular account that spreads misinfo have a lot of followers? DO fact checking websites have a lot of followers?

tweet_user_name, tweet_user_screen_name or tweet_user_verified: The identity of the user (the name and the handle) and the verification status of the user can sometimes be enough to infer if the account could be a habitual/potential misinformation spreader or not. eg. The CDC account might not spread misinfo, but a bot account might be!

tweet_user_protected : A user might protect thier tweets for variosu reasons. Perhaps, they had previously shared misinfo and are afraid of being reported? Perhaps, it could just be out of privacy concerns.

tweet_user_withheld_in_countries : If the user's tweets are withheld in some countries, what could the reasons be? Is the user a habitual offender when it comes to hate speech, misinformation or abusive content? Or perhaps the user is whistleblower/journalist/activist, whose voice the said country wants to suppress?

Instructions to annotate:

For each of the tweets, consider as many features as you think are necessary to infer, subjectively, if the tweet is accurate or potentially misleading. Sometimes, it could be just easier to go to the URL of the tweet, assess the tweet on the Twitter website and make a decision. 

##### Once, you’ve made the decision, in the column ‘HUMAN ANNOTATION’,  add a ‘1’ if you think the tweet is ‘accurate’ or non-misleading. Else, add ‘0’.

Special Cases :

a.	If the URL field is empty or missing, you can go to the URL by using the following string : https://twitter.com/i/web/status/tweet_id, where tweet_id is the unique ID of the Tweet.

b.	If all of the fields of the Tweet are missing, it was probably deleted. So you can annotate it with ‘0’


We collect the above information for all tweets make a CSV file. _'tweet_groudtruth_annonation.csv'_ denotes the first set of tweets to be annotated (200 tweets) and _'tweet_groudtruth_annonation_next.csv'_ denotes the next set of the tweets (300 tweets) to be annotated. 

In [None]:
def get_tweet_info(api,tweet_id):
    
    tweet = api.get_status(tweet_id)
    tweet_info = {}
    tweet_info['tweet_id'] = str(tweet_id)+'id'
    tweet_info['tweet_text'] = tweet.text
    tweet_info['tweet_user_name'] = tweet.user.name
    tweet_info['tweet_user_description'] = tweet.user.description
    tweet_info['tweet_user_screen_name'] = tweet.user.screen_name
    tweet_info['tweet_user_verified'] = tweet.user.verified
    tweet_info['tweet_user_followers_count'] = tweet.user.followers_count
    tweet_info['tweet_user_withheld_in_countries'] = tweet.user.withheld_in_countries
    tweet_info['tweet_user_protected'] = tweet.user.protected
    try: 
        tweet_info['tweet_url'] = tweet.entities['urls'][0]['expanded_url']
    except IndexError as e:
        tweet_info['tweet_url'] = None   
    tweet_info['tweet_retweet_count'] = tweet.retweet_count
    tweet_info['tweet_favorite_count'] = tweet.favorite_count
    try: 
        tweet_info['tweet_possibly_sensitive'] = tweet.possibly_sensitive
    except AttributeError as e:
        tweet_info['tweet_possibly_sensitive'] = None   
    return tweet_info

def set_tweet_info_as_null(tweet_id):
    
    tweet_info = {}
    tweet_info['tweet_id'] = str(tweet_id)+'id'
    tweet_info['tweet_text'] = None
    tweet_info['tweet_user_name'] = None
    tweet_info['tweet_user_description'] = None
    tweet_info['tweet_user_screen_name'] = None
    tweet_info['tweet_user_verified'] = None
    tweet_info['tweet_user_followers_count'] = None
    tweet_info['tweet_user_withheld_in_countries'] = None
    tweet_info['tweet_user_protected'] = None
    tweet_info['tweet_url'] = None
    tweet_info['tweet_retweet_count'] = None
    tweet_info['tweet_favorite_count'] = None
    tweet_info['tweet_possibly_sensitive'] = None
    return tweet_info

In [None]:
CONSUMER_KEY = 'rZlo0NXfEvgpGuSIxT2v6AsBl'
CONSUMER_SECRET = 'jnqdTr8y4LpYnor8l22y1gqOS6Z7y1mkMbsvXuuxG8wbJuYOAR'
OAUTH_TOKEN = '583637312-v4WkMLaGqkwYUKKLQsrocp8tarCOM6XP7n4VMxuI'
OAUTH_TOKEN_SECRET = 'c2KoPv7DCRF07aBWTRPUdexWHKioiYrpQfu47aRyppnjp'

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
api = tweepy.API(auth)

all_tweets_info = []
for tweet_id in selected:
    try:
        tweet_info = get_tweet_info(api,tweet_id)
    except tweepy.TweepError as e:
        if e.args[0][0]['code'] == 144:
            print(tweet_id)
            tweet_info = set_tweet_info_as_null(tweet_id)
    all_tweets_info.append(tweet_info)
    
tweet_info_df = pd.DataFrame(all_tweets_info,columns=['tweet_id','tweet_text','tweet_user_name','tweet_user_description','tweet_user_screen_name','tweet_user_verified','tweet_user_followers_count','tweet_user_verified','tweet_user_withheld_in_countries','tweet_user_protected','tweet_url','tweet_retweet_count','tweet_favorite_count','tweet_possibly_sensitive'])

In [None]:
tweet_info_df.to_csv('tweet_groudtruth_annonation.csv',encoding='utf-8-sig')

In [None]:
CONSUMER_KEY = 'rZlo0NXfEvgpGuSIxT2v6AsBl'
CONSUMER_SECRET = 'jnqdTr8y4LpYnor8l22y1gqOS6Z7y1mkMbsvXuuxG8wbJuYOAR'
OAUTH_TOKEN = '583637312-v4WkMLaGqkwYUKKLQsrocp8tarCOM6XP7n4VMxuI'
OAUTH_TOKEN_SECRET = 'c2KoPv7DCRF07aBWTRPUdexWHKioiYrpQfu47aRyppnjp'

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
api = tweepy.API(auth)

all_tweets_info = []
for tweet_id in next_selected:
    try:
        tweet_info = get_tweet_info(api,tweet_id)
    except tweepy.TweepError as e:
        if e.args[0][0]['code'] == 144:
            print(tweet_id)
            tweet_info = set_tweet_info_as_null(tweet_id)
    all_tweets_info.append(tweet_info)
    
tweet_info_df_next = pd.DataFrame(all_tweets_info,columns=['tweet_id','tweet_text','tweet_user_name','tweet_user_description','tweet_user_screen_name','tweet_user_verified','tweet_user_followers_count','tweet_user_verified','tweet_user_withheld_in_countries','tweet_user_protected','tweet_url','tweet_retweet_count','tweet_favorite_count','tweet_possibly_sensitive'])

In [None]:
tweet_info_df_next_shuffled = tweet_info_df_next.sample(frac=1)
tweet_info_df_next_shuffled.to_csv('tweet_groudtruth_annonation_next.csv',encoding='utf-8-sig')

#### At this point, the annotators annotate the sheets. We  put 0 for no unanimous decision by annotators. After the annotation is done we read the CSV again. 

In [40]:
tweet_annotation_df_next = pd.read_csv('tweet_groudtruth_annonation_next.csv',encoding='utf-8-sig')
tweet_annotation_df = pd.read_csv('tweet_groudtruth_annonation.csv',encoding='utf-8-sig')
tweet_annotation = pd.concat([tweet_annotation_df, tweet_annotation_df_next])

We  employ  a  unsupervised method to detect misinformation tweets for theHawkEye system. For each tweet, from the notes written for the tweet, we select notes having a credibility of at least 0.02. Among these  notes,  if  the  number  of  notes  that  labeled  the  tweet  as misleading are more than or equal to the number of notes thatlabeled  the  tweet  as  not  misleading,  HawkEye  classifies  the tweet as misleading. We compare these Hawkeye derived labels with our annotators labels and report the accruacy, precision and recall. 

In [41]:
from sklearn.metrics import classification_report
y_true,y_pred = [],[]
for idx,row in tweet_annotation.iterrows():
    
    y_true.append(row['human_annotation'])
    notesForTweet = notes.loc[notes['tweetId'] == int(row['tweet_id'][:-2])]
    notesForTweetCredible = notesForTweet[notesForTweet['goodness'] >= 0.02]
    
    scoredNotes_misleading = notesForTweetCredible.loc[notesForTweetCredible['classification'] == 'MISINFORMED_OR_POTENTIALLY_MISLEADING']
    scoredNotes_notmisleading = notesForTweetCredible.loc[notesForTweetCredible['classification'] == 'NOT_MISLEADING']
#     print("scoredNotes_misleading = ",scoredNotes_misleading)
#     print("scoredNotes_notmisleading = ",scoredNotes_notmisleading)
#     break
    if len(scoredNotes_misleading) >= len(scoredNotes_notmisleading): # >= scoredNotesNotMisleading['helpful'].sum():
        y_pred.append(0)
    else:
        y_pred.append(1)

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.94      0.64       124
           1       0.97      0.68      0.80       376

    accuracy                           0.74       500
   macro avg       0.73      0.81      0.72       500
weighted avg       0.85      0.74      0.76       500



#### We confirm the annotator agreement. 

In [57]:
import statsmodels
from statsmodels.stats.inter_rater import fleiss_kappa

annotations_array = np.array(tweet_annotation[['annotation_rohit','annotation_mohit','annotation_soyoung']])
#fleiss_kappa(annotations_array)
import krippendorff
kappa = krippendorff.alpha(annotations_array)
print(kappa)

0.005681407323895105
