In [9]:
import pandas as pd
import numpy as np
import sklearn as sk
from collections import Counter
from sklearn.metrics import classification_report
import pickle
import math
from tqdm import tqdm

In [5]:
# Tab-separated note and rating tables, read from the sibling data directory.
notes = pd.read_csv("..//data//notes-00000-13-04-21.tsv", sep="\t")
ratings = pd.read_csv("..//data//ratings-00000-13-04-21.tsv", sep="\t")

# Previously pickled per-tweet vectors; later cells iterate over this object
# with `.items()`.
# CAUTION: unpickling can execute arbitrary code, so only load a pickle file
# that comes from a trusted source.
with open("results/hawkeye_metric_tweet_vectors_supervised.pickle", "rb") as vector_file:
    tweet_vectors = pickle.load(vector_file)

In [6]:
# Build a four-element baseline feature vector for every key of
# `tweet_vectors`:
#   [number of notes classified as misleading,
#    number of notes classified as not misleading,
#    mean helpfulness ratio of the misleading notes,
#    mean helpfulness ratio of the not-misleading notes]
# A note's helpfulness ratio is sum(helpful) / (number of joined ratings whose
# `helpful` value is not null).
MISLEADING = 'MISINFORMED_OR_POTENTIALLY_MISLEADING'
NOT_MISLEADING = 'NOT_MISLEADING'

# Indexing the ratings table does not depend on the tweet, so do it once
# instead of once per loop iteration.
ratings_by_note = ratings.set_index('noteId')

tweet_vectors_baseline = {}
# Only the keys are needed here; the stored vector itself is not used.
for tweetId, _vector in tqdm(tweet_vectors.items()):

    notesForTweet = notes.loc[notes['tweetId'] == tweetId]

    # Count notes per classification; a class with no notes counts as 0.
    value_counts = notesForTweet['classification'].value_counts()
    num_misleading = value_counts.get(MISLEADING, 0)
    num_nonmisleading = value_counts.get(NOT_MISLEADING, 0)

    # Left join: one row per (note, rating); a note without any rating keeps a
    # single row whose rating columns are null.
    ratingsWithNotesForTweet = notesForTweet.set_index('noteId').join(
        ratings_by_note, lsuffix="_note", rsuffix="_rating", how='left')
    # 1 for a row carrying a rating, 0 for the null placeholder row. The
    # vectorised form avoids a Python call per row and is well defined even
    # when the tweet has no notes at all.
    ratingsWithNotesForTweet['numRatings'] = ratingsWithNotesForTweet['helpful'].notna().astype(int)
    scoredNotes = ratingsWithNotesForTweet.groupby(['noteId']).agg(
        {'helpful': 'sum', 'numRatings': 'sum', 'classification': 'first'})

    # Notes without ratings yield 0/0 = NaN here; `mean()` below skips them.
    scoredNotes['helpfulnessRatio'] = scoredNotes['helpful'] / scoredNotes['numRatings']

    scoredNotes_misleading = scoredNotes.loc[scoredNotes['classification'] == MISLEADING]
    misleading_hr_mean = scoredNotes_misleading['helpfulnessRatio'].mean()

    scoredNotes_notmisleading = scoredNotes.loc[scoredNotes['classification'] == NOT_MISLEADING]
    notmisleading_hr_mean = scoredNotes_notmisleading['helpfulnessRatio'].mean()

    # The mean is NaN when the class has no notes or none of its notes has a
    # rating; such a class contributes 0 to the feature vector.
    if math.isnan(misleading_hr_mean):
        misleading_hr_mean = 0
    if math.isnan(notmisleading_hr_mean):
        notmisleading_hr_mean = 0

    tweet_vectors_baseline[tweetId] = [num_misleading, num_nonmisleading, misleading_hr_mean, notmisleading_hr_mean]

100%|██████████| 4900/4900 [01:01<00:00, 79.27it/s]


In [7]:
# The annotations are stored in two CSV files. "utf-8-sig" decodes UTF-8 and
# drops a leading byte-order mark if the file has one.
tweet_annotation_df = pd.read_csv(
    "tweet_groudtruth_annonation.csv",
    encoding="utf-8-sig",
)
tweet_annotation_df_next = pd.read_csv(
    "tweet_groudtruth_annonation_next.csv",
    encoding="utf-8-sig",
)

# Stack the first file on top of the follow-up file. Each frame keeps its own
# row labels (no `ignore_index`), so index values can repeat in the result.
tweet_annotation = pd.concat(
    [tweet_annotation_df, tweet_annotation_df_next]
)

In [10]:
# Rule-based baseline evaluated against the annotated tweets: for each tweet,
# keep only the notes whose helpfulness ratio reaches the threshold, then
# compare how many of the kept notes are classified misleading versus not
# misleading.

# Minimum helpfulness ratio (sum(helpful) / number of ratings) a note needs in
# order to be counted.
HELPFULNESS_THRESHOLD = 0.84

# Indexing the ratings table does not depend on the tweet, so do it once
# instead of once per annotated row.
ratings_by_note = ratings.set_index('noteId')

y_true, y_pred = [], []
for _idx, row in tweet_annotation.iterrows():

    y_true.append(row['human_annotation'])
    # `tweet_id` is sliced as a string and its last two characters are dropped
    # before the integer conversion -- presumably a fixed two-character suffix
    # in the CSV; TODO confirm against the annotation files.
    notesForTweet = notes.loc[notes['tweetId'] == int(row['tweet_id'][:-2])]

    # Left join: one row per (note, rating); a note without any rating keeps a
    # single row whose rating columns are null.
    ratingsWithNotesForTweet = notesForTweet.set_index('noteId').join(
        ratings_by_note, lsuffix="_note", rsuffix="_rating", how='left')
    # 1 for a row carrying a rating, 0 for the null placeholder row. The
    # vectorised form avoids a Python call per row and is well defined even
    # when the tweet has no notes at all.
    ratingsWithNotesForTweet['numRatings'] = ratingsWithNotesForTweet['helpful'].notna().astype(int)

    scoredNotes = ratingsWithNotesForTweet.groupby(['noteId']).agg(
        {'helpful': 'sum', 'numRatings': 'sum', 'classification': 'first'})
    scoredNotes['helpfulnessRatio'] = scoredNotes['helpful'] / scoredNotes['numRatings']
    # Notes without ratings have a NaN ratio and are dropped by this filter too.
    scoredNotes = scoredNotes[scoredNotes['helpfulnessRatio'] >= HELPFULNESS_THRESHOLD]

    scoredNotes_misleading = scoredNotes.loc[scoredNotes['classification'] == 'MISINFORMED_OR_POTENTIALLY_MISLEADING']
    scoredNotes_notmisleading = scoredNotes.loc[scoredNotes['classification'] == 'NOT_MISLEADING']

    # Ties -- including tweets with no note above the threshold -- fall into
    # the first branch and are predicted as 0.
    # NOTE(review): 0 is predicted whenever misleading notes are at least as
    # numerous as not-misleading ones. Confirm that this matches the encoding
    # of `human_annotation`; if 1 denotes "misleading" there, the two labels
    # are swapped.
    if len(scoredNotes_misleading) >= len(scoredNotes_notmisleading):
        y_pred.append(0)
    else:
        y_pred.append(1)

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.25      0.99      0.40       124
           1       0.75      0.01      0.02       376

    accuracy                           0.25       500
   macro avg       0.50      0.50      0.21       500
weighted avg       0.63      0.25      0.11       500

