In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import pickle
import math

In [2]:
# Cached baseline feature vectors; a later cell looks them up by integer tweet id.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only open
# pickles that this project produced itself.
with open('results/tweet_vectors__baseline_supervised.pickle', 'rb') as handle:
    tweet_vectors_baseline = pickle.load(handle)

# Read the two ground-truth annotation CSVs (the "_next" file first).
# 'utf-8-sig' drops a leading byte-order mark if the file has one.
tweet_annotation_df_next, tweet_annotation_df = (
    pd.read_csv(csv_name, encoding='utf-8-sig')
    for csv_name in ('tweet_groudtruth_annonation_next.csv', 'tweet_groudtruth_annonation.csv')
)

# Stack both files, rows of the non-"_next" file on top. The row labels are not
# renumbered, so index values can repeat in the combined frame.
tweet_annotation = pd.concat(objs=[tweet_annotation_df, tweet_annotation_df_next])

In [None]:
# Build the 4-number baseline feature vector of every tweet:
#   [number of notes classifying it as misleading,
#    number of notes classifying it as not misleading,
#    mean helpfulness ratio over the "misleading" notes,
#    mean helpfulness ratio over the "not misleading" notes]
# A note's helpfulness ratio is (sum of its `helpful` rating values) / (number of ratings it received).
# NOTE(review): `tqdm`, `tweet_vectors`, `notes` and `ratings` are not defined in the
# cells visible here; this cell presumably needs them in the kernel already - confirm.

# The ratings table is the same for every tweet, so index it by noteId once
# instead of rebuilding that index on each loop iteration.
ratings_by_note = ratings.set_index('noteId')

tweet_vectors_baseline = {}
for tweetId in tqdm(tweet_vectors.keys()):

    notesForTweet = notes.loc[notes['tweetId'] == tweetId]

    # Notes per classification; a class with no notes counts as 0.
    value_counts = notesForTweet['classification'].value_counts()
    num_misleading = value_counts.get('MISINFORMED_OR_POTENTIALLY_MISLEADING', 0)
    num_nonmisleading = value_counts.get('NOT_MISLEADING', 0)

    # Left join: one row per (note, rating); a note nobody rated keeps a single
    # row whose rating columns are NaN.
    ratingsWithNotesForTweet = notesForTweet.set_index('noteId').join(
        ratings_by_note, lsuffix="_note", rsuffix="_rating", how='left'
    )
    # 1 for a real rating row, 0 for the NaN placeholder of an unrated note.
    # Vectorised on purpose: a row-wise apply(axis=1) is slow and does not return
    # a Series when the frame is empty (tweet without notes).
    ratingsWithNotesForTweet['numRatings'] = ratingsWithNotesForTweet['helpful'].notna().astype(int)

    scoredNotes = ratingsWithNotesForTweet.groupby(['noteId']).agg(
        {'helpful': 'sum', 'numRatings': 'sum', 'classification': 'first'}
    )
    # Unrated notes give 0/0 = NaN here; .mean() below skips NaN values.
    scoredNotes['helpfulnessRatio'] = scoredNotes['helpful'] / scoredNotes['numRatings']

    scoredNotes_misleading = scoredNotes.loc[scoredNotes['classification'] == 'MISINFORMED_OR_POTENTIALLY_MISLEADING']
    misleading_hr_mean = scoredNotes_misleading['helpfulnessRatio'].mean()

    scoredNotes_notmisleading = scoredNotes.loc[scoredNotes['classification'] == 'NOT_MISLEADING']
    notmisleading_hr_mean = scoredNotes_notmisleading['helpfulnessRatio'].mean()

    # The mean is NaN when a class has no rated notes; store 0 instead.
    if math.isnan(misleading_hr_mean):
        misleading_hr_mean = 0
    if math.isnan(notmisleading_hr_mean):
        notmisleading_hr_mean = 0

    tweet_vectors_baseline[tweetId] = [num_misleading, num_nonmisleading, misleading_hr_mean, notmisleading_hr_mean]

# Write to the exact path the loading cell opens (note the double underscore in
# 'tweet_vectors__baseline'); with a different file name the recomputed vectors
# would never be picked up by the classifier cells.
with open('results/tweet_vectors__baseline_supervised.pickle', 'wb') as handle:
    pickle.dump(tweet_vectors_baseline, handle)

In [36]:
# Distinct annotated tweet ids (raw string form, exactly as stored in the CSV).
annotated_tweets = list(set(tweet_annotation['tweet_id']))

# Feature matrix: one cached baseline vector per annotation row, in row order.
# tweet_id is text; its last two characters are cut off before converting to int.
# NOTE(review): presumably those two characters are a non-numeric suffix - confirm against the CSV.
X = np.array([
    tweet_vectors_baseline[int(raw_tweet_id[:-2])]
    for raw_tweet_id in tweet_annotation['tweet_id']
])

# Label vector: the human annotation of each row, in the same order as X.
y = np.array(tweet_annotation['human_annotation'].tolist())
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) #Train test split

In [37]:
from sklearn.model_selection import cross_validate,cross_val_score

# 10-fold cross-validation of a random forest (fixed seed, otherwise default
# settings) on the baseline features, using all available cores.
# 'precision' and 'recall' are the binary scorers; 'f1_macro' averages the F1 of both classes.
scoring = ['accuracy', 'precision', 'recall', 'f1_macro', 'roc_auc']
clf = RandomForestClassifier(random_state=0)
scores = cross_validate(
    estimator=clf,
    X=X,
    y=y,
    cv=10,
    scoring=scoring,
    n_jobs=-1,
    verbose=2,
)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [38]:
# Print the mean over folds of every array in `scores` (timings and test metrics alike).
for metric_name, fold_values in scores.items():
    print(f"Average {metric_name} = {fold_values.mean()!s}")

Average fit_time = 0.281672739982605
Average score_time = 0.029343795776367188
Average test_accuracy = 0.782
Average test_precision = 0.8496319150266519
Average test_recall = 0.8645803698435277
Average test_f1_macro = 0.6999423458148655
Average test_roc_auc = 0.843904101469891


In [58]:
# Rule baseline 1: predict 0 when the notes classifying the tweet as misleading
# collected at least as many helpful ratings as the notes classifying it as not
# misleading; otherwise predict 1.
# The comparison is ">=", so ties (including tweets with no helpful ratings on
# either side) are predicted as 0.

# The ratings table does not change between tweets: index it by noteId once
# rather than on every loop iteration.
ratings_by_note = ratings.set_index('noteId')

y_true, y_pred = [], []
for idx, row in tweet_annotation.iterrows():

    y_true.append(row['human_annotation'])
    # tweet_id is text; its last two characters are cut off before converting to int.
    notesForTweet = notes.loc[notes['tweetId'] == int(row['tweet_id'][:-2])]

    # Left join: one row per (note, rating); unrated notes keep one row with NaN ratings.
    ratingsWithNotesForTweet = notesForTweet.set_index('noteId').join(
        ratings_by_note, lsuffix="_note", rsuffix="_rating", how='left'
    )

    note_class = ratingsWithNotesForTweet['classification']
    helpful = ratingsWithNotesForTweet['helpful']

    # Total helpful ratings per side. Summing the column directly gives the same
    # total as summing per note first and then adding the notes up; NaN
    # (no rating) is skipped by .sum().
    helpful_misleading = helpful[note_class == 'MISINFORMED_OR_POTENTIALLY_MISLEADING'].sum()
    helpful_not_misleading = helpful[note_class == 'NOT_MISLEADING'].sum()

    y_pred.append(0 if helpful_misleading >= helpful_not_misleading else 1)

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.24      0.97      0.39       124
           1       0.33      0.01      0.01       376

    accuracy                           0.24       500
   macro avg       0.29      0.49      0.20       500
weighted avg       0.31      0.24      0.10       500



In [59]:
# Rule baseline 2: pool all ratings given to the notes that classify the tweet
# as misleading. If at least 84% of those ratings are helpful, predict 0
# (mark the tweet as inaccurate/misleading); otherwise predict 1 (not misleading).
# A tweet whose "misleading" notes received no ratings at all is predicted as 1.

# Minimum share of helpful ratings required to predict 0.
HELPFUL_SHARE_THRESHOLD = 0.84

# The ratings table does not change between tweets: index it by noteId once
# rather than on every loop iteration.
ratings_by_note = ratings.set_index('noteId')

y_true, y_pred = [], []
for idx, row in tweet_annotation.iterrows():

    y_true.append(row['human_annotation'])
    # tweet_id is text; its last two characters are cut off before converting to int.
    notesForTweet = notes.loc[notes['tweetId'] == int(row['tweet_id'][:-2])]

    # Left join: one row per (note, rating); unrated notes keep one row with NaN ratings.
    ratingsWithNotesForTweet = notesForTweet.set_index('noteId').join(
        ratings_by_note, lsuffix="_note", rsuffix="_rating", how='left'
    )

    is_misleading_note = ratingsWithNotesForTweet['classification'] == 'MISINFORMED_OR_POTENTIALLY_MISLEADING'
    misleading_ratings = ratingsWithNotesForTweet.loc[is_misleading_note, 'helpful']

    num_ratings = misleading_ratings.count()   # non-NaN entries = actual ratings
    num_helpful = misleading_ratings.sum()     # NaN entries are skipped

    # Guard the no-ratings case explicitly instead of relying on 0/0 = NaN
    # comparing as False; the resulting prediction (1) is unchanged.
    if num_ratings > 0 and num_helpful / num_ratings >= HELPFUL_SHARE_THRESHOLD:
        y_pred.append(0)
    else:
        y_pred.append(1)

print(classification_report(y_true, y_pred))









              precision    recall  f1-score   support

           0       0.59      0.32      0.42       124
           1       0.81      0.93      0.86       376

    accuracy                           0.78       500
   macro avg       0.70      0.62      0.64       500
weighted avg       0.75      0.78      0.75       500





In [61]:
# Rule baseline 3: score every note of the tweet by its helpfulness ratio
# (sum of its `helpful` rating values / number of ratings it received) and keep
# only notes with a ratio of at least 0.84. Predict 0 when at least as many of
# the kept notes say "misleading" as say "not misleading"; otherwise predict 1.
# The comparison is ">=", so ties (including tweets with no kept notes) are predicted as 0.

# Minimum helpfulness ratio a note needs in order to be counted.
NOTE_HELPFULNESS_THRESHOLD = 0.84

# The ratings table does not change between tweets: index it by noteId once
# rather than on every loop iteration.
ratings_by_note = ratings.set_index('noteId')

y_true, y_pred = [], []
for idx, row in tweet_annotation.iterrows():

    y_true.append(row['human_annotation'])
    # tweet_id is text; its last two characters are cut off before converting to int.
    notesForTweet = notes.loc[notes['tweetId'] == int(row['tweet_id'][:-2])]

    # Left join: one row per (note, rating); unrated notes keep one row with NaN ratings.
    ratingsWithNotesForTweet = notesForTweet.set_index('noteId').join(
        ratings_by_note, lsuffix="_note", rsuffix="_rating", how='left'
    )
    # 1 for a real rating row, 0 for the NaN placeholder of an unrated note.
    # Vectorised on purpose: a row-wise apply(axis=1) is slow and does not return
    # a Series when the frame is empty (tweet without notes).
    ratingsWithNotesForTweet['numRatings'] = ratingsWithNotesForTweet['helpful'].notna().astype(int)

    scoredNotes = ratingsWithNotesForTweet.groupby(['noteId']).agg(
        {'helpful': 'sum', 'numRatings': 'sum', 'classification': 'first'}
    )
    # Unrated notes give 0/0 = NaN, which fails the >= test below and is dropped.
    scoredNotes['helpfulnessRatio'] = scoredNotes['helpful'] / scoredNotes['numRatings']
    helpfulNotes = scoredNotes[scoredNotes['helpfulnessRatio'] >= NOTE_HELPFULNESS_THRESHOLD]

    num_misleading_notes = (helpfulNotes['classification'] == 'MISINFORMED_OR_POTENTIALLY_MISLEADING').sum()
    num_notmisleading_notes = (helpfulNotes['classification'] == 'NOT_MISLEADING').sum()

    y_pred.append(0 if num_misleading_notes >= num_notmisleading_notes else 1)

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.25      0.99      0.40       124
           1       0.75      0.01      0.02       376

    accuracy                           0.25       500
   macro avg       0.50      0.50      0.21       500
weighted avg       0.63      0.25      0.11       500

