## Analysis of the Storygraph Bot Algorithm Compared to the Exemplar Dataset

### Necessary Functions for Analysis

In [1]:
# Necessary Imports
import pandas as pd
import json
import requests
import re
import numpy as np

def getUniqueGraphs(story):
    '''
    This function returns the unique graphs of a storygraphbot story as a list of dictionaries.
    The graphs under 'graph_ids' and 'reported_graphs' are combined (in that order) and only the
    first graph seen for each 'id' is kept, so the order of first appearance is preserved.
    
    inputs:
        story: A dictionary for one story taken from the storygraphbot json output. It must hold
               the lists 'graph_ids' and 'reported_graphs', whose items are dictionaries with an 'id' key.
    
    output:
        unique_graphs: A list of dictionaries that each represent a graph.
    
    '''
    
    # the same graph can be listed in both sections, so the two lists are looked at together
    story_union = story['graph_ids'] + story['reported_graphs']
    
    unique_graphs = []
    
    # a set keeps the "seen before" check constant time; ids are assumed to be hashable (strings or numbers)
    seen_ids = set()
    
    for graph in story_union:
        graph_id = graph['id']
        
        # skip any graph whose id was already collected
        if graph_id in seen_ids:
            continue
        
        seen_ids.add(graph_id)
        unique_graphs.append(graph)
            
    return unique_graphs

def getFormattedBotGraphs(unique_graphs):
    '''
    A function that transforms the storygraph bot format to the exemplar format for direct comparison
    
    inputs:
        unique_graphs: A list of dictionaries that each represent a graph in the storygraphbot format.
                       Each one must have the keys 'graph_uri', 'avg_degree' and 'max_node_link'.
        
    outputs:
        bot_graphs: A list of dictionaries that each represent a graph in the exemplar format, with the keys
                    'graph_link', 'cc_degree' and 'max_node_link'.
    '''
    
    archive_prefix = 'https://web.archive.org/storygraph/graphs/usa/'
    odu_prefix = 'https://storygraph.cs.odu.edu/graphs/polar-media-consensus-graph/'
    
    # list to append the dictionary to
    bot_graphs = []
    
    for graph in unique_graphs:
        # transforming the graph link from the internet archive format to the storygraph.cs.odu.edu format
        graph_link = graph['graph_uri'].replace(archive_prefix, odu_prefix)
        
        # only an exact 'hist=144' is rewritten; the lookahead stops a link that already says
        # 'hist=1440' from being turned into 'hist=14400'
        graph_link = re.sub(r'hist=144(?!\d)', 'hist=1440', graph_link)
        
        bot_graphs.append({
            'graph_link': graph_link,
            # storing the connected component degree
            'cc_degree': graph['avg_degree'],
            # storing the uri with the most edges in the connected component
            'max_node_link': graph['max_node_link'],
        })
        
    return bot_graphs

def defineConfusionMatrix(bot_graphs, exemplar_links, want_failed_bots = False):
    '''
    Function to calculate the True Positive (tp), False Positive (fp), and False Negative (fn) values.
    
    A bot graph is a true positive when its 'graph_link' is found in the exemplar and its 'cc_degree',
    rounded to 2 decimals, equals the 'cc_degree' of the first exemplar entry with that link. Every other
    bot graph is a false positive. An exemplar link is a false negative only when no bot graph has that link,
    so a link that matched with the wrong degree is counted as a false positive and not as a false negative.
    
    inputs:
        bot_graphs: A list of dictionaries that represent each graph. The output of getFormattedBotGraphs().
        exemplar_links: A list of dictionaries with the keys 'graph_link' and 'cc_degree'. The output of getExemplarLinks().
        want_failed_bots: When truthy the list of failed graphs is returned as a fourth value.
    
    returns: 
        tp: an integer for the true positives
        fp: an integer for the false positives
        fn: an integer for the false negatives
        bot_failed_attempts: a list of dictionaries for each graph counted as a false positive
                             (only returned when want_failed_bots is truthy)
    
    
    '''
    #creating a list of the links found in the exemplar dataset
    exemplar_link_list = [entry['graph_link'] for entry in exemplar_links]
    
    # mapping each link to its first exemplar entry; setdefault keeps the first one, which is the
    # entry list.index() would have pointed at
    exemplar_by_link = {}
    for entry in exemplar_links:
        exemplar_by_link.setdefault(entry['graph_link'], entry)
    
    # list to identify any failed attempts
    bot_failed_attempts = [] 
    
    tp = 0   
    fp = 0 
    
    for graph in bot_graphs:
        
        match = exemplar_by_link.get(graph['graph_link'])
        
        # the degree of the storygraphbot output is rounded to 2 decimals because it is a long decimal while
        # the exemplar is only 2 digits
        if match is not None and round(graph['cc_degree'], 2) == match['cc_degree']:
            tp += 1
        else:
            # either the uri is not in the exemplar or the connected component degree is wrong
            fp += 1
            bot_failed_attempts.append(graph)
    
    # bot_links holds the uri's outputted by the storygraphbot algorithm
    bot_links = {graph['graph_link'] for graph in bot_graphs}
    
    # every exemplar entry whose link the bot never produced is a false negative
    fn = sum(1 for link in exemplar_link_list if link not in bot_links)
    
    if want_failed_bots:
        return tp, fp, fn, bot_failed_attempts
    
    return tp, fp, fn

def getScores(tp, tn, fp, fn):
    '''
    Calculates the precision, recall, false positive rate, and f1-score from the confusion matrix counts.
    A ratio whose denominator is zero is reported as 0.0 instead of raising a ZeroDivisionError.
    
    tp = True Positive
    tn = True Negative
    fp = False Positive
    fn = False Negatives
    
    return: precision, recall, fpr, f1-score
    '''
    
    def _ratio(numerator, denominator):
        # an undefined ratio (nothing to divide by) is scored as 0.0
        return (numerator / denominator) if denominator else 0.0
    
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    fpr = _ratio(fp, fp + tn)
    
    # f1 is the harmonic mean of precision and recall
    f1 = 2 * _ratio(precision * recall, precision + recall)
    
    return precision, recall, fpr, f1

def getExemplarLinks(story_index):
    '''
    A function to get the links to each graph for a story in the exemplar data set.
    The data is read from 'data/exemplar_dataset.json' on every call.
    
    inputs:
        story_index: Integer representation of which index the story is in the exemplar data set
        
    outputs:
        exemplar_links: The value stored under 'links' for that story. The other functions in this file
                        treat it as a list of dictionaries with the keys 'graph_link' and 'cc_degree'.
    '''
    # the with block closes the file even when the json cannot be parsed
    with open('data/exemplar_dataset.json', encoding = 'UTF-8') as f:
        exemplar = json.load(f)

    return exemplar[story_index]['links']

def LoadSgbotJson(story_index, date):
    '''
    Returns the sgbot information for the given story index and date.
    The file read is 'data/toolkit_files/sgbot_{story_index}/cache/cache_{date}.json'.
    
    Inputs:
        story_index: Integer index for the story
        date: String representation of the date in the format 'YYYY-MM-DD'
        
    Outputs:
        sgbot: The parsed content of the json file
    '''
    
    # the with block closes the file even when the json cannot be parsed
    with open(f'data/toolkit_files/sgbot_{story_index}/cache/cache_{date}.json', encoding = 'UTF-8') as f:
        sgbot = json.load(f)
    
    return sgbot

def PerformAnalysis(story_index, date, story_position, print_output = False):
    '''
    A function to perform an analysis of one story. The story may span several days: date and story_position
    are parallel lists with one entry per day. The graphs of each day are de-duplicated on their own and
    then concatenated, so a graph that appears on two days is kept twice.
    
    inputs:
        story_index: Integer index of the story you want to analyze. This index is the position of where it is found in the exemplar data set starting at 0.
        date: List of strings of the date of the storygraphbot in the format 'YYYY-MM-DD'. The date is used to pull the json from the storygraphbot output.
        story_position: List of integers for the location of where the story can be found in the storygraphbot output. Found by looking for the story in graph_ids.
        print_output: Boolean to decide if the precision, recall, and f1-score are printed.
    
    returns:
        precision: float output representing the precision for the algorithm
        recall: float output representing the recall for the algorithm
        fpr: False Positive Rate
        f1: float output representing the f1-score for the algorithm
        
    raises:
        ValueError: when date and story_position do not have the same length
    '''
    
    # stop right away on mismatched inputs instead of scoring an empty set of graphs
    if len(story_position) != len(date):
        raise ValueError('Lengths of story_position and date lists are not equal')
    
    ## loading in the toolkit json files
    
    unique_graphs = []
    
    for day, position in zip(date, story_position):
        
        sgbot = LoadSgbotJson(story_index, day)
        
        story = sgbot[day]['stories'][position]
        
        unique_graphs += getUniqueGraphs(story)
    
    bot_graphs = getFormattedBotGraphs(unique_graphs)
    
    exemplar_links = getExemplarLinks(story_index)
    
    tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
    
    tn = defineTrueNegatives(story_index, date, story_position, exemplar_links)
    
    precision, recall, fpr, f1 = getScores(tp, tn, fp, fn)
    
    if print_output:
        print(f'Story {story_index} Output')
        print(f'percision: {precision}\nrecall: {recall}\nf1-score: {f1}')
    
    return precision, recall, fpr, f1

def defineTrueNegatives(story_index, date, story_position, exemplar_links):
    
    '''
    A function to count the true negatives: the graphs of all the stories that were not selected
    which are also not part of the exemplar (missing link, or same link with a different degree).
    
    Inputs:
        story_index: An integer representation of the story
        date: A list of strings, each a date in the format 'YYYY-MM-DD'
        story_position: A list of integers that represent the index where the desired story is found
        exemplar_links: A list of dictionaries that each represent a connected component in the exemplar dataset
        
    Outputs:
        An integer for the true negatives
    '''

    # the links of the exemplar, in the same order as exemplar_links
    exemplar_link_list = [entry['graph_link'] for entry in exemplar_links]

    other_story_graphs = []
    for day_number, selected in enumerate(story_position):
        
        day = date[day_number]
        
        ## all the stories the bot reported for that day
        stories = LoadSgbotJson(story_index, day)[day]['stories']

        # everything except the story being analyzed
        remaining = stories[:selected] + stories[selected + 1:]

        for other_story in remaining:
            other_story_graphs += getUniqueGraphs(other_story)

    true_negatives = 0

    for candidate in getFormattedBotGraphs(other_story_graphs):
        
        link = candidate['graph_link']
        
        if link in exemplar_link_list:
            
            # only the degree is available for comparison, so the first exemplar entry with this link is used
            matched = exemplar_links[exemplar_link_list.index(link)]
            
            # same link and same rounded degree means the graph is in the exemplar, so it is not a true negative
            if round(candidate['cc_degree'], 2) == matched['cc_degree']:
                continue
        
        true_negatives += 1
        
    return true_negatives

# these functions below (getGraphLinks, getLinksFromJson) are still in development and not necessary for the current algorithm evaluation

def getGraphLinks(input_file, index):
    '''
    
    A function that pulls the json file of one graph from storygraph.cs.odu.edu.
    
    inputs:
        input_file: A list of dictionaries with a 'graph_link' key (the formatted storygraphbot output)
        index: an integer representing the index of where the graph is found in the input_file. 
        
    outputs:
        graph_json: The parsed json of the graph
        
    raises:
        ValueError: when the cursor or the date cannot be found in the graph link
        requests.HTTPError: when the server answers with an error status
        
    '''
    cursor_finder = r'(?<=#cursor=)(\d*)(?=&hist)'
    date_finder = r'(?<=t=)(\d*-\d*-\d*)(?=T)'
    
    graph_link = input_file[index]['graph_link']
    cursor = re.search(cursor_finder, graph_link)
    date = re.search(date_finder, graph_link)
    
    # re.search gives None when nothing matches; report it clearly instead of failing on None.group()
    if cursor is None or date is None:
        raise ValueError(f'Could not find the cursor and the date in the graph link: {graph_link!r}')
    
    year, month, day = date.group().split('-')
    graph_number = cursor.group()
    
    uri = f'https://storygraph.cs.odu.edu/graphs/polar-media-consensus-graph/{year}/{month}/{day}/graph{graph_number}.json'
    
    # without a timeout the request can wait forever on an unresponsive server
    graph_data = requests.get(uri, timeout=30)
    
    # an error page is not a graph, so fail before trying to parse it
    graph_data.raise_for_status()
    
    graph_json = json.loads(graph_data.text)
    
    return graph_json

def getLinksFromJson(input_file, index):
    '''
    Returns a list of links that make up the first connected component of a graph.
    
    inputs:
        input_file: passed unchanged to getGraphLinks()
        index: passed unchanged to getGraphLinks()
        
    outputs:
        cc_links: A list with the 'link' of every node in the first entry of 'connected-comps'
    
    '''
    graph_json = getGraphLinks(input_file, index)
    
    cc_links = []
    for node_id in graph_json['connected-comps'][0]['nodes']:
        # the node ids are shifted down by one to index into the 'nodes' list
        node = graph_json['nodes'][int(node_id) - 1]
        cc_links.append(node['link'])
    
    return cc_links

In [2]:
# one dictionary of scores per analysed story is appended to this list by the cells below
saved_data = []

## Story 0: Kim/Trump Summit

In [4]:
# scoring story 0 from position 0 of the 2018-06-12 sgbot output
precision, recall, fpr, f1 = PerformAnalysis(story_index=0, date=['2018-06-12'], story_position=[0])
row = dict(story_id=0, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

## Story 1: Senate 2 year 2018 Spending Deal

In [6]:
# scoring story 1 from position 1 of the 2018-02-07 sgbot output
precision, recall, fpr, f1 = PerformAnalysis(story_index=1, date=['2018-02-07'], story_position=[1])
row = dict(story_id=1, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

## Story 2: Scott Pruitt Resigns as EPA Chief

In [7]:
# scoring story 2 from position 0 of the 2018-07-05 sgbot output
precision, recall, fpr, f1 = PerformAnalysis(story_index=2, date=['2018-07-05'], story_position=[0])
row = dict(story_id=2, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

## Story 3: Pittsburgh Synagogue Shooting

In [8]:
# story 3 spans two days: position 0 on 2018-10-28 and position 4 on 2018-10-29
precision, recall, fpr, f1 = PerformAnalysis(story_index=3, date=['2018-10-28', '2018-10-29'], story_position=[0, 4])
row = dict(story_id=3, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

## Story 4: Senate Votes to End Support for Yemen War

In [None]:
# Story did not appear in the sgbot output, may be too small of a story to have registered

## Story 5: George Bush Dies at 94

In [9]:
# story 5 spans two days: position 0 on both 2018-12-01 and 2018-12-02
precision, recall, fpr, f1 = PerformAnalysis(story_index=5, date=['2018-12-01', '2018-12-02'], story_position=[0, 0])
row = dict(story_id=5, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

## Story 8: Aurora Illinois Shooting

In [10]:
# scoring story 8 from position 0 of the 2019-02-16 sgbot output
precision, recall, fpr, f1 = PerformAnalysis(story_index=8, date=['2019-02-16'], story_position=[0])
row = dict(story_id=8, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

## Story 9: AOC Town Hall Green New Deal

In [11]:
# scoring story 9 from position 0 of the 2019-03-30 sgbot output
precision, recall, fpr, f1 = PerformAnalysis(story_index=9, date=['2019-03-30'], story_position=[0])
row = dict(story_id=9, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

## Story 11: John Lewis Has Pancreatic Cancer

In [None]:
# Storygraph bot gave no output for this story even after hours

## Story 16: Trump Signs Covid Relief Package

In [12]:
# scoring story 16 from position 0 of the 2020-12-28 sgbot output
precision, recall, fpr, f1 = PerformAnalysis(story_index=16, date=['2020-12-28'], story_position=[0])
row = dict(story_id=16, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

## Story 17: John Lewis Dies at 80

In [14]:
# story 17 spans two days: position 0 on both 2020-07-18 and 2020-07-19
precision, recall, fpr, f1 = PerformAnalysis(story_index=17, date=['2020-07-18', '2020-07-19'], story_position=[0, 0])
row = dict(story_id=17, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

## Story 19: ASAP Rocky Found Guilty in Sweden

In [15]:
# scoring story 19 from position 0 of the 2019-08-14 sgbot output
precision, recall, fpr, f1 = PerformAnalysis(story_index=19, date=['2019-08-14'], story_position=[0])
row = dict(story_id=19, precision=precision, recall=recall, fpr=fpr, f1=f1)
saved_data.append(row)

### Saving the Data as a CSV

In [17]:
import csv

# column order of the results file; these are the keys of every row in saved_data
fieldnames = ['story_id', 'precision', 'recall', 'fpr', 'f1']

# newline='' lets the csv module control the line endings (otherwise blank lines can appear between
# rows on Windows), and the with block closes the file even if a write fails
with open('current_algorithm_results_with_fpr.csv', 'w', newline='') as file:
    
    # DictWriter places each value under its own column name instead of relying on the dictionary order
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(saved_data)