# Functions for Analysis 

In [317]:
# Necessary Imports
import pandas as pd
import json
import requests
import re
import numpy as np

def getUniqueGraphs(story):
    '''
    Collect every graph of a storygraphbot story exactly once.

    The entries under 'graph_ids' and 'reported_graphs' are joined (in that order)
    and de-duplicated on their 'id' value. The first graph seen for an id is kept.

    inputs:
        story: A story entry from the storygraphbot json output. It must provide the
               'graph_ids' and 'reported_graphs' lists, whose items each have an 'id'.

    output:
        unique_graphs: A list of dictionaries that each represent a graph, in first-seen order.

    '''

    # graph_ids come first, so on a clash the graph_ids entry is the one that survives
    candidates = story['graph_ids'] + story['reported_graphs']

    unique_graphs = []
    seen_ids = []

    for candidate in candidates:
        graph_id = candidate['id']

        # an id that was already collected marks a duplicate graph
        if graph_id in seen_ids:
            continue

        seen_ids.append(graph_id)
        unique_graphs.append(candidate)

    return unique_graphs

def getFormattedBotGraphs(unique_graphs):
    '''
    A function that transforms the storygraph bot format to the exemplar format for direct comparison

    inputs:
        unique_graphs: A list of dictionaries that each represent a graph in the storygraphbot format.
                       Each one must provide 'graph_uri', 'avg_degree' and 'max_node_link'.

    outputs:
        bot_graphs: A list of dictionaries that each represent a graph in the exemplar format, with keys
                    'graph_link' (rewritten uri), 'cc_degree' (copied from 'avg_degree') and
                    'max_node_link' (copied unchanged).
    '''

    archive_prefix = 'https://web.archive.org/storygraph/graphs/usa/'
    exemplar_prefix = 'https://storygraph.cs.odu.edu/graphs/polar-media-consensus-graph/'

    # list to append the dictionary to
    bot_graphs = []

    for graph in unique_graphs:
        # transforming the graph link from the internet archive format to the storygraph.cs.odu.edu format
        graph_link = graph['graph_uri'].replace(archive_prefix, exemplar_prefix)

        # only a bare 'hist=144' is widened to 'hist=1440'; a plain str.replace would also
        # rewrite a link that already says 'hist=1440' into 'hist=14400'
        graph_link = re.sub(r'hist=144(?!\d)', 'hist=1440', graph_link)

        bot_graphs.append({
            'graph_link': graph_link,
            # storing the connected component degree
            'cc_degree': graph['avg_degree'],
            # storing the uri with the most edges in the connected component
            'max_node_link': graph['max_node_link'],
        })

    return bot_graphs

def defineConfusionMatrix(bot_graphs, exemplar_links, want_failed_bots = False):
    '''
    Function to calculate the True Positive (tp), False Positive (fp), and False Negative (fn) values.

    A bot graph is a true positive when its 'graph_link' appears in the exemplar data and its
    'cc_degree', rounded to 2 decimals, equals the exemplar 'cc_degree' of the first exemplar entry
    with that link. Every other bot graph is a false positive. An exemplar link that no bot graph
    carries is a false negative.

    inputs:
        bot_graphs: A list of dictionaries that represent each graph in the exemplar format
                    (keys 'graph_link' and 'cc_degree'). The output of getFormattedBotGraphs().
        exemplar_links: A list of dictionaries (keys 'graph_link' and 'cc_degree') that represent
                    each graph of the story in the exemplar data set.
        want_failed_bots: Boolean, when true the list of false positive graphs is returned as well.

    returns:
        tp: an integer for the true positives
        fp: an integer for the false positives
        fn: an integer for the false negatives
        bot_failed_attempts: (only when want_failed_bots is true) a list of the bot graph
                    dictionaries that were counted as false positives


    '''
    #creating a list of the links found in the exemplar dataset
    exemplar_link_list = [entry['graph_link'] for entry in exemplar_links]

    # position of the FIRST exemplar entry for every link (same entry list.index would pick),
    # so each bot graph is matched with one dictionary lookup instead of two list scans
    first_position = {}
    for position, link in enumerate(exemplar_link_list):
        first_position.setdefault(link, position)

    # list to identify any failed attempts
    bot_failed_attempts = []

    tp = 0
    fp = 0

    for graph in bot_graphs:
        position = first_position.get(graph['graph_link'])

        # the degree of the storygraphbot output is rounded to 2 decimals because it is a long
        # decimal while the exemplar is only 2 digits
        if position is not None and round(graph['cc_degree'], 2) == exemplar_links[position]['cc_degree']:
            tp += 1
        else:
            # false positive: either the uri is not in the exemplar, or the uri matches
            # but the connected component degree does not
            fp += 1
            bot_failed_attempts.append(graph)

    # links output by the storygraphbot algorithm, as a set for constant-time membership tests
    bot_links = {graph['graph_link'] for graph in bot_graphs}

    # every exemplar entry (repeats included) whose link the bot never produced is a false negative
    fn = sum(1 for link in exemplar_link_list if link not in bot_links)

    if want_failed_bots:
        return tp, fp, fn, bot_failed_attempts
    return tp, fp, fn

def getScores(tp, fp, fn):
    '''
    Calculate the precision, recall, and f1-score from the confusion matrix counts.

    A score whose denominator is zero is undefined; it is reported as 0.0 instead of
    raising ZeroDivisionError (for example when the bot produced no graphs at all).

    tp = True Positive
    fp = False Positive
    fn = False Negatives

    return: precision, recall, f1-score
    '''
    # precision is undefined when nothing was predicted (tp + fp == 0)
    precision = (tp / (tp + fp)) if (tp + fp) else 0.0

    # recall is undefined when the exemplar has nothing to find (tp + fn == 0)
    recall = (tp / (tp + fn)) if (tp + fn) else 0.0

    # f1 is undefined when both precision and recall are 0
    if precision + recall:
        f1 = 2 *((precision * recall) / (precision + recall))
    else:
        f1 = 0.0

    return precision, recall, f1

def getExemplarLinks(story_index):
    '''
    A function to get the graphs recorded for a story in the exemplar data set

    Reads 'data/exemplar_dataset.json' (relative to the working directory) on every call.

    inputs:
        story_index: Integer representation of which index the story is in the exemplar data set

    outputs:
        exemplar_links: The 'links' entry of the chosen story. defineConfusionMatrix() reads a
                        'graph_link' and a 'cc_degree' key from each item of this list.
    '''
    # the context manager closes the file even when json.load raises;
    # UTF-8 is given explicitly, as for the storygraphbot cache files
    with open('data/exemplar_dataset.json', encoding = 'UTF-8') as f:
        exemplar = json.load(f)

    return exemplar[story_index]['links']

def PerformAnalysis(story_index, date, story_position, print_output = False):
    '''
    A function to perform an analysis. This is only usable when the story is contained in a json file for one day. Multi-day stories must be done differently.

    The storygraphbot cache is read from
    'data/toolkit_files/sgbot_{story_index}/cache/cache_{date}.json'.

    inputs:
        story_index: Integer index of the story you want to analyze. This index is the position of where it is found in the exemplar data set starting at 0.
        date: String of the date of the storygraphbot in the format 'YYYY-MM-DD'. The date is used to pull the json from the storygraphbot output.
        story_position: Integer for the location of where the story can be found in the storygraphbot output, i.e. its index in the 'stories' list of that date.
        print_output: Boolean to decide if the precision, recall and f1-score are printed.

    returns:
        precision: float output representing the precision for the algorithm
        recall: float output representing the recall for the algorithm
        f1: float output representing the f1-score for the algorithm
    '''

    cache_path = f'data/toolkit_files/sgbot_{story_index}/cache/cache_{date}.json'

    # the context manager closes the cache file even when json.load raises
    with open(cache_path, encoding = 'UTF-8') as f:
        sgbot = json.load(f)

    # bot side: pick the story, drop duplicate graphs, convert to the exemplar format
    story = sgbot[date]['stories'][story_position]
    unique_graphs = getUniqueGraphs(story)
    bot_graphs = getFormattedBotGraphs(unique_graphs)

    # exemplar side: the graphs recorded for the same story
    exemplar_links = getExemplarLinks(story_index)

    tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
    precision, recall, f1 = getScores(tp, fp, fn)

    if print_output:
        print(f'Story {story_index} Output')
        print(f'percision: {precision}\nrecall: {recall}\nf1-score: {f1}')

    return precision, recall, f1

# these functions below (getGraphLinks, getLinksFromJson) are still in development and not necessary for the current algorithm evaluation

def getGraphLinks(input_file, index):
    '''

    A function that pulls the json file for one graph from storygraph.cs.odu.edu.

    The cursor number and the date are parsed out of the graph's 'graph_link' and used to build
    the address '.../{year}/{month}/{day}/graph{cursor}.json', which is then downloaded.

    inputs:
        input_file: A list of graph dictionaries that each have a 'graph_link' key
        index: an integer representing the index of the graph inside input_file.

    outputs:
        graph_json: the parsed json of the downloaded graph

    raises:
        ValueError: when the 'graph_link' holds no '#cursor=...&hist' part or no 't=YYYY-MM-DDT' part
        requests.exceptions.RequestException: when the download times out or the server answers with an error status

    '''
    cursor_finder = r'(?<=#cursor=)(\d*)(?=&hist)'
    date_finder = r'(?<=t=)(\d*-\d*-\d*)(?=T)'

    graph_link = input_file[index]['graph_link']
    cursor = re.search(cursor_finder, graph_link)
    date = re.search(date_finder, graph_link)

    # re.search gives None on no match; fail with a readable message instead of an AttributeError
    if cursor is None or date is None:
        raise ValueError(f'Could not find a cursor and a date in graph link: {graph_link}')

    year, month, day = date.group().split('-')
    graph_number = cursor.group()

    uri = f'https://storygraph.cs.odu.edu/graphs/polar-media-consensus-graph/{year}/{month}/{day}/graph{graph_number}.json'

    # without a timeout requests waits forever on an unresponsive server
    graph_data = requests.get(uri, timeout = 30)
    # an error page is not graph json; surface the HTTP error rather than a json decode error
    graph_data.raise_for_status()
    graph_json = json.loads(graph_data.text)

    return graph_json

def getLinksFromJson(input_file, index):
    '''
    Returns a list of links that make up the connected component

    The graph json is fetched with getGraphLinks(input_file, index). Only the first entry of
    its 'connected-comps' list is used. Each node number of that component is looked up in
    the graph's 'nodes' list and the node's 'link' is collected.

    '''
    graph_json = getGraphLinks(input_file, index)
    component_nodes = graph_json['connected-comps'][0]['nodes']

    cc_links = []
    for node_number in component_nodes:
        # NOTE(review): the -1 assumes node numbers start at 1 — still to be confirmed;
        # if they start at 0, node 0 would silently read the last entry of 'nodes'
        cc_links.append(graph_json['nodes'][int(node_number)-1]['link'])

    return cc_links

In [209]:
# initializing a dataframe to store the scores for each story
# one row per story is added by the df.loc[...] cells below; the column names (including the
# 'Percision' spelling) become the header of data/story_scores.csv when df is saved
df = pd.DataFrame(columns = ['Story_Id', 'Story_Name', 'Percision', 'Recall', 'f1'])

## Story 0

#### North Korea Summit

In [307]:
# loading in the storygraphbot dataset; the context manager closes the file even if parsing fails
with open('data/toolkit_files/sgbot_0/cache/cache_2018-06-12.json', encoding = 'UTF-8') as f:
    sgbot_0 = json.load(f)

# story at position 0 of the 2018-06-12 cache: drop duplicate graphs, then convert to the exemplar format
story = sgbot_0['2018-06-12']['stories'][0]
unique_graphs = getUniqueGraphs(story)
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [308]:
#loading in the exemplar data; the context manager closes the file even if parsing fails
with open('data/exemplar_dataset.json', encoding = 'UTF-8') as f:
    exemplar = json.load(f)

# exemplar graphs for story 0
exemplar_links = exemplar[0]['links']

In [309]:
#Create confusion matrix and get values
# tp/fp/fn compare the formatted bot graphs with the exemplar graphs of story 0
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [310]:
# show precision and recall for story 0 (f1 is computed above but not printed here)
print('Story 0 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 0 Output
percision: 0.9902912621359223
recall: 0.6257668711656442


In [315]:
# repeat the story 0 analysis through the helper: story index 0, cache date 2018-06-12, story position 0
PerformAnalysis(0, '2018-06-12', 0, print_output = True)

Story 0 Output
percision: 0.9902912621359223
recall: 0.6257668711656442
f1-score: 0.7669172932330828


(0.9902912621359223, 0.6257668711656442, 0.7669172932330828)

In [210]:
# store the story 0 scores under row label len(df); re-running this cell stores them again
df.loc[len(df)] = [0, 'North Korea Summit', percision, recall, f1]

## Story 1

#### Senate Spending Bill Passes

In [318]:
# This story had little to no reported graphs, so it is not a great one to look at.
# The context manager closes the file even if parsing fails.
with open('data/toolkit_files/sgbot_1/cache/cache_2018-02-07.json', encoding = 'UTF-8') as f:
    sgbot_1 = json.load(f)

# story at position 1 of the 2018-02-07 cache: drop duplicate graphs, then convert to the exemplar format
story = sgbot_1['2018-02-07']['stories'][1]
unique_graphs = getUniqueGraphs(story)
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [320]:
# loading in the exemplar data; the context manager closes the file even if parsing fails
with open('data/exemplar_dataset.json', encoding = 'UTF-8') as f:
    exemplar = json.load(f)

# exemplar graphs for story 1
exemplar_links = exemplar[1]['links']

In [321]:
# tp/fp/fn compare the formatted bot graphs with the exemplar graphs of story 1
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [322]:
# show precision and recall for story 1 (f1 is computed above but not printed here)
print('Story 1 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 1 Output
percision: 1.0
recall: 0.0392156862745098


In [None]:
# PerformAnalysis has three required arguments, so a bare call raises TypeError.
# Values mirror the manual story 1 cells above: story index 1, cache date 2018-02-07, story position 1
PerformAnalysis(1, '2018-02-07', 1, print_output = True)

In [215]:
# store the story 1 scores under row label len(df); re-running this cell stores them again
df.loc[len(df)] = [1, 'Senate Spending Bill Passes', percision, recall, f1]

## Story 2

#### Scott Pruitt Resigns

In [217]:
# loading in the storygraphbot dataset; the context manager closes the file even if parsing fails
with open('data/toolkit_files/sgbot_2/cache/cache_2018-07-05.json', encoding = 'UTF-8') as f:
    sgbot_2 = json.load(f)

# story at position 0 of the 2018-07-05 cache: drop duplicate graphs, then convert to the exemplar format
story = sgbot_2['2018-07-05']['stories'][0]
unique_graphs = getUniqueGraphs(story)
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [218]:
# loading in the exemplar data; the context manager closes the file even if parsing fails
with open('data/exemplar_dataset.json', encoding = 'UTF-8') as f:
    exemplar = json.load(f)

# exemplar graphs for story 2
exemplar_links = exemplar[2]['links']

In [219]:
# tp/fp/fn compare the formatted bot graphs with the exemplar graphs of story 2
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [220]:
# show precision and recall for story 2 (f1 is computed above but not printed here)
print('Story 2 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 2 Output
percision: 0.9523809523809523
recall: 0.20833333333333334


In [221]:
# store the story 2 scores under row label len(df); re-running this cell stores them again
df.loc[len(df)] = [2, 'Scott Pruitt Resigns', percision, recall, f1]

## Story 3

#### Pittsburgh Synagogue Shooting

In [222]:
# this story spans two daily caches; each context manager closes its file even if parsing fails
with open('data/toolkit_files/sgbot_3/cache/cache_2018-10-28.json', encoding = 'UTF-8') as f:
    sgbot_3_day_1 = json.load(f)

with open('data/toolkit_files/sgbot_3/cache/cache_2018-10-29.json', encoding = 'UTF-8') as f:
    sgbot_3_day_2 = json.load(f)

# the story is read from position 0 of the first day and position 4 of the second day
day_1_graphs = sgbot_3_day_1['2018-10-28']['stories'][0]
day_2_graphs = sgbot_3_day_2['2018-10-29']['stories'][4]

# duplicates are removed within each day, then the two days are concatenated
# NOTE(review): nothing removes duplicates ACROSS the two days — confirm a graph cannot appear in both
unique_graphs_1 = getUniqueGraphs(day_1_graphs)
unique_graphs_2 = getUniqueGraphs(day_2_graphs)

unique_graphs = unique_graphs_1 + unique_graphs_2
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [223]:
# loading in the exemplar data; the context manager closes the file even if parsing fails
with open('data/exemplar_dataset.json', encoding = 'UTF-8') as f:
    exemplar = json.load(f)

# exemplar graphs for story 3
exemplar_links = exemplar[3]['links']

In [224]:
# tp/fp/fn compare the formatted bot graphs (both days) with the exemplar graphs of story 3
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [225]:
# show precision and recall for story 3 (f1 is computed above but not printed here)
print('Story 3 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 3 Output
percision: 0.9642857142857143
recall: 0.47703180212014135


In [226]:
# store the story 3 scores under row label len(df); re-running this cell stores them again
# NOTE(review): 'Syngogue' is misspelled in the stored name; fix it together with anything that reads story_scores.csv
df.loc[len(df)] = [3, 'Pittsburgh Syngogue Shooting', percision, recall, f1]

## Story 4

In [147]:
# The Yemen story didn't appear at all in the storygraphbot data set; maybe it was too small of a story?

In [227]:
# story 4 has no storygraphbot output to score, so all three scores are recorded as 0
df.loc[len(df)] = [4, 'Yemen War Bill', 0, 0, 0]

## Story 5

#### George Bush Sr Dies

In [228]:
# this story spans two daily caches; each context manager closes its file even if parsing fails
with open('data/toolkit_files/sgbot_5/cache/cache_2018-12-01.json', encoding = 'UTF-8') as f:
    sgbot_5_day_1 = json.load(f)

with open('data/toolkit_files/sgbot_5/cache/cache_2018-12-02.json', encoding = 'UTF-8') as f:
    sgbot_5_day_2 = json.load(f)

# the story is read from position 0 of both days
day_1_graphs = sgbot_5_day_1['2018-12-01']['stories'][0]
day_2_graphs = sgbot_5_day_2['2018-12-02']['stories'][0]

# duplicates are removed within each day, then the two days are concatenated
# NOTE(review): nothing removes duplicates ACROSS the two days — confirm a graph cannot appear in both
unique_graphs_1 = getUniqueGraphs(day_1_graphs)
unique_graphs_2 = getUniqueGraphs(day_2_graphs)

unique_graphs = unique_graphs_1 + unique_graphs_2
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [229]:
# exemplar graphs for story 5; relies on `exemplar` loaded by an earlier cell
exemplar_links = exemplar[5]['links']

In [230]:
# tp/fp/fn compare the formatted bot graphs (both days) with the exemplar graphs of story 5
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [231]:
# show precision and recall for story 5 (f1 is computed above but not printed here)
print('Story 5 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 5 Output
percision: 0.8625592417061612
recall: 0.8310502283105022


In [232]:
# store the story 5 scores under row label len(df); re-running this cell stores them again
df.loc[len(df)] = [5, 'George Bush Sr Dies', percision, recall, f1]

In [154]:
# Looks like it did a great job on this one, makes sense since it is a large story

## Story 8

#### Aurora Illinois Shooting

In [233]:
# loading in the storygraphbot dataset; the context manager closes the file even if parsing fails
with open('data/toolkit_files/sgbot_8/cache/cache_2019-02-16.json', encoding = 'UTF-8') as f:
    sgbot_8 = json.load(f)

# story at position 0 of the 2019-02-16 cache: drop duplicate graphs, then convert to the exemplar format
story = sgbot_8['2019-02-16']['stories'][0]
unique_graphs = getUniqueGraphs(story)
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [234]:
# exemplar graphs for story 8; relies on `exemplar` loaded by an earlier cell
exemplar_links = exemplar[8]['links']

In [235]:
# tp/fp/fn compare the formatted bot graphs with the exemplar graphs of story 8
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [236]:
# show precision and recall for story 8 (f1 is computed above but not printed here)
print('Story 8 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 8 Output
percision: 1.0
recall: 0.16666666666666666


In [237]:
# store the story 8 scores under row label len(df); re-running this cell stores them again
df.loc[len(df)] = [8, 'Aurora Illinois Shooting', percision, recall, f1]

## Story 9
#### Green New Deal

In [238]:
# loading in the storygraphbot dataset; the context manager closes the file even if parsing fails
with open('data/toolkit_files/sgbot_9/cache/cache_2019-03-30.json', encoding = 'UTF-8') as f:
    sgbot_9 = json.load(f)

# story at position 0 of the 2019-03-30 cache: drop duplicate graphs, then convert to the exemplar format
story = sgbot_9['2019-03-30']['stories'][0]
unique_graphs = getUniqueGraphs(story)
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [239]:
# exemplar graphs for story 9; relies on `exemplar` loaded by an earlier cell
exemplar_links = exemplar[9]['links']

In [240]:
# tp/fp/fn compare the formatted bot graphs with the exemplar graphs of story 9
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [241]:
# show precision and recall for story 9 (f1 is computed above but not printed here)
print('Story 9 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 9 Output
percision: 1.0
recall: 0.5348837209302325


In [242]:
# store the story 9 scores under row label len(df); re-running this cell stores them again
df.loc[len(df)] = [9, 'Green New Deal', percision, recall, f1]

## Story 11
#### John Lewis has Cancer

In [169]:
# There was no output from the function for story 11, the function ran for hours without output

## Story 16

#### Trump Signs Covid Relief

In [243]:
# loading in the storygraphbot dataset; the context manager closes the file even if parsing fails
with open('data/toolkit_files/sgbot_16/cache/cache_2020-12-28.json', encoding = 'UTF-8') as f:
    sgbot_16 = json.load(f)

# story at position 0 of the 2020-12-28 cache: drop duplicate graphs, then convert to the exemplar format
story = sgbot_16['2020-12-28']['stories'][0]
unique_graphs = getUniqueGraphs(story)
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [244]:
# exemplar graphs for story 16; relies on `exemplar` loaded by an earlier cell
exemplar_links = exemplar[16]['links']

In [245]:
# tp/fp/fn compare the formatted bot graphs with the exemplar graphs of story 16
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [246]:
# show precision and recall for story 16 (f1 is computed above but not printed here)
print('Story 16 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 16 Output
percision: 0.9807692307692307
recall: 0.42857142857142855


In [247]:
# store the story 16 scores under row label len(df); re-running this cell stores them again
df.loc[len(df)] = [16, 'Trump Signs Covid Relief', percision, recall, f1]

## Story 17
#### John Lewis Dies

In [248]:
# this story spans two daily caches; each context manager closes its file even if parsing fails
with open('data/toolkit_files/sgbot_17/cache/cache_2020-07-18.json', encoding = 'UTF-8') as f:
    sgbot_17_day_1 = json.load(f)

with open('data/toolkit_files/sgbot_17/cache/cache_2020-07-19.json', encoding = 'UTF-8') as f:
    sgbot_17_day_2 = json.load(f)

# the story is read from position 0 of both days
day_1_graphs = sgbot_17_day_1['2020-07-18']['stories'][0]
day_2_graphs = sgbot_17_day_2['2020-07-19']['stories'][0]

# duplicates are removed within each day, then the two days are concatenated
# NOTE(review): nothing removes duplicates ACROSS the two days — confirm a graph cannot appear in both
unique_graphs_1 = getUniqueGraphs(day_1_graphs)
unique_graphs_2 = getUniqueGraphs(day_2_graphs)

unique_graphs = unique_graphs_1 + unique_graphs_2
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [249]:
# exemplar graphs for story 17; relies on `exemplar` loaded by an earlier cell
exemplar_links = exemplar[17]['links']

In [250]:
# tp/fp/fn compare the formatted bot graphs (both days) with the exemplar graphs of story 17
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [251]:
# show precision and recall for story 17 (f1 is computed above but not printed here)
print('Story 17 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 17 Output
percision: 0.8089171974522293
recall: 0.6318407960199005


In [259]:
# NOTE(review): row label 9 is hard-coded here, unlike the other stories, which use len(df).
# On a top-to-bottom run nine rows precede this cell, so it adds row 9; on a re-run it
# overwrites row 9 instead of appending — confirm this is intended.
df.loc[9] = [17, 'John Lewis Dies', percision, recall, f1]

## Story 19

#### ASAP Rocky

In [253]:
# loading in the storygraphbot dataset; the context manager closes the file even if parsing fails
with open('data/toolkit_files/sgbot_19/cache/cache_2019-08-14.json', encoding = 'UTF-8') as f:
    sgbot_19 = json.load(f)

# story at position 0 of the 2019-08-14 cache: drop duplicate graphs, then convert to the exemplar format
story = sgbot_19['2019-08-14']['stories'][0]
unique_graphs = getUniqueGraphs(story)
bot_graphs = getFormattedBotGraphs(unique_graphs)

In [254]:
# exemplar graphs for story 19; relies on `exemplar` loaded by an earlier cell
exemplar_links = exemplar[19]['links']

In [255]:
# tp/fp/fn compare the formatted bot graphs with the exemplar graphs of story 19
tp, fp, fn = defineConfusionMatrix(bot_graphs, exemplar_links)
# `percision` (sic) is the name the df.loc cell for this story reads, so the spelling must stay in sync
percision, recall, f1 = getScores(tp, fp, fn)

In [256]:
# show precision and recall for story 19 (f1 is computed above but not printed here)
print('Story 19 Output')
print(f'percision: {percision}\nrecall: {recall}')

Story 19 Output
percision: 1.0
recall: 0.1702127659574468


In [257]:
# store the story 19 scores under row label len(df); re-running this cell stores them again
# NOTE(review): 'Sweeden' is misspelled in the stored name; fix it together with anything that reads story_scores.csv
df.loc[len(df)] = [19, 'ASAP Rocky Found Guilty in Sweeden', percision, recall, f1]

In [261]:
# save the per-story scores; index = False keeps the row labels out of the csv
df.to_csv('data/story_scores.csv', index = False)

## Visualizing the Scores