# Examining New Algorithm Results

In [1]:
import pandas as pd
import json

### Define Functions for Analysis

In [97]:
# functions

def TotalStoryAnalysis(story_id, cc_index_in_file_values):
    '''
    Score one story at every cutoff value and record the results.

    For each cutoff, the matching JSON file is analysed with SingleFileAnalysis and one
    row of metrics (a dictionary) is appended to the module-level saved_data list.
    saved_data must already exist when this function is called.

    Inputs:
        story_id: Unique integer identifier of the desired story
        cc_index_in_file_values: A list with one value per cutoff, in the order 10, 20, ..., 90.
            Each value is the index of the vector that best represents the story in that cutoff's file.

    Returns:
        None. The results are appended to saved_data.
    '''

    cutoff_values = [10, 20, 30, 40, 50, 60, 70, 80, 90]

    for position, cutoff in enumerate(cutoff_values):
        # the index list is positional: entry N belongs to the N-th cutoff
        cc_index_in_file = cc_index_in_file_values[position]
        file_path = f'data/new_algorithm_files/story_{story_id}/story_{story_id}_cutoff_{cutoff}.json'

        precision, recall, fpr, f1 = SingleFileAnalysis(file_path, story_id, cc_index_in_file)

        saved_data.append({
            'story_id': story_id,
            'cutoff_value': cutoff,
            'precision': precision,
            'recall': recall,
            'fpr': fpr,
            'f1': f1,
        })

def SingleFileAnalysis(file_path, story_id, cc_index_in_file):
    '''
    Compute the evaluation scores for one story in one cutoff file.

    The connected components of the chosen story vector are compared against the
    exemplar links of the story. Components of every other story vector in the file
    are used to count the true negatives.

    Inputs:
        file_path: The path to the file for analysis
        story_id: Unique integer number for the story
        cc_index_in_file: This is the integer position for where the connected component that best represents the story is found in the new data

    Outputs:
        precision, recall, fpr, f1: The four scores returned by getScores
    '''

    story_data = loadJsonFile(file_path)
    exemplar_links = GetExamplarLinks(story_id)

    # connected components of the vector that was picked as "the story"
    chosen_components = story_data['story_vectors'][cc_index_in_file]['con_comps']

    # true negatives come from every vector except the chosen one
    tn = defineTrueNegatives(story_data, exemplar_links, cc_index_in_file)

    # the list of failed graphs is not needed here, so only the three counts are requested
    tp, fp, fn = defineConfusionMatrix(getFormattedBotGraphs(chosen_components), exemplar_links)

    return getScores(tp, tn, fp, fn)

def loadJsonFile(file_path, encoding = 'UTF-8'):
    '''
    Function to load in the JSON data
    
    Inputs:
        file_path: The path to the file
        encoding: The desired encoding to load the file in (defaults to UTF-8)
        
    returns:
        story_data: The contents of the provided json file
    
    '''
    
    # the with-statement closes the file even if json.load raises,
    # and the caller's encoding is passed through to open()
    with open(file_path, encoding = encoding) as file:
        story_data = json.load(file)
    
    return story_data

def GetExamplarLinks(story_id, file_path = 'data/exemplar_dataset.json'):
    '''
    This function loads in the exemplar dataset and returns the data for the given story by its ID
    
    Input:
        story_id: The unique number given to the story to identify it. It is used directly to index the loaded exemplar data.
        file_path: Optional input that points to the exemplar data set file
        
    Returns:
        exemplar_links: List of dictionary objects that each represent one connected component of the story. 
    '''
    
    # the file is read as UTF-8 (same as loadJsonFile) instead of the platform default,
    # and the with-statement closes it even if parsing fails
    with open(file_path, encoding = 'UTF-8') as file:
        exemplar = json.load(file)

    exemplar_links = exemplar[story_id]['links']
    
    return exemplar_links

def defineConfusionMatrix(bot_graphs, exemplar_links, want_failed_bots = False):
    '''
    Function to calculate the True Positive (tp), False Positive (fp), and False Negative (fn) values.

    A bot graph is a true positive when its link is in the exemplar data, its degree (rounded to
    2 decimals) equals the exemplar degree, and its index equals the exemplar index minus one.
    Every other bot graph is a false positive. An exemplar link that does not appear among the
    bot links at all is a false negative.

    inputs:
        bot_graphs: A list of dictionaries with the keys 'graph_link', 'cc_degree' and 'cc_index'. The output of getFormattedBotGraphs().
        exemplar_links: A list of dictionaries with the same keys, taken from the exemplar data set.
        want_failed_bots: When True, the list of false positive graphs is returned as well.

    returns:
        tp: an integer for the true positives
        fp: an integer for the false positives
        fn: an integer for the false negatives
        bot_failed_attempts: a list of the bot graphs counted as false positives (only when want_failed_bots is True)
    '''
    # links of the exemplar data set, in the same order as exemplar_links
    exemplar_uris = [entry['graph_link'] for entry in exemplar_links]

    true_positives = 0
    rejected_graphs = []

    for candidate in bot_graphs:
        uri = candidate['graph_link']
        is_match = False

        if uri in exemplar_uris:
            # only the first exemplar entry with this link is compared
            reference = exemplar_links[exemplar_uris.index(uri)]

            # the bot degree is a long decimal while the exemplar keeps 2 digits, hence the rounding
            # the exemplar index is compared after subtracting one
            is_match = (round(candidate['cc_degree'], 2) == reference['cc_degree']
                        and candidate['cc_index'] == reference['cc_index'] - 1)

        if is_match:
            true_positives += 1
        else:
            # either the link is unknown or the component does not match the exemplar
            rejected_graphs.append(candidate)

    # every rejected graph counted as exactly one false positive
    false_positives = len(rejected_graphs)

    # exemplar links that the bot never produced are the false negatives
    bot_uris = [candidate['graph_link'] for candidate in bot_graphs]
    false_negatives = sum(1 for uri in exemplar_uris if uri not in bot_uris)

    if want_failed_bots == True:
        return true_positives, false_positives, false_negatives, rejected_graphs

    return true_positives, false_positives, false_negatives

    
def defineTrueNegatives(story_data, exemplar_links, cc_index_in_file):
    '''
    Count the true negatives for one story in one cutoff file.

    Every connected component that belongs to a vector other than the chosen one is inspected.
    It is a true negative when its link is not in the exemplar data, or when its link is there
    but the degree or the index differs from the exemplar entry.

    Inputs:
        story_data: The loaded JSON file (output of loadJsonFile); its 'story_vectors' entry holds the list of vectors
        exemplar_links: A list of dictionaries with the keys 'graph_link', 'cc_degree' and 'cc_index' for the story
        cc_index_in_file: The integer index that points to where the desired story is found in the JSON file

    Outputs:
        tn: The number of true negatives
    '''

    true_negatives = 0

    # links of the exemplar data set, in the same order as exemplar_links
    exemplar_uris = [entry['graph_link'] for entry in exemplar_links]

    # every vector except the one that was chosen as the story
    other_vectors = (story_data['story_vectors'][:cc_index_in_file]
                     + story_data['story_vectors'][cc_index_in_file+1:])

    for vector in other_vectors:
        for candidate in getFormattedBotGraphs(vector['con_comps']):
            uri = candidate['graph_link']

            if uri not in exemplar_uris:
                # not part of the exemplar story and correctly left out
                true_negatives += 1
                continue

            # only the first exemplar entry with this link is compared
            reference = exemplar_links[exemplar_uris.index(uri)]

            is_exemplar_component = (round(candidate['cc_degree'], 2) == reference['cc_degree']
                                     and candidate['cc_index'] == reference['cc_index'] - 1)

            # same graph but a different connected component than the exemplar one
            if not is_exemplar_component:
                true_negatives += 1

    return true_negatives

def getFormattedBotGraphs(unique_graphs):
    '''
    A function that transforms the storygraph bot format to the exemplar format for direct comparison

    inputs:
        unique_graphs: A list of dictionaries that each represent a graph in the storygraphbot format
            (keys read: 'graph_uri', 'avg_degree', 'index').

    outputs:
        bot_graphs: A list of dictionaries that each represent a graph in the exemplar format
            (keys: 'graph_link', 'cc_degree', 'cc_index').
    '''

    archive_prefix = 'https://web.archive.org/storygraph/graphs/usa/'
    odu_prefix = 'https://storygraph.cs.odu.edu/graphs/polar-media-consensus-graph/'

    def to_exemplar_format(graph):
        # swap the internet archive prefix for the odu one and change the hist value
        rewritten = graph['graph_uri'].replace(archive_prefix, odu_prefix)
        rewritten = rewritten.replace('hist=144', 'hist=1440')

        # characters 65 to 86 are cut out and moved to the end of the link
        # (65 is the length of the odu prefix; presumably this is the '#t=...' part of the link)
        moved_part = rewritten[65:87]
        remainder = rewritten.replace(moved_part, '').replace('&cursor', '#cursor')

        return {
            'graph_link': remainder + moved_part.replace('#t=', '&t='),
            # the connected component degree
            'cc_degree': graph['avg_degree'],
            # the 'index' value of the storygraphbot graph
            'cc_index': graph['index'],
        }

    return [to_exemplar_format(graph) for graph in unique_graphs]

def getScores(tp, tn, fp, fn):
    '''
    Calculate the precision, recall, false positive rate, and f1-score.

    tp = True Positive
    tn = True Negative
    fp = False Positive
    fn = False Negatives
    
    A score whose denominator is zero is undefined; it is reported as 0.0
    instead of raising a ZeroDivisionError.

    return: precision, recall, fpr, f1-score
    '''

    def _ratio(numerator, denominator):
        # guard against an empty denominator (e.g. no predicted positives at all)
        if denominator == 0:
            return 0.0
        return numerator / denominator

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    fpr = _ratio(fp, fp + tn)
    
    # f1 is the harmonic mean of precision and recall; it is 0 when both are 0
    if (precision + recall) != 0:
        f1 = 2 *((precision * recall) / (precision + recall))
    else:
        f1 = 0
    
    return precision, recall, fpr, f1

def findCCIndexInFile(file_path):
    '''
    Print every story vector index of a new-algorithm JSON file followed by its titles.

    This is a helper for picking, by eye, the index of the vector that represents a story.

    Input:
        file_path: Path to the JSON file
    '''

    story_data = loadJsonFile(file_path)
    for position, vector in enumerate(story_data['story_vectors']):
        print(position)
        print(vector['titles'])

# Analysis Description

For each story, there are a variety of cutoff values to analyze. A cutoff value is the percentage of similarity required between two vectors for them to be linked. The values start at 10% and go to 90% in increments of 10%. These values are used in their decimal representation (i.e. 10% is 0.10). 

I cannot use a loop to pick the representative connected component because there is no pattern for where it will be in the file. This means I have to go into each file for each cutoff and find the connected component that best represents the story we are interested in.

For each cutoff, I went into the file and looked at the story_vectors tab. I then looked at the titles of each entry to see which one was the most similar to the given story. 

In [76]:
## Create Variable to Save the Data
# Module-level accumulator: TotalStoryAnalysis appends one result dictionary per story/cutoff to this list.
# Re-running this cell resets it to an empty list.
saved_data = []

# Setup the cutoff values to cycle through the files and pair with the indices
# NOTE: TotalStoryAnalysis defines its own local list with the same values and does not read this variable.
cutoff_values = [10, 20, 30, 40, 50, 60, 70, 80, 90]

## Story 0: Kim/Trump Summit

In [80]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 is used at every cutoff for this story.
cc_index_in_file_values = [0, 0, 0, 0, 0, 0, 0, 0, 0]
TotalStoryAnalysis(0, cc_index_in_file_values)

## Story 1: Senate 2 year 2018 Spending Deal

In [82]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 50, then 1, 1, 62, 165.
cc_index_in_file_values = [0, 0, 0, 0, 0, 1, 1, 62, 165]
TotalStoryAnalysis(1, cc_index_in_file_values)

# weird value for cutoff of 90
# NOTE(review): presumably this refers to the index 165 used at cutoff 90 - confirm.

## Story 2: Scott Pruitt Resigns as EPA Chief

In [84]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 60, then 42, 57, 83.
cc_index_in_file_values = [0, 0, 0, 0, 0, 0, 42, 57, 83]
TotalStoryAnalysis(2, cc_index_in_file_values)

## Story 3: Pittsburgh Synagogue Shooting

In [85]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 70, then 19 and 23.
cc_index_in_file_values = [0, 0, 0, 0, 0, 0, 0, 19, 23]
TotalStoryAnalysis(3, cc_index_in_file_values)

## Story 4: Senate Votes to End Support for Yemen War

In [86]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 40, then 1, 2, 2, 7, 104.
cc_index_in_file_values = [0, 0, 0, 0, 1, 2, 2, 7, 104]
TotalStoryAnalysis(4, cc_index_in_file_values)

## Story 5: George Bush Dies at 94

In [87]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 40, 2 for cutoffs 50-80, then 11.
cc_index_in_file_values = [0, 0, 0, 0, 2, 2, 2, 2, 11]
TotalStoryAnalysis(5, cc_index_in_file_values)

## Story 8: Aurora Illinois Shooting

In [88]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 is used at every cutoff for this story.
cc_index_in_file_values = [0, 0, 0, 0, 0, 0, 0, 0, 0]
TotalStoryAnalysis(8, cc_index_in_file_values)

## Story 9: AOC Town Hall Green New Deal

In [89]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 60, then 1, 1, 2.
cc_index_in_file_values = [0, 0, 0, 0, 0, 0, 1, 1, 2]
TotalStoryAnalysis(9, cc_index_in_file_values)

## Story 11: John Lewis Has Pancreatic Cancer

In [90]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 40, then 13, 16, 24, 29, 49.
cc_index_in_file_values = [0, 0, 0, 0, 13, 16, 24, 29, 49]
TotalStoryAnalysis(11, cc_index_in_file_values)

## Story 16: Trump Signs Covid Relief Package

In [91]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 30, then 1 for cutoffs 40-90.
cc_index_in_file_values = [0, 0, 0, 1, 1, 1, 1, 1, 1]
TotalStoryAnalysis(16, cc_index_in_file_values)

## Story 17: John Lewis Dies at 80

In [92]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 50, then 3 for cutoffs 60-90.
cc_index_in_file_values = [0, 0, 0, 0, 0, 3, 3, 3, 3]
TotalStoryAnalysis(17, cc_index_in_file_values)

## Story 19: ASAP Rocky Found Guilty in Sweden

In [93]:
# One index per cutoff (10, 20, ..., 90 in that order); index 0 through cutoff 70, then 26 and 56.
cc_index_in_file_values = [0, 0, 0, 0, 0, 0, 0, 26, 56]
TotalStoryAnalysis(19, cc_index_in_file_values)

In [95]:
import csv

In [96]:
# Write every row collected in saved_data to a CSV file.
# - newline='' is what the csv module expects for files it writes to (avoids blank rows on Windows)
# - the with-statement closes the file even if a row fails to write
# - DictWriter places each value under its own column name instead of relying on the dictionary order
fieldnames = ['story_id', 'cutoff_value', 'precision', 'recall', 'fpr', 'f1']

with open('new_algorithm_cutoff_results_with_fpr.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(saved_data)

# Graphing The Results

In [99]:
from plotnine import *

In [101]:
# Load the per-story, per-cutoff scores back from the CSV file written above.
results = pd.read_csv('new_algorithm_cutoff_results_with_fpr.csv')

In [102]:
results.head()

Unnamed: 0,story_id,cutoff_value,precision,recall,fpr,f1
0,0,10,0.069867,1.0,0.999079,0.130609
1,0,20,0.072833,1.0,0.955341,0.135777
2,0,30,0.073589,1.0,0.944751,0.13709
3,0,40,0.078441,1.0,0.881676,0.145471
4,0,50,0.10752,1.0,0.622928,0.194163


In [None]:
## Generating the graphs for the f1 scores
# One scatter plot of F1 score against cutoff value is saved per story.

story_ids = results['story_id'].unique()
for story_id in story_ids:
    # rows of the current story only
    idx = results['story_id'] == story_id
    data_subset = results[idx]

    # the plot is built one layer at a time
    p = ggplot(data_subset, aes('cutoff_value', 'f1'))
    p = p + geom_point()
    p = p + labs(x = 'Cutoff Value', y = 'F1 Score', title = f'Story {story_id}: F1 Scores')
    p = p + scale_x_continuous(breaks = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90])

    p.save(f'images/f1_plots/story_{story_id}_f1_cutoff_plot.png')

In [None]:
## Generating the ROC graph
# One scatter plot of recall (true positive rate) against false positive rate is saved per story.
# The limits are set on the same scale as the breaks: adding xlim()/ylim() after
# scale_x_continuous()/scale_y_continuous() adds a second scale for the axis, which
# replaces the first one and drops its breaks.

story_ids = results['story_id'].unique()
rate_breaks = [0, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1]
for story_id in story_ids:
    # rows of the current story only
    idx = results['story_id'] == story_id
    data_subset = results[idx]
    p = (ggplot(data_subset, aes('fpr', 'recall')) + geom_point() + 
         labs(x = 'False Positive Rate', y = 'True Positive Rate (Recall)', title = f'Story {story_id}: ROC Curve') +
         scale_x_continuous(breaks = rate_breaks, limits = (0, 1)) +
         scale_y_continuous(breaks = rate_breaks, limits = (0, 1)))
    p.save(f'images/roc_plots/story_{story_id}_roc_curve_plot.png')

## Calculating AUC Values

In [106]:
from sklearn import metrics

In [114]:
# Compute one AUC value per story from that story's (fpr, recall) points.
# story_ids must already be defined by one of the plotting cells.
story_aucs = []
for story_id in story_ids:
    # rows of the current story only, in the order they appear in results
    idx = results['story_id'] == story_id
    data_subset = results[idx]
    
    fpr = data_subset['fpr']
    tpr = data_subset['recall']
    
    # metrics.auc integrates the given points with the trapezoidal rule and needs fpr to be
    # monotonic (increasing or decreasing) in the order given, otherwise it raises a ValueError.
    # NOTE(review): no (0, 0) or (1, 1) end points are added, so the area presumably only covers
    # the range of fpr values observed for the story - confirm this is intended before comparing stories.
    auc = metrics.auc(fpr, tpr)
    
    result = {'story_id': story_id, 'auc': auc}
    story_aucs.append(result)

In [115]:
story_aucs

[{'story_id': 0, 'auc': 0.9254143646408839},
 {'story_id': 1, 'auc': 0.9442001099505223},
 {'story_id': 2, 'auc': 0.9966620932010823},
 {'story_id': 3, 'auc': 0.9923944892206297},
 {'story_id': 4, 'auc': 0.9749873900516143},
 {'story_id': 5, 'auc': 0.9921353035926885},
 {'story_id': 8, 'auc': 0.9787450569899977},
 {'story_id': 9, 'auc': 0.7500304484353557},
 {'story_id': 11, 'auc': 0.9988731310466141},
 {'story_id': 16, 'auc': 0.7972152885017063},
 {'story_id': 17, 'auc': 0.9802011687206591},
 {'story_id': 19, 'auc': 0.9961977186311787}]

In [123]:
# Write the AUC value of every story to a CSV file.
# - newline='' is what the csv module expects for files it writes to (avoids blank rows on Windows)
# - the with-statement closes the file even if a row fails to write
# - DictWriter places each value under its own column name instead of relying on the dictionary order
fieldnames = ['story_id', 'auc']

with open('new_algorithm_auc_values.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(story_aucs)