# Examining the New Algorithm with Different Vector Dimensions

This notebook is an extension of the "examing_new_algorithm_with_cutoffs" notebook. It uses similar functions, modified to work with the new file locations and data.

### Required Functions for Analysis

In [11]:
# functions and imports

import pandas as pd
import json

def TotalStoryAnalysis(story_id, cc_index_in_file_values, dimension_values):
    '''
    Scores one story at every vector dimension and records the results.

    For each dimension the matching JSON file is analysed with SingleFileAnalysis
    and one row of scores is appended to the global saved_data list. Nothing is returned.

    Inputs:
        story_id: Unique integer identifier of the desired story
        cc_index_in_file_values: A list holding, for each dimension, the index of the
            vector in that dimension's file that is used to represent the story.
            It is read position-by-position alongside dimension_values.
        dimension_values: A list of integers that represent the dimensions of the vectors
    '''

    for position, dimension in enumerate(dimension_values):
        # the two lists are paired by position
        cc_index_in_file = cc_index_in_file_values[position]

        json_path = f'data/new_algorithm_files/story_{story_id}/story_{story_id}_dimension_{dimension}.json'
        precision, recall, fpr, f1 = SingleFileAnalysis(json_path, story_id, cc_index_in_file)

        # saved_data is a global list defined in the notebook
        saved_data.append({
            'story_id': story_id,
            'dimension_value': dimension,
            'precision': precision,
            'recall': recall,
            'fpr': fpr,
            'f1': f1,
        })

def SingleFileAnalysis(file_path, story_id, cc_index_in_file):
    '''
    Scores a single new-algorithm JSON file against the exemplar data for a story.

    Inputs:
        file_path: The path to the file for analysis
        story_id: Unique integer number for the story
        cc_index_in_file: Integer position, inside the file's 'story_vectors' list,
            of the entry that is treated as the story

    Outputs:
        precision
        recall
        fpr
        f1
    '''

    story_data = loadJsonFile(file_path)
    exemplar_links = GetExamplarLinks(story_id)

    # connected components of the entry chosen to represent the story
    chosen_entry = story_data['story_vectors'][cc_index_in_file]
    story_links = chosen_entry['con_comps']

    # true negatives come from every other entry in the file
    tn = defineTrueNegatives(story_data, exemplar_links, cc_index_in_file)

    bot_graphs = getFormattedBotGraphs(story_links)

    # the list of failed graphs is returned as well but is not needed here
    tp, fp, fn, _ = defineConfusionMatrix(bot_graphs, exemplar_links, want_failed_bots = True)

    return getScores(tp, tn, fp, fn)

def loadJsonFile(file_path, encoding = 'UTF-8'):
    '''
    Function to load in the JSON data

    Inputs:
        file_path: The path to the file
        encoding: The encoding used to read the file (defaults to UTF-8)

    returns:
        story_data: The contents of the provided json file

    '''

    # pass the caller's encoding through (it used to be ignored) and let the
    # context manager close the file even when parsing fails
    with open(file_path, encoding = encoding) as file:
        story_data = json.load(file)

    return story_data

def GetExamplarLinks(story_id, file_path = 'data/exemplar_dataset.json'):
    '''
    This function loads in the exemplar dataset and returns the data for the given story by its ID

    Input:
        story_id: The unique number given to the story to identify it; it is used
            directly to index the loaded exemplar data
        file_path: Optional input that points to the exemplar data set file

    Returns:
        exemplar_links: The value stored under 'links' for the story. Other functions in
            this notebook read 'graph_link', 'cc_degree' and 'cc_index' from each item.
    '''

    # read as UTF-8, like loadJsonFile, so the result does not depend on the
    # platform default; the context manager closes the file even on a parse error
    with open(file_path, encoding = 'UTF-8') as file:
        exemplar = json.load(file)

    exemplar_links = exemplar[story_id]['links']

    return exemplar_links

def defineConfusionMatrix(bot_graphs, exemplar_links, want_failed_bots = False):
    '''
    Counts the True Positives (tp), False Positives (fp) and False Negatives (fn)
    of the algorithm output against the exemplar data.

    inputs:
        bot_graphs: A list of dictionaries, one per graph, with the keys 'graph_link',
            'cc_degree' and 'cc_index' (the format produced by getFormattedBotGraphs()).
        exemplar_links: A list of dictionaries with the same keys taken from the exemplar data set.
        want_failed_bots: When True, the graphs counted as false positives are returned too.

    returns:
        tp: an integer for the true positives
        fp: an integer for the false positives
        fn: an integer for the false negatives
        bot_failed_attempts: (only when want_failed_bots is True) the list of graphs
            that were counted as false positives

    '''
    # links present in the exemplar dataset, in exemplar order
    exemplar_uris = [entry['graph_link'] for entry in exemplar_links]

    tp = 0
    rejected = []

    for candidate in bot_graphs:
        uri = candidate['graph_link']
        is_match = False

        if uri in exemplar_uris:
            # first exemplar entry that carries the same link
            reference = exemplar_links[exemplar_uris.index(uri)]

            # the output degree is rounded to 2 decimals before comparing, and the
            # exemplar cc_index is compared after subtracting 1
            is_match = (
                round(candidate['cc_degree'], 2) == reference['cc_degree']
                and candidate['cc_index'] == reference['cc_index'] - 1
            )

        if is_match:
            tp += 1
        else:
            # either the link is unknown to the exemplar, or its degree/index differ
            rejected.append(candidate)

    # every rejected graph is one false positive
    fp = len(rejected)

    # exemplar links that never appear in the algorithm output are false negatives
    produced_uris = [candidate['graph_link'] for candidate in bot_graphs]
    fn = sum(1 for uri in exemplar_uris if uri not in produced_uris)

    if want_failed_bots == True:
        return tp, fp, fn, rejected

    return tp, fp, fn

    
def defineTrueNegatives(story_data, exemplar_links, cc_index_in_file):
    '''
    Counts the true negatives: graphs that belong to entries other than the chosen
    story entry and that do not match any exemplar graph.

    Inputs:
        story_data: The loaded JSON contents; its 'story_vectors' list is read here
        exemplar_links: List of exemplar dictionaries with 'graph_link', 'cc_degree' and 'cc_index'
        cc_index_in_file: The integer index of the entry in 'story_vectors' that is
            treated as the story (it is excluded from the count)

    Outputs:
        tn: The number of true negatives

    '''

    # links present in the exemplar dataset, in exemplar order
    exemplar_uris = [entry['graph_link'] for entry in exemplar_links]

    # every entry except the one chosen to represent the story
    all_vectors = story_data['story_vectors']
    other_entries = all_vectors[:cc_index_in_file] + all_vectors[cc_index_in_file+1:]

    tn = 0

    for entry in other_entries:
        for graph in getFormattedBotGraphs(entry['con_comps']):
            link = graph['graph_link']

            if link in exemplar_uris:
                reference = exemplar_links[exemplar_uris.index(link)]

                same_component = (
                    round(graph['cc_degree'], 2) == reference['cc_degree']
                    and graph['cc_index'] == reference['cc_index'] - 1
                )

                # an exact exemplar match outside the story is not a true negative
                if same_component:
                    continue

            # unknown link, or known link with a different degree/index
            tn += 1

    return tn

def getFormattedBotGraphs(unique_graphs):
    '''
    Converts graphs from the storygraph bot format to the exemplar format so the
    two can be compared directly.

    inputs:
        unique_graphs: A list of dictionaries in the storygraphbot format. The keys
            'graph_uri', 'avg_degree' and 'index' are read from each one.

    outputs:
        bot_graphs: A list of dictionaries with the keys 'graph_link', 'cc_degree' and 'cc_index'.
    '''

    def _to_exemplar_link(graph_uri):
        # swap the web.archive.org prefix for the storygraph.cs.odu.edu one and widen the hist value
        rewritten = graph_uri.replace(
            'https://web.archive.org/storygraph/graphs/usa/',
            'https://storygraph.cs.odu.edu/graphs/polar-media-consensus-graph/',
        )
        rewritten = rewritten.replace('hist=144', 'hist=1440')

        # characters 65-86 sit right after the 65-character ODU prefix
        # (presumably the '#t=...' fragment -- confirm against real URIs);
        # they are cut out and re-attached at the end with '&t=' instead of '#t='
        moved_part = rewritten[65:87]
        without_part = rewritten.replace(moved_part, '').replace('&cursor', '#cursor')

        return without_part + moved_part.replace('#t=', '&t=')

    return [
        {
            'graph_link': _to_exemplar_link(graph['graph_uri']),
            'cc_degree': graph['avg_degree'],
            'cc_index': graph['index'],
        }
        for graph in unique_graphs
    ]

def getScores(tp, tn, fp, fn):
    '''
    Calculates precision, recall, false positive rate and f1-score from the confusion matrix counts.

    tp = True Positive
    tn = True Negative
    fp = False Positive
    fn = False Negatives

    return: precision, recall, fpr, f1-score

    Every ratio is guarded against a zero denominator: precision, recall and f1
    fall back to 0, fpr falls back to 1.
    '''

    # precision is undefined when nothing was predicted positive; report 0 like
    # recall does instead of raising ZeroDivisionError
    if (tp + fp) != 0:
        precision = ((tp)/(tp + fp))
    else:
        precision = 0

    if (tp + fn) != 0:
        recall = ((tp)/(tp + fn))
    else:
        recall = 0

    # error handling for dividing by 0
    if (fp + tn) != 0:
        fpr = ((fp) / (fp + tn))
    else:
        # not confident that this is the best solution, but it seems like 1 would be the worst possible fpr
        fpr = 1

    if (precision + recall) != 0:
        f1 = 2 *((precision * recall) / (precision + recall))
    else:
        f1 = 0

    return precision, recall, fpr, f1

def findCCIndexInFile(file_path):
    '''
    Prints every index of the file's 'story_vectors' list followed by the titles
    stored at that index, so the right index for a story can be picked by eye.

    Input:
        file_path: Path to the JSON file
    '''

    contents = loadJsonFile(file_path)

    for position, story_vector in enumerate(contents['story_vectors']):
        print(position)
        print(story_vector['titles'])
        
def FindBestCCIndex(file_path, story_id):
    '''
    Tries every index of the file's 'story_vectors' list as the story and keeps
    the one with the highest f1 score.

    Inputs:
        file_path: Path to the JSON file
        story_id: Unique integer number for the story
    Outputs:
        best_cc_index: Index with the best f1; on ties the later index wins
        best_f1: The f1 score reached at that index

    '''

    candidate_count = len(loadJsonFile(file_path)['story_vectors'])

    best_cc_index, best_f1 = 0, 0

    for candidate in range(candidate_count):
        # only the f1 score is used to rank the candidates
        _, _, _, f1 = SingleFileAnalysis(file_path, story_id, candidate)

        if f1 >= best_f1:
            best_cc_index, best_f1 = candidate, f1

    return best_cc_index, best_f1

def GetCCIndexFileValuesList(story_id, dimension_values):
    '''
    Builds the cc_index_in_file_values list by finding the best index for each dimension.

    Inputs:
        story_id: Unique integer number for the story
        dimension_values: list of dimension values to be tested

    Output:
        cc_index_in_file_values: A list of indices that give the best f1 score for each dimension value
    '''

    # FindBestCCIndex returns (best index, best f1); only the index is kept
    return [
        FindBestCCIndex(f'data/new_algorithm_files/story_{story_id}/story_{story_id}_dimension_{dimension}.json', story_id)[0]
        for dimension in dimension_values
    ]

## Analyzing Different Dimension Values

In [2]:
# Global list of result rows; TotalStoryAnalysis appends one dict per story/dimension pair
saved_data = []

# Setup the dimension values to cycle through the files and pair with the indices
dimension_values = [10, 50, 100, 500, 1000, 1500, 2000]

### Story 0: Kim/Trump Summit

In [3]:
# Find the best-f1 index for each dimension of story 0, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(0, dimension_values)
TotalStoryAnalysis(0, cc_index_in_file_values, dimension_values)

### Story 1: Senate 2 year 2018 Spending Deal

In [5]:
# Find the best-f1 index for each dimension of story 1, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(1, dimension_values)
TotalStoryAnalysis(1, cc_index_in_file_values, dimension_values)

### Story 2: Scott Pruitt Resigns as EPA Chief

In [6]:
# Find the best-f1 index for each dimension of story 2, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(2, dimension_values)
TotalStoryAnalysis(2, cc_index_in_file_values, dimension_values)

### Story 3: Pittsburgh Synagogue Shooting

In [7]:
# Find the best-f1 index for each dimension of story 3, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(3, dimension_values)
TotalStoryAnalysis(3, cc_index_in_file_values, dimension_values)

### Story 4: Senate Votes to End Support for Yemen War

In [12]:
# Find the best-f1 index for each dimension of story 4, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(4, dimension_values)
TotalStoryAnalysis(4, cc_index_in_file_values, dimension_values)

### Story 5: George Bush Dies at 94

In [14]:
# Find the best-f1 index for each dimension of story 5, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(5, dimension_values)
TotalStoryAnalysis(5, cc_index_in_file_values, dimension_values)

### Story 8: Aurora Illinois Shooting

In [15]:
# Find the best-f1 index for each dimension of story 8, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(8, dimension_values)
TotalStoryAnalysis(8, cc_index_in_file_values, dimension_values)

### Story 9: AOC Town Hall Green New Deal

In [16]:
# Find the best-f1 index for each dimension of story 9, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(9, dimension_values)
TotalStoryAnalysis(9, cc_index_in_file_values, dimension_values)

### Story 11: John Lewis Has Pancreatic Cancer

In [17]:
# Find the best-f1 index for each dimension of story 11, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(11, dimension_values)
TotalStoryAnalysis(11, cc_index_in_file_values, dimension_values)

### Story 16: Trump Signs Covid Relief Package

In [18]:
# Find the best-f1 index for each dimension of story 16, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(16, dimension_values)
TotalStoryAnalysis(16, cc_index_in_file_values, dimension_values)

### Story 17: John Lewis Dies at 80

In [19]:
# Find the best-f1 index for each dimension of story 17, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(17, dimension_values)
TotalStoryAnalysis(17, cc_index_in_file_values, dimension_values)

### Story 19: ASAP Rocky Found Guilty in Sweden

In [20]:
# Find the best-f1 index for each dimension of story 19, then append its scores to saved_data
cc_index_in_file_values = GetCCIndexFileValuesList(19, dimension_values)
TotalStoryAnalysis(19, cc_index_in_file_values, dimension_values)

In [21]:
import csv

# Write every collected row of saved_data to a CSV file.
# DictWriter maps each value to its column by key, so the output no longer relies on
# the insertion order of the row dictionaries.
csv_columns = ['story_id', 'dimension_value', 'precision', 'recall', 'fpr', 'f1']

# newline = '' is what the csv module asks for when opening the file; without it
# extra blank lines can appear between rows on platforms that translate line endings.
# The with-block closes the file even if a row fails to write.
with open('new_algorithm_dimension_results.csv', 'w', newline = '') as file:
    writer = csv.DictWriter(file, fieldnames = csv_columns)
    writer.writeheader()
    writer.writerows(saved_data)

## Comparison

In [34]:
import numpy as np

def FormatDimensionTables(file_path, metric, output = False):
    '''
    Reshapes the results CSV into a table with one row per dimension value and one
    column per story for a single metric, plus the average of that metric across
    the stories. The table is saved to 'tables/new_algorithm_<metric>_by_dimension.csv'.

    Inputs:
        file_path: Path to the results CSV; the columns 'story_id', 'dimension_value'
            and the chosen metric are read from it
        metric: Name of the column to tabulate (for example 'f1', 'precision' or 'recall')
        output: When true, the table is also returned

    Outputs:
        new_results_table: The resulting DataFrame (only when output is true)

    Note: the scores of a dimension are placed in the story columns in file order,
    so the file is expected to list the stories in the same order for every dimension.
    '''
    # local import: only needed to make sure the output folder exists
    import os

    data = pd.read_csv(file_path)
    story_ids = data['story_id'].unique().tolist()
    new_results_table = pd.DataFrame(columns = (['dimension_value'] + story_ids + ['average']))

    # use the dimension values found in the file itself; the loop previously read the
    # notebook-level dimension_values list, which fails or mismatches outside the notebook run
    dimension_value = data['dimension_value'].unique().tolist()

    for dimension in dimension_value:

        # selecting the rows of the dataframe that belong to this dimension
        dimension_df = data[data['dimension_value'] == dimension]

        # getting the scores in a list
        scores = dimension_df[metric].tolist()
        avg = np.average(scores)
        row = [dimension] + scores + [avg]

        # appending results to the dataframe
        new_results_table.loc[len(new_results_table.index)] = row

    # to_csv does not create missing folders
    os.makedirs('tables', exist_ok = True)
    new_results_table.to_csv('tables/new_algorithm_' + metric + '_by_dimension.csv', index = False)

    if output:
        return new_results_table

In [None]:
# Build and save one table per metric; output is left at its default (False), so nothing is returned
FormatDimensionTables('new_algorithm_dimension_results.csv', 'f1')
FormatDimensionTables('new_algorithm_dimension_results.csv', 'precision')
FormatDimensionTables('new_algorithm_dimension_results.csv', 'recall')