In [1]:
import pandas as pd
from utils.read_exp_utils import read_experiment_result_db
import numpy as np
import matplotlib.pyplot as plt
plt.switch_backend('agg')#plot without using an X-server, use the Agg backend instead.
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.metrics import * 
import itertools
import os

from visualization.visualize import ParameterResult
import psycopg2
import sys

In [2]:
def parse(all_exp, i):
    test = all_exp.loc[i]

    y_scores = test["y_scores"]
    y_true = test["y_true"]
    space_time = test["space_time"]

    df_pr = pd.DataFrame({'space': space_time['space'], 'time': space_time['time'], 'y_scores': y_scores, 'y_true':y_true})
    df_pr = df_pr.sort_values(by=['time', 'y_scores'], ascending=False)
    
    return df_pr

In [3]:
def tp_count(df, num):
    #take the first num rows
    df = df.head(num)
    #return the sum of trues in those rows
    return df['y_true'].sum()

In [4]:
def pr_at_k(df, k, p, count):
    #total tp calculated by counting the number of tp when looping at the top i of each time period
    #total fp calculated by num of locations assigned 1 - all the tp
    tp = df.groupby('time').pipe(tp_count, k)
    fp = (count*k) - tp

    precision = tp/(tp+fp)
    recall = tp/p
    
    return precision, recall

In [5]:
def get_pr_for_val_set(df):
    '''needs to have 'space', time, y_scores, and 'y_true'''
    
    #find how many distinct segments there are 
    hecto_universe_count = df['space'].nunique()
    
    #get raw total number of positive and negatives
    p = df['y_true'].sum()
    
    #create empty arrays to fill
    precision_array = []
    recall_array = []

    #for loop through number of segments classified as accident
    for i in range(1, hecto_universe_count+1):

        precision, recall = pr_at_k(df, i, p, hecto_universe_count)

        precision_array.append(precision)
        recall_array.append(recall)
    
    return precision_array, recall_array

In [6]:
def get_pr(experiment_num, model_type, parameter_id):
    all_exp = read_experiment_result_db(experiment_num, model_type, parameter_id)

    all_precisions = []
    all_recalls = []

    #for number of dfs
    for i in range(4, all_exp.shape[0]):

        #get the new df
        df = parse(all_exp, i)

        #get the new pr vectors
        precision_array, recall_array = get_pr_for_val_set(df)

        #save vectors to vec of vec
        all_precisions.append(precision_array)
        all_recalls.append(recall_array)

    #find avg of vec of vecs
    final_precision = np.average(all_precisions, axis = 0)
    final_recall = np.average(all_recalls, axis = 0)
    
    return final_precision, final_recall


In [7]:
p, r = get_pr(44, 'LR', 'C_0.01_penalty_l1_random_state_42')

In [8]:
p[7]

0.05723443223443223

In [9]:
r[7]

0.10877864558665203