In [None]:
!pip install -r requirements.txt


In [None]:
from google.cloud import secretmanager

from faunadb import query as q
from faunadb.objects import Ref
from faunadb.client import FaunaClient

def get_client():
    """Build a FaunaClient authenticated with a secret from Google Secret Manager.

    Reads the latest version of the ``fauna_deepcite_db`` secret in project
    ``deepcite-306405``, decodes its payload as UTF-8 and uses it as the
    ``secret`` of a client for the ``db.us.fauna.com`` domain.

    Returns:
        FaunaClient: client configured with the retrieved secret.
    """
    print('grabbing secret')

    project_id = "deepcite-306405"
    secret_name = "fauna_deepcite_db"
    version_path = f"projects/{project_id}/secrets/{secret_name}/versions/latest"

    secret_manager = secretmanager.SecretManagerServiceClient()
    secret_version = secret_manager.access_secret_version({"name": version_path})

    return FaunaClient(
        secret=secret_version.payload.data.decode("UTF-8"),
        domain='db.us.fauna.com',
    )

# Notebook-global Fauna client; fauna_to_df() issues its queries through this name.
client = get_client()



In [None]:
from datetime import datetime
import pandas as pd
pd.options.display.max_colwidth = 100
import json

def fauna_to_df(collection_name, size = 100000):
  """Load the documents of a Fauna collection into a DataFrame, newest first.

  Args:
    collection_name: name of the Fauna collection to read.
    size: page size passed to ``q.paginate``; a single page is requested, so
      at most ``size`` documents are returned.

  Returns:
    DataFrame with one row per document, built from each document's ``data``
    field, plus a ``created_at`` column derived from the document's ``ts``.
    Rows are sorted by ``created_at`` descending and re-indexed from 0.
  """
  page = client.query(
    q.map_(
      lambda ref: q.get(ref),
      q.paginate(q.documents(q.collection(collection_name)), size=size)
    )
  )
  documents = page['data']

  frame = pd.DataFrame.from_records([document['data'] for document in documents])

  # ``ts`` is divided by 1e6 (presumably microseconds since the epoch -- confirm).
  # fromtimestamp() without a tz returns naive local-time datetimes.
  # TODO: decide how timezones should be handled.
  frame['created_at'] = [
    datetime.fromtimestamp(document['ts']/1000000.0) for document in documents
  ]

  newest_first = frame.sort_values(by=['created_at'], ascending=False)
  return newest_first.reset_index(drop=True)
# Load the 'deepcite_call' collection from Fauna.
call_df_fauna = fauna_to_df('deepcite_call')
# Expand each `response` value into its own columns (via pd.Series) and index the frame by `id`.
call_df_fauna = pd.concat([call_df_fauna.drop(['response'], axis=1), call_df_fauna['response'].apply(pd.Series)], axis=1).set_index('id')
call_df_fauna.head(2)

In [None]:
import psycopg2
import pandas as pd
pd.options.display.max_colwidth = 100
import json

# Read the 'gcp' connection settings from database_config.json.
with open('database_config.json') as json_file:
    db_config = json.load(json_file)['gcp']
# `conn` is reused by the later read_sql_query cells, so it is intentionally left open here.
conn = psycopg2.connect(host=db_config['host'], user=db_config['user'], password=db_config['password'], database=db_config['database'], port=db_config['port'])
# Fetch every row of "deepcite_call", newest first.
sql = 'SELECT * FROM "deepcite_call" ORDER by "created_at" DESC;'
call_df = pd.read_sql_query(sql, conn)
# Expand each `response` value into its own columns (via pd.Series) and index the frame by `id`.
call_df = pd.concat([call_df.drop(['response'], axis=1), call_df['response'].apply(pd.Series)], axis=1).set_index('id')
call_df.head(2)


In [None]:
# Load the 'deepcite_source' collection from Fauna and preview the two newest rows.
labels_df_fauna = fauna_to_df('deepcite_source')
labels_df_fauna.head(2)

In [None]:
# Fetch every row of "source_label" over the shared `conn`, newest first.
sql = 'SELECT * FROM "source_label" ORDER by "created_at" DESC;'
labels_df = pd.read_sql_query(sql, conn)
labels_df.head(2)

In [None]:
# Load the 'deepcite_retrieval' collection from Fauna and preview the two newest rows.
retrieval_df_fauna = fauna_to_df('deepcite_retrieval')
retrieval_df_fauna.head(2)

In [None]:
# Fetch every row of "deepcite_retrieval" over the shared `conn`, newest first.
sql = 'SELECT * FROM "deepcite_retrieval" ORDER by "created_at" DESC;'
retrieval_df = pd.read_sql_query(sql, conn)
retrieval_df.head(2)

# Grab all unique submissions with labels

In [None]:
# It is not yet known how reliable every user's labels are, so only labels
# from an allowlist of user IDs are kept.
allowed_ids = ['2865b5b498575e748eb26c298eae56688afc9e4045896c8da76ce1931fe0']
allowed_labels_df = labels_df[labels_df.user_id.isin(allowed_ids)]

In [None]:
def check_most_recent_redacted(group):
    """Decide whether a group of labels should be kept.

    Args:
        group: DataFrame of labels with ``created_at`` and ``redact`` columns.

    Returns:
        bool: ``False`` when the label with the latest ``created_at`` has a
        truthy ``redact`` value (its most recent label was redacted),
        ``True`` otherwise.
    """
    newest_first = group.sort_values(['created_at'], ascending=False)
    latest_label = newest_first.iloc[0]
    # Keep the group only when its most recent label is NOT redacted.
    return not latest_label['redact']


# Group the allowlisted labels per (base_id, source_id) pair.
grouped = allowed_labels_df.groupby(['base_id', 'source_id'])

# Keep only the pairs for which check_most_recent_redacted() returns True.
filtered_labels_df = grouped.filter(check_most_recent_redacted)

# The delta counts (base_id, source_id) groups removed by the filter, not individual label rows.
filtered_size_delta = len(allowed_labels_df.groupby(['base_id', 'source_id'])) - len(filtered_labels_df.groupby(['base_id', 'source_id']))
print(f'Filtering reduced the size of the dataframe by: {filtered_size_delta} labels')
# One set of remaining source_id values per base_id (a Series named 'source_id').
grouped_labels_df = filtered_labels_df.groupby('base_id')['source_id'].apply(set)




In [None]:
# Inner-join calls (matched on call_df's index) with their label sets (matched on base_id),
# so only calls that have at least one remaining label are kept.
labeled_call = pd.merge(call_df, grouped_labels_df, left_index=True, right_on='base_id', how='inner')
print(len(labeled_call))
labeled_call.head(2)

In [None]:
def grab_submission(row):
    """Extract the submitted link/claim and the labeled sources from one call row.

    Args:
        row: object exposing ``results`` and ``source_id``. ``results`` is
            either a list of result dicts, whose first entry supplies the
            link and claim, or a single result dict. ``source_id`` is an
            iterable of labeled ``citeID`` values.

    Returns:
        pd.Series: always three values ``[link, claim, sources]`` so every row
        yields the same columns. ``sources`` is a tuple holding, per label,
        the list of ``(source, link)`` pairs of the results whose ``citeID``
        equals that label. When ``results`` is a single dict, ``sources`` is
        NaN; when ``results`` is empty, all three values are NaN.
    """
    missing = float("NaN")
    results = row.results

    # A lone dict cannot be matched against labels, so only its link and
    # claim are reported.
    if isinstance(results, dict):
        return pd.Series([results['link'], results['source'], missing])

    # Nothing to extract from an empty result list.
    if not results:
        return pd.Series([missing, missing, missing])

    submission = results[0]
    sources = tuple(
        [(result['source'], result['link']) for result in results if result['citeID'] == label]
        for label in row.source_id
    )
    return pd.Series([submission['link'], submission['source'], sources])

nan_value = float("NaN")
# Build one (link, claim, sources) row per labeled call that has results.
submissions = labeled_call.dropna(subset=['results']).apply(grab_submission, axis=1)
submissions.columns = ['link', 'claim', 'sources']
# De-duplicate on (link, claim), then turn empty strings into NaN so dropna() removes incomplete rows.
unique_submissions = submissions.drop_duplicates(subset=['link', 'claim']).replace('', nan_value).dropna()
# Number of rows removed by the de-duplication and dropna above.
print(len(submissions)-len(unique_submissions))

unique_submissions.head()


# Rerun submissions against local model

In [None]:
import requests
import sys
from datetime import date
import time

def current_result(row):
    """Re-run one submission against the deep_cite API on 127.0.0.1:8000.

    Args:
        row: pandas Series whose first value is the submission link and whose
            second value is the claim text (read by position).

    Returns:
        str: the raw response body when the request completes.
        dict: ``{"error": "timeout"}`` when ``requests`` raises while sending
        the request or receiving the response.
    """
    # Fixed 3-second pause before every request (presumably to avoid
    # overloading the local service -- confirm).
    time.sleep(3)

    # .iloc keeps the positional meaning of the former row[0] / row[1];
    # integer keys on a string-labelled Series are deprecated in pandas.
    link = row.iloc[0]
    claim = row.iloc[1]
    print(link)

    try:
        response = requests.post(
            url='http://127.0.0.1:8000/api/v1/deep_cite',
            json={"claim": claim, "link": link},
        )
    except requests.exceptions.RequestException:
        # Only request failures are treated as a timeout; KeyboardInterrupt
        # and programming errors propagate instead of being swallowed.
        failure = '{"error": "timeout"}'
        print(failure)
        return json.loads(failure)
    return response.text

# Call the local model once per unique submission (current_result sleeps 3 seconds per row).
results = unique_submissions.apply(current_result, axis=1)
unique_submissions['new_results'] = results
# Persist the run to '<YYYY_MM_DD>_base_model.csv' in the working directory.
unique_submissions.to_csv(f'{date.today().strftime("%Y_%m_%d")}_base_model.csv')

In [None]:
# Preview the responses collected above; the commented-out line below is an
# unused alternative that flattens them with json_normalize.
# pd.json_normalize(unique_submissions['new_results'].dropna().apply(lambda x:eval(x)))
unique_submissions['new_results']

## Or grab most recent run

In [None]:
import pandas as pd
from datetime import date

import glob
import os

import ast
import json


def _parse_new_result(raw):
    """Parse one stored ``new_results`` cell into a Python object.

    Completed runs are stored as the response text returned by the API, while
    failed runs are stored as the ``repr`` of a Python dict, so JSON is tried
    first and a Python literal second. Unlike ``eval``, neither parser
    executes the stored text, and JSON ``true``/``false``/``null`` are
    accepted.
    """
    try:
        return json.loads(raw)
    except ValueError:
        return ast.literal_eval(raw)


# Every CSV in the working directory is a candidate; the one with the newest
# ctime is loaded.
list_of_files = glob.glob('./*.csv')
if not list_of_files:
    raise FileNotFoundError('No CSV file found in the working directory; rerun the submissions first.')
latest_file = max(list_of_files, key=os.path.getctime)

rerun_df = pd.read_csv(latest_file)
# Flatten the parsed responses into columns suffixed with `_new`.
normalized_new = pd.json_normalize(rerun_df['new_results'].apply(_parse_new_result)).add_suffix('_new')
rerun_df = pd.concat([rerun_df.drop(['new_results'], axis=1), normalized_new], axis=1)
rerun_df = rerun_df.set_index('base_id')
rerun_df.head(5)

# unique_submissions

In [None]:

# Add the original `results` and `error` from call_df as `results_old` / `error_old`
# columns, joined on the index.
compare_df = rerun_df.join(call_df[['results','error']].add_suffix('_old'))
compare_df.head(2)


# See differences in results

In [None]:
def compare_labels(row):
    '''
    Sum how far each labeled source moved between the old and the new ranking.

    Both result lists are ranked by descending ``score``. For every label the
    delta is ``old_rank - new_rank``, so a higher total means the labeled
    sources rank better in the new results.

    Args:
        row: mapping or Series with ``error_new``, ``results_old``,
            ``results_new`` and ``sources``. ``sources`` is a sequence of
            labels or its string ``repr`` (as read back from CSV). Each label
            is a sequence of ``(source, link)`` pairs; only the first pair is
            looked up.

    Returns:
        The summed ranking improvement, or NaN when the new run timed out or
        either result list is missing. A label that is empty or cannot be
        found in both rankings contributes -0.01 instead of a rank delta.
    '''
    import ast  # local import: only used here to parse labels stored as text

    if row['error_new'] == 'timeout':
        print('timed out for new result')
        return float('Nan')

    # A missing result list shows up as None or a float NaN and cannot be ranked.
    for results in (row['results_old'], row['results_new']):
        if results is None or isinstance(results, float):
            print('missing old or new results')
            return float('Nan')

    sorted_old = sorted(row['results_old'], key=lambda k: k['score'], reverse=True)
    sorted_new = sorted(row['results_new'], key=lambda k: k['score'], reverse=True)

    claim_link_old = [(node['source'], node['link']) for node in sorted_old]
    claim_link_new = [(node['source'], node['link']) for node in sorted_new]

    labels = row['sources']
    if isinstance(labels, str):
        # literal_eval parses the stored repr without executing it, unlike eval.
        labels = ast.literal_eval(labels)

    total_ranking_improvement = 0
    for label in labels:
        try:
            target = label[0]
            ranking_delta = claim_link_old.index(target) - claim_link_new.index(target)
        except (ValueError, IndexError):
            # IndexError: the label has no pairs; ValueError: the pair is
            # absent from one of the rankings.
            print('One of the elements wasn\'t found')
            ranking_delta = -0.01
        total_ranking_improvement += ranking_delta

    return total_ranking_improvement

# Score every submission, then show only those whose total ranking delta is non-zero.
scores = compare_df.apply(compare_labels, axis=1)
scores[scores != 0]
# compare_df.iloc[0].results_old[0]
# compare_df.iloc[0].sources

In [None]:
# Inspect one submission. A cell only shows its last expression automatically,
# so both values are passed to display() explicitly.
import ast

example_id = 'e54912df-22e4-42e6-8ad4-75b6c487881c'
display(compare_df.loc[example_id, 'results_new'])
# literal_eval parses the stored repr without executing it, unlike eval.
display(ast.literal_eval(compare_df.loc[example_id, 'sources']))


In [None]:
import ast
import math

def print_results(results):
    """Print each entry as two lines followed by a blank line.

    Args:
        results: iterable of indexable entries. The first element of each
            entry is printed truncated to 150 characters, the second element
            is printed in full.
    """
    for entry in results:
        print(entry[0][:150], entry[1], '', sep='\n')

def matched_results(results_old, error_old, results_new, error_new):
    """Classify how the new results for a submission differ from the old ones.

    Both result lists are ranked by descending ``score`` and reduced to
    ``(link, source)`` pairs before being compared.

    Args:
        results_old: list of result dicts from the original run.
        error_old: error value from the original run.
        results_new: list of result dicts from the re-run.
        error_new: error value from the re-run.

    Returns:
        str: the first condition that applies, checked in this order:
        ``'timed out for new result'``, ``'missing results'`` (either list is
        None/NaN), ``'not long enough'`` (two or fewer results in total),
        ``'no match in first 3'`` (the first four ranked pairs differ),
        ``'length'``, ``'no match'``, ``'error message'``, ``'match'``.
    """
    def _is_missing(value):
        # None and float NaN both stand for "no value" in the joined frame.
        return value is None or (isinstance(value, float) and value != value)

    if error_new == 'timeout':
        return 'timed out for new result'

    # A missing result list cannot be sorted; report it instead of raising.
    if _is_missing(results_old) or _is_missing(results_new):
        return 'missing results'

    res1 = sorted(results_old, key=lambda k: k['score'], reverse=True)
    res2 = sorted(results_new, key=lambda k: k['score'], reverse=True)

    if len(res1) + len(res2) <= 2:
        return 'not long enough'

    res1 = [(res['link'], res['source']) for res in res1]
    res2 = [(res['link'], res['source']) for res in res2]

    if res1[:4] != res2[:4]:
        # Show the first (unsorted) old result's source as a header, then
        # ranked entries 1..3 of each list for a side-by-side look.
        print(results_old[0]['source'], '\n')
        print_results(res1[1:4])
        print('======================')
        print_results(res2[1:4])
        print()
        print()
        print()
        return 'no match in first 3'

    if len(res1) != len(res2):
        print(len(res1) - len(res2))
        return 'length'

    if res1 != res2:
        return 'no match'

    # NaN never compares equal to itself, so two missing errors must be
    # treated as equal explicitly rather than reported as a difference.
    both_missing = _is_missing(error_old) and _is_missing(error_new)
    if error_old != error_new and not both_missing:
        print(error_old)
        print(error_new)
        print()
        return 'error message'

    return 'match'

def compare(row):
    """Classify one comparison row by passing its old/new columns to matched_results."""
    old_results, old_error = row['results_old'], row['error_old']
    new_results, new_error = row['results_new'], row['error_new']
    return matched_results(old_results, old_error, new_results, new_error)

# Count how many submissions fall into each category returned by matched_results().
compare_df.apply(compare, axis=1).value_counts()

