In [None]:
!pip install -r requirements.txt


In [None]:
import psycopg2
import pandas as pd
import json

# Connection settings are read from a local JSON file rather than written in the notebook.
with open('database_config.json') as json_file:
    db_config = json.load(json_file)

conn = psycopg2.connect(
    host=db_config['host'],
    user=db_config['user'],
    password=db_config['password'],
    database=db_config['database'],
    port=db_config['port'],
)
sql = 'SELECT * FROM "deepcite_call" ORDER by "created_at" DESC;'
try:
    # Pull every logged call, newest first.
    mega_df = pd.read_sql_query(sql, conn)
finally:
    # `conn` is a notebook global, so without an explicit close the database
    # connection would stay open for as long as the kernel is alive.
    conn.close()

# Expand each `response` value into its own columns (one per key; presumably a
# JSON object -- confirm against the table schema) and index the frame by call id.
mega_df = pd.concat(
    [mega_df.drop(['response'], axis=1), mega_df['response'].apply(pd.Series)],
    axis=1,
).set_index('id')
# mega_df.head()


In [None]:
# Show the last 20 rows. The query sorts by created_at descending, so these are
# the oldest calls. This must be the final expression in the cell to be rendered.
mega_df.tail(20)

# Grab all unique submissions

In [None]:
def grab_submission(row):
    """Extract the (link, source) pair of the submission stored in ``row.results``.

    ``row.results`` is either a sequence whose first element is a mapping with
    ``'link'`` and ``'source'`` keys, or such a mapping itself.

    Args:
        row: Object exposing a ``results`` attribute (e.g. a DataFrame row).

    Returns:
        pd.Series: two values, ``[link, source]``.

    Raises:
        KeyError / TypeError: if neither shape provides ``'link'`` and ``'source'``.
    """
    results = row.results
    try:
        submission = results[0]
    except (KeyError, IndexError, TypeError):
        # Not a non-empty sequence: treat `results` itself as the submission mapping.
        # Only lookup failures are caught so KeyboardInterrupt etc. still propagate.
        submission = results
    return pd.Series([submission['link'], submission['source']])

# Build the table of distinct submissions: take every call that has results,
# pull out its (link, source) pair, de-duplicate, then discard any pair in which
# a field is an empty string (blanks are turned into NaN so dropna removes them).
nan_value = float("NaN")
submissions = (
    mega_df
    .dropna(subset=['results'])
    .apply(grab_submission, axis=1)
    .drop_duplicates()
    .replace('', nan_value)
    .dropna()
)
submissions


# Rerun submissions against local model

In [None]:
import requests
import sys
sys.path.insert(1, '../lambda')
from lambda_config import config

def current_result(row):
    """POST one submission to the configured endpoint and return the decoded reply.

    Args:
        row: A submissions row; ``row[0]`` is sent as the link and ``row[1]``
            as the claim.

    Returns:
        The JSON-decoded response body.
    """
    endpoint = config['ec2']['url']
    payload = {"claim": row[1], "link": row[0]}
    reply = requests.post(url=endpoint, json=payload)
    # Echo the response object so the HTTP status of each request is visible.
    print(reply)
    return json.loads(reply.text)


# One request per unique submission.
results = submissions.apply(current_result, axis=1)
results

In [None]:
# rerun_df = pd.concat([results.drop(['new_response'], axis=1), filtered['new_response'].apply(pd.Series)], axis=1)
# Flatten the rerun responses (presumably one dict per submission) into columns.
rerun_df = pd.json_normalize(results)
# rerun_df.to_csv('rerun_smaller_model_results.csv', index=False)
# NOTE(review): the assignment below immediately replaces the frame built from
# `results` with the contents of a previously saved CSV, so the fresh rerun is
# not what gets displayed or used afterwards. The only write-out (commented out
# above) targets 'rerun_smaller_model_results.csv', not 'rerun_results.csv' --
# confirm which file each model's results are meant to live in.
rerun_df = pd.read_csv('rerun_results.csv')
rerun_df.head()


In [None]:
# Load the saved rerun results from 'rerun_smaller_model_results.csv' and preview them.
rerun_small_df = pd.read_csv('rerun_smaller_model_results.csv')
rerun_small_df.head()


# See differences in results

In [None]:
import ast

def print_results(results):
    """Print each result pair on two lines followed by a blank line.

    Args:
        results: Iterable of indexable pairs. Element 0 is printed truncated
            to its first 150 characters; element 1 is printed in full.
    """
    for pair in results:
        truncated_first = pair[0][:150]
        print(truncated_first)
        print(pair[1])
        print()

def _is_missing(value):
    """Return True for None or a float NaN (what pandas yields for an empty CSV cell)."""
    return value is None or (isinstance(value, float) and value != value)


def matched_results(row1, row2):
    """Classify how two reruns of the same submission differ.

    Args:
        row1, row2: Rows exposing ``results`` (the string form of a list of
            dicts with ``'link'``, ``'source'`` and ``'score'`` keys) and ``error``.

    Returns:
        str: one of ``'not long enough'``, ``'no match in first 3'``,
        ``'length'``, ``'no match'``, ``'error message'`` or ``'match'``.
        Checks are applied in that order; the first that fires wins.
    """
    # Parse each serialized list once and reuse it below.
    raw1 = ast.literal_eval(row1.results)
    raw2 = ast.literal_eval(row2.results)

    ranked1 = sorted(raw1, key=lambda k: k['score'], reverse=True)
    ranked2 = sorted(raw2, key=lambda k: k['score'], reverse=True)

    if len(ranked1) + len(ranked2) <= 2:
        return 'not long enough'

    res1 = [(res['link'], res['source']) for res in ranked1]
    res2 = [(res['link'], res['source']) for res in ranked2]

    # Four entries are compared although the label says "first 3" and only
    # entries 1..3 are printed; entry 0 is presumably the submitted claim
    # itself -- TODO confirm.
    if res1[:4] != res2[:4]:
        # Header line: source of the first unsorted entry. Fall back to row2 when
        # row1 has no entries, so an empty list cannot raise IndexError here
        # (at least one list is non-empty once the length check above has passed).
        print((raw1 or raw2)[0]['source'], '\n')
        print_results(res1[1:4])
        print('======================')
        print_results(res2[1:4])
        print()
        print()
        print()
        return 'no match in first 3'

    if len(res1) != len(res2):
        print(len(res1) - len(res2))
        return 'length'

    if res1 != res2:
        return 'no match'

    # NaN != NaN is True, so two rows that both have no error would otherwise be
    # reported as an error mismatch; treat "both missing" as equal.
    err1, err2 = row1.error, row2.error
    both_missing = _is_missing(err1) and _is_missing(err2)
    if not both_missing and err1 != err2:
        print(err1)
        print(err2)
        print()
        return 'error message'

    return 'match'

# Compare the two reruns pairwise: the index label of each rerun_df row is used
# as the position of its counterpart in rerun_small_df.
matches = [
    matched_results(row1, rerun_small_df.iloc[index])
    for index, row1 in rerun_df.iterrows()
]
matches