In [1]:
import pandas as pd
import json
import requests
import csv
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  
import seaborn as sns
import time
import sys
import psycopg2
import datetime
import pytz
sys.path.insert(0, "/Users/gjq527/Box Sync/Documents – YN13997/metadata/scripts/")
from generate_and_plot_baseline_metrics import *
sys.path.insert(0, "/Users/gjq527/Box Sync/Documents – YN13997/troubleshooting/datasets_containing_preserved_specimens/")
from find_preserved_specimen_collection_and_datasets import *

In [3]:
# Fetch the per-dataset summary metadata with get_metadata_from_API and keep a
# tab-separated snapshot of it on disk.
# NOTE(review): the original note on this cell says datasets from PLAZI, GEO-TAG
# and PANGEA are left out; that exclusion is presumably applied inside
# get_metadata_from_API (the meaning of None, None, 900 is not visible here) — confirm.
summary_metadata = get_metadata_from_API(None, None, 900)
summary_metadata.to_csv(
    path_or_buf="../summary_baseline_metrics_metadata_2020-01-26.txt",
    sep="\t",
)
# Disabled alternatives, kept for reference: write the snapshot under a later
# date, or reload an existing snapshot (indexed by UUID) instead of calling the API.
# summary_metadata.to_csv("../summary_baseline_metrics_metadata_2020-03-04.txt", sep="\t")
# summary_metadata = pd.read_table("../summary_baseline_metrics_metadata.txt")
# summary_metadata = summary_metadata.set_index("UUID")

In [4]:
# ---- Split the datasets by type ----
# Rows whose "type" column equals "CHECKLIST" go to one frame, all remaining
# rows (including rows with a missing type) go to the other.
checklist_summary_metadata = summary_metadata[
    summary_metadata["type"] == "CHECKLIST"
]
other_summary_metadata = summary_metadata[
    summary_metadata["type"] != "CHECKLIST"
]

## Quality scoring for multiple datasets

In [5]:
# Registry query: for every dataset, join its publishing organization and that
# organization's endorsing node, returning the dataset key and creation date
# together with the node title, continent and country.
query = """SELECT d.key, d.created, node.title, node.continent, node.country
           FROM dataset d JOIN organization o ON d.publishing_organization_key = o.key
           JOIN node ON o.endorsing_node_key = node.key;"""

# NOTE(review): user and password are blank in this notebook; they presumably
# have to be filled in (ideally from the environment) before running — confirm.
conn = psycopg2.connect(dbname='prod_b_registry', 
                        user='', 
                        host='pg1.gbif.org', 
                        password='')
# try/finally guarantees the cursor and the connection are released even when
# the query or the fetch raises, so a failed run does not leave a session open.
try:
    cur = conn.cursor()
    try:
        # Queries the GBIF registry
        cur.execute(query)
        res_query = cur.fetchall()
    finally:
        cur.close()
finally:
    conn.close()

# One row per dataset, indexed by the dataset key (named "UUID" here).
dataset_per_publishing_country = pd.DataFrame(res_query, columns=['UUID',
                                                                  'created',
                                                                  'nodeTitle',
                                                                  'nodeContinent',
                                                                  'nodeCountry'])
dataset_per_publishing_country = dataset_per_publishing_country.set_index("UUID")

In [6]:
# Attach the registry columns side by side with the summary metadata, keeping
# only the index values present in both frames (inner join on the index).
# Re-running this cell appends the registry columns a second time.
summary_metadata = pd.concat(
    objs=[summary_metadata, dataset_per_publishing_country],
    axis="columns",
    join="inner",
)

In [7]:
# Flag the datasets that contain preserved specimens.
# number_of_dataset_with_preserved_specimen returns an iterable of mappings; the
# 'name' entry of each one is matched against the summary_metadata index
# (presumably the dataset UUID — confirm against the helper).
preserved_specimens_ds = number_of_dataset_with_preserved_specimen(1000, True)
preserved_specimens_ds_list = [
    entry['name'] for entry in preserved_specimens_ds
]
# Boolean array aligned with summary_metadata rows: True where the index value
# is one of the preserved-specimen dataset names.
maskPS = summary_metadata.index.isin(preserved_specimens_ds_list)

In [8]:
# Checklists outside the preserved-specimen set: overall score is the sum of
# the "what" and "who" scores divided by 5.
summary_metadata.loc[
    (summary_metadata["type"] == "CHECKLIST") & ~maskPS,
    "overall_score",
] = summary_metadata[["score_what", "score_who"]].sum(axis="columns") / 5

In [9]:
# Non-checklist datasets outside the preserved-specimen set: overall score is
# the sum of all five category scores divided by 10.
summary_metadata.loc[
    (summary_metadata["type"] != "CHECKLIST") & ~maskPS,
    "overall_score",
] = summary_metadata[
    ["score_what", "score_who", "score_where", "score_when", "score_how"]
].sum(axis="columns") / 10

In [10]:
# Datasets in the preserved-specimen set (whatever their type): overall score
# is the sum of the "what", "who", "where" and "when" scores divided by 8.
summary_metadata.loc[maskPS, "overall_score"] = summary_metadata[
    ["score_what", "score_who", "score_where", "score_when"]
].sum(axis="columns") / 8

In [11]:
def make_stats(summary_metadata, groupby, threshold, year, mask):
    """Summarise, per group, how many datasets score below a quality threshold.

    Rows selected by ``mask`` are reported as published in ``year``; all other
    rows are reported as published before ``year``.

    Parameters
    ----------
    summary_metadata : pandas.DataFrame
        Must contain an "overall_score" column and the ``groupby`` column.
        A boolean "score_below_thresh" column is added to (or overwritten on)
        this frame as a side effect.
    groupby : str
        Name of the column to group on.
    threshold : float
        Rows with ``overall_score < threshold`` count as insufficient.
    year : int or str
        Only used to build the column labels of the result.
    mask : boolean Series/array aligned with ``summary_metadata``
        Selects the rows counted in the "published in <year>" columns.

    Returns
    -------
    pandas.DataFrame
        Four columns: the two dataset counts (rows with a non-null
        overall_score) and the two percentages of rows below the threshold.

    NOTE(review): the result index is taken from the first column assigned,
    so groups without any row selected by ``mask`` do not appear in the
    output — confirm this is intended.
    """
    year_label = str(year)

    # Labels of the output columns.
    count_in_year_col = "Number of dataset published in " + year_label
    count_before_col = "Number of dataset published BEFORE " + year_label
    pct_in_year_col = "Pct of " + year_label + " datasets with insufficient metadata"
    pct_before_col = (
        "Pct of datasets published BEFORE " + year_label + " with insufficient metadata"
    )
    flag_col = "score_below_thresh"

    # Mark every row whose overall score is under the threshold.
    summary_metadata[flag_col] = summary_metadata["overall_score"] < threshold

    # Group the two populations once and reuse the groupers below.
    grouped_in_year = summary_metadata[mask].groupby([groupby])
    grouped_before = summary_metadata[~mask].groupby([groupby])

    stats_per_groupby = pd.DataFrame(
        columns=[count_in_year_col, count_before_col, pct_in_year_col, pct_before_col]
    )
    stats_per_groupby[count_in_year_col] = grouped_in_year["overall_score"].count()
    stats_per_groupby[count_before_col] = grouped_before["overall_score"].count()
    # Percentages use the counts stored above as denominators.
    stats_per_groupby[pct_in_year_col] = (
        grouped_in_year[flag_col].sum() * 100 / stats_per_groupby[count_in_year_col]
    )
    stats_per_groupby[pct_before_col] = (
        grouped_before[flag_col].sum() * 100 / stats_per_groupby[count_before_col]
    )
    return stats_per_groupby

In [12]:
# Reporting parameters: the year under review and the overall-score threshold
# below which a dataset's metadata is counted as insufficient.
year = 2020
threshold = 0.6

# Keep only the datasets created before 1 January of year+1 (UTC).
# The explicit .copy() makes the filtered frame independent of the original, so
# that make_stats can add its "score_below_thresh" column without pandas
# raising SettingWithCopyWarning.
summary_metadata = summary_metadata[pd.to_datetime(summary_metadata.created, utc=True) < datetime.datetime(year=year+1, month=1, day=1, tzinfo = pytz.UTC)].copy()

# True for datasets created on or after 1 January of `year` (UTC). The name
# mask2018 does not match the year used here; it is kept unchanged in case
# other cells refer to it.
mask2018 = (pd.to_datetime(summary_metadata.created, utc=True) >= datetime.datetime(year=year, month=1, day=1, tzinfo = pytz.UTC))

# Same statistics at two levels of aggregation: node continent and node title.
stats_per_continent = make_stats(summary_metadata, "nodeContinent", threshold, year, mask2018)
stats_per_node = make_stats(summary_metadata, "nodeTitle", threshold, year, mask2018)

In [13]:
# Display the per-continent statistics table as the cell output.
stats_per_continent

Unnamed: 0_level_0,Number of dataset published in 2020,Number of dataset published BEFORE 2020,Pct of 2020 datasets with insufficient metadata,Pct of datasets published BEFORE 2020 with insufficient metadata
nodeContinent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AFRICA,95,576,0.0,4.861111
ASIA,103,708,52.427184,69.067797
EUROPE,4639,42840,62.9446,42.215219
NORTH_AMERICA,281,1827,24.55516,18.062397
OCEANIA,28,615,42.857143,47.479675
SOUTH_AMERICA,442,1795,9.728507,4.512535


In [14]:
# Export both statistics tables to one workbook, one sheet per table.
# The context manager saves and closes the file on exit (ExcelWriter.save() no
# longer exists in pandas 2.x), and sheet_name is passed by keyword because
# positional use is deprecated.
# NOTE(review): 'ENDOSING NODE' looks like a typo for 'ENDORSING NODE'; the sheet
# name is left unchanged in case something reads the sheet by name — confirm.
with pd.ExcelWriter('metadata_quality_for_2020.xlsx') as writer:
    stats_per_continent.to_excel(writer, sheet_name='CONTINENT')
    stats_per_node.to_excel(writer, sheet_name='ENDOSING NODE')

In [15]:
# Save the current summary_metadata table as a tab-separated file.
# NOTE(review): the file name says "2020-only", but whether the table holds only
# 2020 datasets depends on the filtering done in earlier cells — confirm.
summary_metadata.to_csv(
    path_or_buf="../summary_baseline_metrics_metadata_2020-01-26-2020-only-with-publisher.txt",
    sep="\t",
)