# Landscaping Open Science Best Practices  

> Building a landscaping analysis of best practices around data and model sharing.

Note - this question is inherently driven by discussion and informal experience (as opposed to formal experimentation). So we would expect to 

## Preliminaries

In [38]:
from alhazen.apps.chat import  AlhazenAgentChatBot
from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.schema_sqla import *
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import MetadataExtractionTool, MetadataExtractionWithRAGTool 
from alhazen.toolkit import AlhazenToolkit
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database

from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import create_engine, exists, func
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml

In [2]:
skc = aliased(ScientificKnowledgeCollection)
skc_p = aliased(ScientificKnowledgeCollectionProvenance)
skc_hm = aliased(ScientificKnowledgeCollectionHasMembers)
ske = aliased(ScientificKnowledgeExpression)
ske_hr = aliased(ScientificKnowledgeExpressionHasRepresentation)
ski = aliased(ScientificKnowledgeItem)
ski_hp = aliased(ScientificKnowledgeItemHasPart)
skf = aliased(ScientificKnowledgeFragment)

n = aliased(Note)
skc_hn = aliased(ScientificKnowledgeCollectionHasNotes)
ske_hn = aliased(ScientificKnowledgeExpressionHasNotes)
ski_hn = aliased(ScientificKnowledgeItemHasNotes)
skf_hn = aliased(ScientificKnowledgeFragmentHasNotes)

Remember to set environmental variables for this code:

* `ALHAZEN_DB_NAME` - the name of the PostGresQL database you are storing information into
* `LOCAL_FILE_PATH` - the location on disk where you save temporary files, downloaded models or other data.   

In [3]:
os.environ['ALHAZEN_DB_NAME'] = 'open_science_best_practices'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])

if os.environ.get('ALHAZEN_DB_NAME') is None: 
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None: 
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']

In [41]:
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

100%|██████████| 315/315 [00:00<00:00, 3507.90it/s]


In [40]:
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

Database has been dropped successfully !!


In [42]:
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llm = get_langchain_chatmodel(model_type=MODEL_TYPE.Ollama, llm_name='mixtral:instruction')
cb = AlhazenAgentChatBot()

print('AVAILABLE TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)

AVAILABLE TOOLS
	AddCollectionFromEPMCTool
	DescribeCollectionCompositionTool
	DeleteCollectionTool
	RetrieveFullTextTool
	MetadataExtractionTool
	SimpleExtractionWithRAGTool
	PaperQAEmulationTool
	CheckExpressionTool


## Build CEIFNS database from queries

### Run a basic query to build up corpus. 

Use the following general query to generate a corpus to work with: `"best practices" AND ("data sharing" OR "model sharing")`, but frame it as a question to the Alhazen agent.


In [44]:

out = cb.agent_executor.invoke({'input':'Build a collection using the query string "best practices" AND ("data sharing" OR "model sharing") and set the collection_id to "best_practices_models_or_data"'},
                               config={'callbacks': [ConsoleCallbackHandler()]})


[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Build a collection using the query string \"best practices\" AND (\"data sharing\" OR \"model sharing\") and set the collection_id to \"best_practices_models_or_data\""
}


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": ""
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": ""
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad> > 4:chain:RunnableParallel<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": ""
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpa

100%|██████████| 8/8 [01:53<00:00, 14.19s/it]


 Returning 6626


100%|██████████| 6626/6626 [00:37<00:00, 177.47it/s]


[36;1m[1;3m[tool/end][0m [1m[1:chain:AgentExecutor > 9:tool:add_collection_from_epmc_query] [452.28s] Exiting Tool run with output:
[0m"{'report': 'We added a collection to the database called `Best Practices for Data and Model Sharing` containing 0 papers from this query: `"best practices" AND ("data sharing" OR "model sharing")`.'}"
[36;1m[1;3m{'report': 'We added a collection to the database called `Best Practices for Data and Model Sharing` containing 0 papers from this query: `"best practices" AND ("data sharing" OR "model sharing")`.'}[0m
[32;1m[1;3m[0m
[36;1m[1;3m[chain/end][0m [1m[1:chain:AgentExecutor] [456.21s] Exiting Chain run with output:
[0m[outputs]

[1m> Finished chain.[0m


In [51]:
q = ldb.session.query(skc.id, skc.name, func.aggregate_strings(skc_p.provenance, separator=', '), func.count(skc_hm.has_members_id)) \
    .filter(skc.id==skc_hm.ScientificKnowledgeCollection_id) \
    .filter(skc_p.ScientificKnowledgeCollection_id==skc.id) \
    .group_by(skc.id, skc.name) \
    .order_by(skc.id)
corpora_df = pd.DataFrame(q.all(), columns=['Corpus ID', 'Corpus Name', 'Provenance', 'Paper Count'])

paper_count = ldb.session.query(func.count(ske.id)).first()
print('Count of all papers in database: %d'%(paper_count[0]))

corpora_df

Count of all papers in database: 6618


Unnamed: 0,Corpus ID,Corpus Name,Provenance,Paper Count
0,best_practices_models_or_data,Best Practices for Data and Model Sharing,"{""action_type"": """", ""action"": {""id"": ""add_coll...",6618
