### Overview

Tested again on July 31st and August 1st


This notebook contains various scripts to load data into tables on a local DuckDB database. <br>
- proteins are loaded into W2V_PROTEIN
- pfam entries are loaded into W2V_TOKEN
- disorder regions are also loaded into W2V_TOKEN

The tables are created at the time the data is loaded - so see the appropriate cells for the table definitions.

Indexes are applied after the data is loaded.


DuckDB is very easy to install on a Mac and can load tab-delimited files extremely quickly.
To recreate this environment, you just need to install DuckDB and then set `db_string` at the top of this file
to the location where you wish the database file to be stored.


### SETUP AND TEST

In [2]:
import duckdb

#
# TODO - SET THIS STRING TO WHERE YOU WANT THE DB TO STORE ITS DATA
#
# Path of the DuckDB database file. The cells below open it with
# duckdb.connect(database=db_string), so change it here before running them.
db_string = "/Users/patrick/dev/ucl/comp0158_mscproject/database/w2v_20240731_test.db"

Test the DB works OK

In [3]:
# CREATE A TABLE
#con = duckdb.connect(database=':memory:')
con = duckdb.connect(database=db_string)  
duckdb.sql("\
    CREATE TABLE TEST (\
        ID VARCHAR,\
    )")
con.close()

In [4]:
# DESCRIBE
con = duckdb.connect(database=db_string)
res = duckdb.sql("DESCRIBE TEST")
print(res)
con.close()

┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │  null   │   key   │ default │  extra  │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ ID          │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘



In [17]:
# DROP
con = duckdb.connect(database=db_string)  
duckdb.sql("DROP TABLE TEST")
con.close()

### LOAD PROTEINS INTO W2V_PROTEIN

In [10]:
# load protein file into protein table
# 20 July 2024 - This took 12.9s to load uniprotkb-2759_78494531.dat (78M proteins)
# 31 July testing again to check code works
#
# Creates W2V_PROTEIN with columns (uniprot_id, start, end) directly from the file.
# The context manager closes the connection even when the statement raises
# (e.g. the table already exists), so a failed run does not leave it open.
with duckdb.connect(database=db_string) as con:
    con.execute(
        "CREATE TABLE W2V_PROTEIN AS SELECT * FROM read_csv_auto("
        "'/Volumes/My Passport/data/protein/uniprotkb-2759_78494531.dat', "
        "columns={'uniprot_id' :'VARCHAR', 'start': 'USMALLINT', 'end': 'USMALLINT'})"
    )

In [11]:
# This should output that there are 78,494,529 items
# fetchall() materialises the result, so it can be printed after the
# connection has been closed by the context manager.
with duckdb.connect(database=db_string) as con:
    protein_count = con.execute("SELECT COUNT(*) FROM W2V_PROTEIN").fetchall()
print(protein_count)

[(78494529,)]


In [9]:
# Disabled utility: enable the DROP statement below to remove W2V_PROTEIN.
# As written, the cell only opens the database and closes it again.
con = duckdb.connect(database=db_string)
try:
    # con.execute("DROP TABLE W2V_PROTEIN")
    pass
finally:
    con.close()

In [12]:
# create an index (after loading the data)
# The connection is closed by the context manager even if CREATE INDEX fails
# (e.g. the index already exists); the message is printed only on success.
with duckdb.connect(database=db_string) as con:
    con.execute("CREATE INDEX UNIP_IDX ON W2V_PROTEIN(UNIPROT_ID)")
print('index created')

index created


In [10]:
# Look up one protein by its UniProt id.
# The query targets W2V_PROTEIN / UNIPROT_ID, the table and column created by
# the load cell in this notebook.
with duckdb.connect(database=db_string) as con:

    # SELECT FROM LIST OF IDS - REALLY SLOW
    # NOTE(review): the disabled query below refers to PFAM_TOKEN / column0, which
    # this notebook does not create - update the names before re-enabling it.
    #list = ['A0A010R6E0', 'A0A010RP22']
    #entries = con.execute("SELECT * FROM PFAM_TOKEN WHERE column0 IN (SELECT UNNEST(?))", [list]).fetchall()

    res = con.execute("SELECT * FROM W2V_PROTEIN WHERE UNIPROT_ID = (?)", ['A0A010PZU8']).fetchall()

print(res)

[('A0A010PZU8', 1, 1389)]


### LOAD PFAM TOKENS INTO W2V_TOKEN

In [14]:
# July 20 2024 - Took 1m 55s to load 296,017,815 entries from a directory on a macbook
# July 31 2024 - Retest took 3m 10s from an external drive attached to macbook
#
# Creates W2V_TOKEN with columns (uniprot_id, type, token, start, end) from the pfam file.
# The context manager closes the connection even when the statement raises,
# so a failed load does not leave it open.
with duckdb.connect(database=db_string) as con:
    con.execute(
        "CREATE TABLE W2V_TOKEN AS SELECT * FROM read_csv_auto("
        "'/Volumes/My Passport/data/pfam/protein2ipr_pfam_20240715.dat', "
        "columns={'uniprot_id' :'VARCHAR', 'type' : 'VARCHAR', 'token' : 'VARCHAR', "
        "'start': 'USMALLINT', 'end': 'USMALLINT'})"
    )

In [15]:
# with pfam only this shows 296,017,815 entries
# after loading disorder as well this shows 377,274,915 (81,257,100 disorder entries)
# The result is a token count, so it is stored as token_count (not protein_count).
with duckdb.connect(database=db_string) as con:
    token_count = con.execute("SELECT COUNT(*) FROM W2V_TOKEN").fetchall()
print(token_count)

[(296017815,)]


In [16]:
# create an index (after loading data)
# The return value of execute() is not needed here; the context manager closes
# the connection even if CREATE INDEX fails.
with duckdb.connect(database=db_string) as con:
    con.execute("CREATE INDEX PF_TKN_IDX ON W2V_TOKEN(UNIPROT_ID)")

 ### LOAD DISORDER ITEMS INTO W2V_TOKEN

In [18]:
# Load disorder entries
# First run : July 19
# Retest    : August 1st (on Macbook - took 2m 25s)
#
# Appends the disorder rows to the existing W2V_TOKEN table. This is an INSERT,
# so running the cell a second time adds the same rows again.
# The context manager closes the connection even when the statement raises.
with duckdb.connect(database=db_string) as con:
    con.execute(
        "INSERT INTO W2V_TOKEN SELECT * FROM read_csv_auto("
        "'/Volumes/My Passport/data/disorder/dat/disordered_tokens_20240719.dat')"
    )

In [20]:
# with pfam only this shows 296,017,815 entries
# after loading disorder as well this shows 377,274,915
# fetchall() materialises the count, so it is printed after the context manager
# has closed the connection.
with duckdb.connect(database=db_string) as con:
    token_count = con.execute("SELECT COUNT(*) FROM W2V_TOKEN").fetchall()
print(token_count)

[(377274915,)]


In [22]:
# test that W2V_TOKEN has all pfam and disorder entries
# Fetches every token row for one protein; the context manager closes the
# connection even if the query raises.
with duckdb.connect(database=db_string) as con:
    tokens = con.execute("SELECT * FROM W2V_TOKEN WHERE UNIPROT_ID=(?)", ['A0A010PZU8']).fetchall()
print(tokens)

[('A0A010PZU8', 'PFAM', 'PF00400', 865, 900), ('A0A010PZU8', 'PFAM', 'PF00400', 928, 955), ('A0A010PZU8', 'PFAM', 'PF00400', 960, 998), ('A0A010PZU8', 'PFAM', 'PF00400', 1017, 1040), ('A0A010PZU8', 'PFAM', 'PF00400', 1078, 1108), ('A0A010PZU8', 'PFAM', 'PF00400', 1233, 1260), ('A0A010PZU8', 'PFAM', 'PF05729', 358, 479), ('A0A010PZU8', 'PFAM', 'PF17100', 152, 254), ('A0A010PZU8', 'DISORDER', 'Consensus Disorder Prediction', 1, 30)]


### LOAD TAXONOMY INFO

#### Names

In [4]:
# see the data-preparation folder for a shell script that produces the .dat file loaded here
#
# Creates W2V_TAX_NAME with columns (tax_id, name) from the scientific-names file.
# The context manager closes the connection even when the statement raises.
with duckdb.connect(database=db_string) as con:
    con.execute(
        "CREATE TABLE W2V_TAX_NAME AS SELECT * FROM read_csv_auto("
        "'/Volumes/My Passport/data/taxonomy/dat/scientific_names_20240802.dat', "
        "columns={'tax_id' :'VARCHAR', 'name' : 'VARCHAR'})"
    )

In [5]:
# count  - should have 2,588,170 entries
# Stored as tax_name_count so it does not overwrite the W2V_TOKEN count held
# in token_count by an earlier cell.
with duckdb.connect(database=db_string) as con:
    tax_name_count = con.execute("SELECT COUNT(*) FROM W2V_TAX_NAME").fetchall()
print(tax_name_count)

[(2588170,)]


### UTILITIES

#### Drop Table

In [16]:
# Drops the named table from the database at db_string.
# NOTE(review): PROTEIN is not one of the tables created in this notebook
# (W2V_PROTEIN, W2V_TOKEN, W2V_TAX_NAME) - edit the name before running.
# DROP TABLE raises if the table does not exist; the context manager still
# closes the connection in that case.
with duckdb.connect(database=db_string) as con:
    con.execute("DROP TABLE PROTEIN")

#### Unlock database

In [1]:
import duckdb
import os
# Default database file checked by is_locked().
# NOTE(review): this differs from the db_string used by the loading cells - confirm
# which database file is meant.
db_path = "/Users/patrick/dev/ucl/comp0158_mscproject/database/proteins.db"

def is_locked(path=db_path):
    """Report whether a '<path>.lock' file exists next to the database file.

    Args:
        path: Location of the database file to check. Defaults to the
            module-level ``db_path``, so calling ``is_locked()`` with no
            argument behaves as before.

    Returns:
        True if a file named ``<path>.lock`` exists, otherwise False.

    NOTE(review): this only tests for a sibling ``.lock`` file. Confirm that
    DuckDB actually creates such a file while a database is open; if it does
    not, this function cannot detect an open connection.
    """
    lock_file = f'{path}.lock'
    return os.path.exists(lock_file)

is_locked()

# this works from a command prompt
#fuser database/proteins.db

# then kill the process id if there is one listed

False