In [1]:
import duckdb

# Path to the on-disk DuckDB database file that the cells below connect to.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook on another machine.
db_string = "/Users/patrick/dev/ucl/comp0158_mscproject/database/w2v.db"

In [2]:
# CREATE A TABLE
#con = duckdb.connect(database=':memory:')
con = duckdb.connect(database=db_string)  
duckdb.sql("\
    CREATE TABLE TEST (\
        ID VARCHAR,\
    )")
con.close()

In [5]:
# DESCRIBE
con = duckdb.connect(database=db_string)
res = duckdb.sql("DESCRIBE TEST")
print(res)
con.close()

┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │  null   │   key   │ default │  extra  │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ ID          │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘



### Load proteins

In [17]:
# Load the protein file into the W2V_PROTEIN table.
# 20 July 2024 - This took 12.9s to load uniprotkb-2759_78494531.dat (78M proteins)
# The connection is opened in a `with` block so it is closed even when the
# load raises; the original only closed it on success.
with duckdb.connect(database=db_string) as con:
    con.execute(
        "CREATE TABLE W2V_PROTEIN AS SELECT * FROM read_csv_auto("
        "'/Users/patrick/dev/ucl/comp0158_mscproject/data/protein/uniprotkb-2759_78494531.dat', "
        "columns={'id' :'VARCHAR', 'start': 'USMALLINT', 'end': 'USMALLINT'})"
    )

In [19]:
# Count the rows loaded into W2V_PROTEIN.
# The `with` block guarantees the connection is closed even if the query fails.
with duckdb.connect(database=db_string) as con:
    protein_count = con.execute("SELECT COUNT(*) FROM W2V_PROTEIN").fetchall()

print(protein_count)

[(78494529,)]


In [10]:
# Look up a single protein by its id.
# The query targets W2V_PROTEIN, the table created by the protein load cell;
# the old PROTEIN table is dropped in the utilities section of this notebook.
#
# SELECT FROM LIST OF IDS - REALLY SLOW
#list = ['A0A010R6E0', 'A0A010RP22']
#entries = con.execute("SELECT * FROM PFAM_TOKEN WHERE column0 IN (SELECT UNNEST(?))", [list]).fetchall()
#
# The `with` block closes the connection even if the query raises.
with duckdb.connect(database=db_string) as con:
    res = con.execute("SELECT * FROM W2V_PROTEIN WHERE ID = (?)", ['A0A010PZU8']).fetchall()

print(res)

[('A0A010PZU8', 1, 1389)]


### LOAD PFAM TOKENS

In [15]:
# Load the pfam token file into the W2V_TOKEN table.
# July 20 2024 - Took 1m 55s to load 296,017,815 entries
# The connection is opened in a `with` block so it is closed even when the
# load raises; the original only closed it on success.
with duckdb.connect(database=db_string) as con:
    con.execute(
        "CREATE TABLE W2V_TOKEN AS SELECT * FROM read_csv_auto("
        "'/Users/patrick/dev/ucl/comp0158_mscproject/data/pfam/protein2ipr_pfam_20240715.dat', "
        "columns={'uniprot_id' :'VARCHAR', 'type' : 'VARCHAR', 'token' : 'VARCHAR', "
        "'start': 'USMALLINT', 'end': 'USMALLINT'})"
    )

In [23]:
# Count all rows in W2V_TOKEN.
# with pfam only this shows 296,017,815 entries
# after loading disorder as well this shows 377,274,915 (81,257,100 disorder entries)
# The `with` block closes the connection even if the query raises.
with duckdb.connect(database=db_string) as con:
    token_count = con.execute("SELECT COUNT(*) FROM W2V_TOKEN").fetchall()

print(token_count)

[(377274915,)]


#### Load DISORDER tokens

In [22]:
# Load disorder entries by appending them to the existing W2V_TOKEN table.
# WARNING: INSERT INTO appends, so running this cell more than once adds the
# disorder rows again.
# The `with` block closes the connection even if the load raises.
with duckdb.connect(database=db_string) as con:
    con.execute(
        "INSERT INTO W2V_TOKEN SELECT * FROM read_csv_auto("
        "'/Users/patrick/dev/ucl/comp0158_mscproject/data/disorder/disordered_tokens_20240719.dat')"
    )

In [29]:
# Count only the DISORDER rows in W2V_TOKEN.
# Expected: 81,257,100 (377,274,915 total minus 296,017,815 pfam entries).
# The `with` block closes the connection even if the query raises.
with duckdb.connect(database=db_string) as con:
    disorder_count = con.execute("SELECT COUNT(*) FROM W2V_TOKEN WHERE TYPE='DISORDER'").fetchall()

print(disorder_count)

[(81257100,)]


In [27]:
# Fetch every token row stored for a single protein id.
# The `with` block closes the connection even if the query raises.
with duckdb.connect(database=db_string) as con:
    tokens = con.execute("SELECT * FROM W2V_TOKEN WHERE UNIPROT_ID=(?)", ['A0A010PZU8']).fetchall()

print(tokens)

[('A0A010PZU8', 'PFAM', 'PF00400', 865, 900), ('A0A010PZU8', 'PFAM', 'PF00400', 928, 955), ('A0A010PZU8', 'PFAM', 'PF00400', 960, 998), ('A0A010PZU8', 'PFAM', 'PF00400', 1017, 1040), ('A0A010PZU8', 'PFAM', 'PF00400', 1078, 1108), ('A0A010PZU8', 'PFAM', 'PF00400', 1233, 1260), ('A0A010PZU8', 'PFAM', 'PF05729', 358, 479), ('A0A010PZU8', 'PFAM', 'PF17100', 152, 254), ('A0A010PZU8', 'DISORDER', 'Consensus Disorder Prediction', 1, 30)]


#### Apply indexes

In [28]:
# Index W2V_TOKEN on UNIPROT_ID.
# The recorded run of this cell failed with an Out of Memory error. Because the
# original called con.close() only after a successful execute, that failure
# left the connection open. The `with` block closes it on error as well.
# NOTE(review): the memory failure itself is not addressed here — DuckDB's
# memory_limit / temp_directory settings may help; confirm against the docs.
with duckdb.connect(database=db_string) as con:
    con.execute("CREATE INDEX W2V_TKN_IDX ON W2V_TOKEN(UNIPROT_ID)")

OutOfMemoryException: Out of Memory Error: could not allocate block of size 256.0 KiB (12.7 GiB/12.7 GiB used)

### UTILITIES

#### Drop Table

In [16]:
# Drop the old PROTEIN table.
# IF EXISTS makes the cell safe to re-run once the table is already gone.
# The `with` block closes the connection even if the statement raises.
with duckdb.connect(database=db_string) as con:
    con.execute("DROP TABLE IF EXISTS PROTEIN")

#### Unlock database

In [1]:
import duckdb
import os
# Database file checked by default.
# NOTE(review): this points at proteins.db, while the cells above use w2v.db
# (db_string) — confirm which file is intended.
db_path = "/Users/patrick/dev/ucl/comp0158_mscproject/database/proteins.db"


def is_locked(path=db_path):
    """Report whether a ``<path>.lock`` file exists.

    Args:
        path: Path of the database file to check. Defaults to the module-level
            ``db_path``, so ``is_locked()`` behaves as before.

    Returns:
        True if ``f"{path}.lock"`` exists on disk, False otherwise.

    NOTE(review): this only looks for a sidecar ``.lock`` file; whether DuckDB
    actually creates such a file while a database is open is not shown in this
    notebook — confirm before relying on the result.
    """
    return os.path.exists(f"{path}.lock")


is_locked()

# this works from a command prompt:
#   fuser database/proteins.db

# then kill the listed process id, if there is one

False