In [1]:
import pickle

# Location of the pickled collection of proteins (expected to hold a set)
input_file = "../Embeddings/all_prots_with_embeds.pkl"

# Deserialize the collection from its binary pickle form.
# NOTE(review): pickle.load can execute arbitrary code while loading, so this
# file must come from a trusted source.
with open(input_file, mode="rb") as pickle_in:
    large_set = pickle.load(pickle_in)

print("Number of proteins in the set:", len(large_set))

Number of proteins in the set: 617521


In [2]:
import gzip
import sqlite3
from Bio import SeqIO
from tqdm import tqdm

# Step 1: Load the FASTA file into a SQLite database
def load_fasta_to_db(fasta_gz_file, db_file, total=192983315):
    """Load every record of a gzip-compressed FASTA file into a SQLite table.

    Creates the ``proteins (id TEXT PRIMARY KEY, sequence TEXT)`` table if it
    does not exist yet and inserts one row per FASTA record, using the record
    id as ``id`` and the sequence text as ``sequence``.

    Args:
        fasta_gz_file: Path to the gzip-compressed FASTA file.
        db_file: Path to the SQLite database file (created if missing).
        total: Number of records expected, used only to size the progress
            bar. Defaults to the count previously hard-coded here; pass
            ``None`` when the count is unknown.

    Raises:
        sqlite3.IntegrityError: If a record id is already present in the
            table (plain INSERT against the primary key).
    """
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS proteins (id TEXT PRIMARY KEY, sequence TEXT)")

        with gzip.open(fasta_gz_file, "rt") as fasta_in:
            records = tqdm(
                SeqIO.parse(fasta_in, "fasta"),
                desc="Loading proteins into DB",
                total=total,
            )
            # Lazy generator: rows are streamed into SQLite one at a time, so
            # the whole file is never held in memory.
            rows = ((record.id, str(record.seq)) for record in records)
            cursor.executemany("INSERT INTO proteins (id, sequence) VALUES (?, ?)", rows)

        # Single commit at the end: either all records are stored or none.
        conn.commit()
    finally:
        # Always release the database handle, even when parsing or an insert
        # fails; closing without commit discards the partial load.
        conn.close()

# Build uniref90.db from the compressed UniRef90 FASTA file.
# The recorded run of this cell took roughly 2 h 50 min.
load_fasta_to_db(fasta_gz_file="uniref90.fasta.gz", db_file="uniref90.db")

Loading proteins into DB: 100%|██████████| 192983315/192983315 [2:50:02<00:00, 18914.99it/s]  


In [7]:
# Strip the leading 'UniRef90_' prefix from the protein IDs stored in uniref90.db
import sqlite3

# Prefix to remove from the start of each id in the proteins table
prefix = "UniRef90_"

# Connect to the database
conn = sqlite3.connect("uniref90.db")
try:
    cursor = conn.cursor()

    # Only rows whose id starts with the prefix are updated, and only that
    # leading occurrence is removed. REPLACE() would also delete the text if it
    # appeared elsewhere inside an id, and would rewrite every row.
    # SUBSTR is 1-based, so the remainder of the id starts at len(prefix) + 1.
    cursor.execute(
        "UPDATE proteins SET id = SUBSTR(id, ?) WHERE SUBSTR(id, 1, ?) = ?",
        (len(prefix) + 1, len(prefix), prefix),
    )

    # Commit the changes
    conn.commit()
finally:
    # Close the connection even if the update failed
    conn.close()

print("Prefix removed successfully from id")


In [None]:
# Preview the proteins table by showing its first 5 rows
import sqlite3

# Open uniref90.db and obtain a cursor for the query
conn = sqlite3.connect("uniref90.db")
cursor = conn.cursor()

# Run the query and materialize the first 5 rows as a list
rows = cursor.execute("SELECT * FROM proteins LIMIT 5").fetchall()

# Show each row on its own line
for row in rows:
    print(row)

# Release the database handle
conn.close()

In [3]:
def load_ids_to_remove(id_file, db_file):
    """Load protein IDs from a text file into the ``ids_to_remove`` table.

    Creates ``ids_to_remove (id TEXT PRIMARY KEY)`` if it does not exist and
    inserts one row per non-empty line of ``id_file``. Surrounding whitespace
    is stripped from each line; IDs already present in the table are skipped
    (``INSERT OR IGNORE``), so the function can be re-run safely.

    Args:
        id_file: Path to a text file with one protein ID per line.
        db_file: Path to the SQLite database file (created if missing).
    """
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        cursor.execute("CREATE TABLE IF NOT EXISTS ids_to_remove (id TEXT PRIMARY KEY)")

        with open(id_file, "r") as f:
            stripped_lines = (line.strip() for line in f)
            # Blank or whitespace-only lines would otherwise be stored as an
            # empty-string id, so they are filtered out here.
            cursor.executemany(
                "INSERT OR IGNORE INTO ids_to_remove (id) VALUES (?)",
                ((protein_id,) for protein_id in stripped_lines if protein_id),
            )

        conn.commit()
    finally:
        # Always release the database handle, even if reading or inserting fails.
        conn.close()

In [None]:
# Store the IDs listed in the text file in the ids_to_remove table of uniref90.db
load_ids_to_remove(
    id_file="../Embeddings/all_prots_with_embeds.txt",
    db_file="uniref90.db",
)
