# Combine Sequences and extended GO terms

## Imports

In [29]:
import pandas as pd
import numpy as np
from Bio import SeqIO
import matplotlib.pyplot as plt
import seaborn as sb
from datasets import Dataset
from datasets import load_from_disk
import os


## Load data

In [30]:
data_path = os.path.join("..","data","train_val")

absolute_path = os.path.abspath(data_path)

print(absolute_path)

C:\Users\49176\Unistuff\PP2\prostT5-CLIP\data\train_val


In [31]:
dataset = load_from_disk(absolute_path)

In [32]:
dataset

DatasetDict({
    train: Dataset({
        features: ['identifier', 'sequence', 'species'],
        num_rows: 113796
    })
    test: Dataset({
        features: ['identifier', 'sequence', 'species'],
        num_rows: 28450
    })
})

In [33]:
dataset

DatasetDict({
    train: Dataset({
        features: ['identifier', 'sequence', 'species'],
        num_rows: 113796
    })
    test: Dataset({
        features: ['identifier', 'sequence', 'species'],
        num_rows: 28450
    })
})

In [34]:
## Load GO terms
data_path = os.path.join("..","data","CAFA5","Train","train_terms_extended_new.tsv")

absolute_path = os.path.abspath(data_path)

print(absolute_path)
extended_GO = pd.read_csv(absolute_path,sep="\t")
extended_GO = extended_GO.rename(columns={"EntryID":"identifier"})

C:\Users\49176\Unistuff\PP2\prostT5-CLIP\data\CAFA5\Train\train_terms_extended_new.tsv


In [35]:
extended_GO

Unnamed: 0,identifier,term,aspect,GO Name,GO Sentence
0,A0A009IHW8,GO:0008152,BPO,metabolic process,The biological process is metabolic process.
1,A0A009IHW8,GO:0034655,BPO,nucleobase-containing compound catabolic process,The biological process is nucleobase-containin...
2,A0A009IHW8,GO:0072523,BPO,purine-containing compound catabolic process,The biological process is purine-containing co...
3,A0A009IHW8,GO:0044270,BPO,cellular nitrogen compound catabolic process,The biological process is cellular nitrogen co...
4,A0A009IHW8,GO:0006753,BPO,nucleoside phosphate metabolic process,The biological process is nucleoside phosphate...
...,...,...,...,...,...
5363858,X5L565,GO:0050649,MFO,testosterone 6-beta-hydroxylase activity,The molecular function is testosterone 6-beta-...
5363859,X5L565,GO:0016491,MFO,oxidoreductase activity,The molecular function is oxidoreductase activ...
5363860,X5M5N0,GO:0005515,MFO,protein binding,The molecular function is protein binding.
5363861,X5M5N0,GO:0005488,MFO,binding,The molecular function is binding.


### Check if compatible

In [43]:
assert len(extended_GO["identifier"].unique()) == (len(dataset["train"]["identifier"])+len(dataset["test"]["identifier"]))

### Merge datasets

In [37]:
train = dataset["train"].to_pandas()
test = dataset["test"].to_pandas()

In [47]:
merged_train = pd.merge(extended_GO, train, on='identifier', how='left').dropna()

In [48]:
merged_test = pd.merge(extended_GO, test, on='identifier', how='left').dropna()

In [49]:
len(extended_GO.identifier)

5363863

In [50]:
(len(merged_train)+len(merged_test))

5363863

In [51]:
assert (len(merged_train)+len(merged_test)) == len(extended_GO.identifier)

In [None]:
# Delete objects no longer needed
del train
del test
del extended_GO
del dataset

In [61]:
import gc



# Run garbage collection to free memory
gc.collect()


1440

In [63]:
from datasets import Dataset, DatasetDict, concatenate_datasets
import numpy as np

# Split your large DataFrames into smaller chunks
train_chunks = [Dataset.from_pandas(chunk) for chunk in np.array_split(merged_train, 10)]
test_chunks = [Dataset.from_pandas(chunk) for chunk in np.array_split(merged_test, 10)]

# Concatenate the chunks into a single Dataset
train_dataset = concatenate_datasets(train_chunks)
test_dataset = concatenate_datasets(test_chunks)

# Create the DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})


In [70]:
dataset_dict["train"][5]

{'identifier': 'A0A009IHW8',
 'term': 'GO:1901292',
 'aspect': 'BPO',
 'GO Name': 'nucleoside phosphate catabolic process',
 'GO Sentence': 'The biological process is nucleoside phosphate catabolic process.',
 'sequence': 'MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSPNLADKVALNTSVNSIEEIAHQLADVILNR',
 'species': 'Acinetobacter baumannii ',
 '__index_level_0__': 5}

In [72]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['identifier', 'term', 'aspect', 'GO Name', 'GO Sentence', 'sequence', 'species', '__index_level_0__'],
        num_rows: 4299428
    })
    test: Dataset({
        features: ['identifier', 'term', 'aspect', 'GO Name', 'GO Sentence', 'sequence', 'species', '__index_level_0__'],
        num_rows: 1064435
    })
})

### Save

In [71]:
data_path = os.path.join("..","data","train_val_GO")

absolute_path = os.path.abspath(data_path)

print(absolute_path)
dataset_dict.save_to_disk(absolute_path)

C:\Users\49176\Unistuff\PP2\prostT5-CLIP\data\train_val_GO


Saving the dataset (0/7 shards):   0%|          | 0/4299428 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1064435 [00:00<?, ? examples/s]