# **Importing necessary libraries**

In [1]:
import pandas as pd
from Bio import SeqIO 
import random
import numpy as np

# Single seed value: it seeds Python's `random` module here and is reused as
# `random_state` when the combined splits are shuffled further down.
random_seed = 42
random.seed(random_seed)

# **Importing fasta files**

In [2]:
# Read the sequences from the fasta files
def _read_fasta_records(fasta_path):
    """Parse the FASTA file at ``fasta_path`` and return all of its records as a list."""
    return list(SeqIO.parse(fasta_path, "fasta"))


carbo_fasta = _read_fasta_records("Non Chitinase Sequences/CD-HIT/Carbohydrate Esterase/carbohydrate_cdhit.fasta")
gh10_fasta = _read_fasta_records("Non Chitinase Sequences/CD-HIT/GH10/gh10_cdhit.fasta")
gh20_fasta = _read_fasta_records("Non Chitinase Sequences/CD-HIT/GH20/gh20_cdhit.fasta")
glycosyl_fasta = _read_fasta_records("Non Chitinase Sequences/CD-HIT/Glycosyltransferases/glycosyl_cdhit.fasta")
peptidase_fasta = _read_fasta_records("Non Chitinase Sequences/CD-HIT/Peptidase Families/peptidase_cdhit.fasta")
tubulin_fasta = _read_fasta_records("Non Chitinase Sequences/CD-HIT/Tubulin Superfamily/tubulin_cdhit.fasta")

In [3]:
# Preparing protein sequences as a list
def _sequence_strings(records):
    """Return the ``seq`` attribute of every record, converted to a plain ``str``."""
    return [str(record.seq) for record in records]


carbo = _sequence_strings(carbo_fasta)
gh10 = _sequence_strings(gh10_fasta)
gh20 = _sequence_strings(gh20_fasta)
glycosyl = _sequence_strings(glycosyl_fasta)
peptidase = _sequence_strings(peptidase_fasta)
tubulin = _sequence_strings(tubulin_fasta)

# **Randomly sampling sequences from each family**

In [4]:
# Report how many sequences were loaded for each family, one line per family.
family_counts = (
    ("Carbohydrate Esterase: ", carbo),
    ("GH10: ", gh10),
    ("GH20: ", gh20),
    ("Glycosyltransferases: ", glycosyl),
    ("Peptidase Families: ", peptidase),
    ("Tubulin Superfamily: ", tubulin),
)
print("Quantities from each family")
for heading, sequences in family_counts:
    print(heading, len(sequences))

Quantities from each family
Carbohydrate Esterase:  9684
GH10:  3896
GH20:  77
Glycosyltransferases:  7771
Peptidase Families:  215
Tubulin Superfamily:  299


In [9]:
# Even share of the 918 target sequences across the 6 families.
sample_size = 918 // 6

In [10]:
# GH20 is sampled at a fixed 77 sequences, which is below the even share, so the
# other five families are topped up: +15 each, with Carbohydrate Esterase
# taking one extra so the overall total still reaches 918.
# The calls stay in this order so the seeded random stream is consumed identically.
topped_up_size = sample_size + 15

carbo_sampled = random.sample(carbo, topped_up_size + 1)
gh10_sampled = random.sample(gh10, topped_up_size)
gh20_sampled = random.sample(gh20, 77)
glycosyl_sampled = random.sample(glycosyl, topped_up_size)
peptidase_sampled = random.sample(peptidase, topped_up_size)
tubulin_sampled = random.sample(tubulin, topped_up_size)

In [24]:
# Report the size of every sampled family, followed by the grand total.
sampled_by_family = (
    ("Carbohydrate Esterase: ", carbo_sampled),
    ("GH10: ", gh10_sampled),
    ("GH20: ", gh20_sampled),
    ("Glycosyltransferases: ", glycosyl_sampled),
    ("Peptidase Families: ", peptidase_sampled),
    ("Tubulin Superfamily: ", tubulin_sampled),
)
print("Quantities from each family after sampling")
for heading, sampled in sampled_by_family:
    print(heading, len(sampled))
print("TOTAL: ", sum(len(family_sample) for _heading, family_sample in sampled_by_family))

Quantities from each family after sampling
Carbohydrate Esterase:  169
GH10:  168
GH20:  77
Glycosyltransferases:  168
Peptidase Families:  168
Tubulin Superfamily:  168
TOTAL:  918


## **Creating Dataframes**

In [13]:
def _sequence_frame(sequences):
    """Wrap a list of sequences in a DataFrame with a single ``sequence`` column."""
    return pd.DataFrame({"sequence": sequences})


carbo_dataframe = _sequence_frame(carbo_sampled)
gh10_dataframe = _sequence_frame(gh10_sampled)
gh20_dataframe = _sequence_frame(gh20_sampled)
glycosyl_dataframe = _sequence_frame(glycosyl_sampled)
peptidase_dataframe = _sequence_frame(peptidase_sampled)
tubulin_dataframe = _sequence_frame(tubulin_sampled)

In [14]:
# Every family frame gets a "label" column filled with int32 zeros.
for family_frame in (
    carbo_dataframe,
    gh10_dataframe,
    gh20_dataframe,
    glycosyl_dataframe,
    peptidase_dataframe,
    tubulin_dataframe,
):
    family_frame["label"] = np.zeros(len(family_frame), dtype=np.int32)

# **Splitting Dataset into Train, Test, Validation datasets**

In [15]:
def split_dataframe(df, train_ratio=0.7, test_ratio=0.2, val_ratio=0.1, random_state=42):
    """Shuffle ``df`` and cut it into train, test and validation partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified; a shuffled copy is sliced.
    train_ratio, test_ratio, val_ratio : float
        Fraction of rows assigned to each partition. The train and test row
        counts are ``int(len(df) * ratio)`` (truncated).
        When the three ratios add up to 1 (or more), the validation partition
        receives every row left after train and test, so rows lost to
        truncation are not dropped.
        When they add up to less than 1, the validation partition is limited
        to ``int(len(df) * val_ratio)`` rows and the surplus rows are left out.
    random_state : int
        Seed forwarded to ``DataFrame.sample`` for the shuffle.

    Returns
    -------
    tuple
        ``(train_df, test_df, val_df)``: consecutive, non-overlapping slices
        of the shuffled frame, whose index is reset before slicing.

    Raises
    ------
    ValueError
        If any of the three ratios is negative.
    """
    ratios = (train_ratio, test_ratio, val_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError(
            "train_ratio, test_ratio and val_ratio must be non-negative, got "
            f"{train_ratio}, {test_ratio}, {val_ratio}"
        )

    # Shuffle the DataFrame
    df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    n_rows = len(df_shuffled)

    # Calculate the indices for splits
    train_end = int(n_rows * train_ratio)
    test_end = train_end + int(n_rows * test_ratio)
    # The tolerance absorbs float rounding: 0.7 + 0.2 + 0.1 is slightly below 1.0.
    if sum(ratios) >= 1.0 - 1e-9:
        # Ratios cover the whole frame: validation takes all remaining rows.
        val_end = n_rows
    else:
        # Ratios leave slack: honour val_ratio instead of absorbing the surplus.
        val_end = min(n_rows, test_end + int(n_rows * val_ratio))

    # Split the DataFrame
    train_df = df_shuffled[:train_end]
    test_df = df_shuffled[train_end:test_end]
    val_df = df_shuffled[test_end:val_end]

    return train_df, test_df, val_df

In [16]:
# Split each family on its own, with split_dataframe's default ratios and seed.
(
    (train_carbo, test_carbo, valid_carbo),
    (train_gh10, test_gh10, valid_gh10),
    (train_gh20, test_gh20, valid_gh20),
    (train_glycosyl, test_glycosyl, valid_glycosyl),
    (train_peptidase, test_peptidase, valid_peptidase),
    (train_tubulin, test_tubulin, valid_tubulin),
) = [
    split_dataframe(family_frame)
    for family_frame in (
        carbo_dataframe,
        gh10_dataframe,
        gh20_dataframe,
        glycosyl_dataframe,
        peptidase_dataframe,
        tubulin_dataframe,
    )
]

In [17]:
# Flat list laid out as (train, test, valid) triples, one triple per family.
datasets = [
    train_carbo, test_carbo, valid_carbo,
    train_gh10, test_gh10, valid_gh10,
    train_gh20, test_gh20, valid_gh20,
    train_glycosyl, test_glycosyl, valid_glycosyl,
    train_peptidase, test_peptidase, valid_peptidase,
    train_tubulin, test_tubulin, valid_tubulin,
]
# Taking every third element from offsets 0, 1 and 2 regroups the list by role.
train_sets, test_sets, valid_sets = (datasets[offset::3] for offset in range(3))

In [20]:
# Stack the per-family training splits, then shuffle the rows with the shared
# seed and renumber the index from zero.
train = pd.concat(train_sets, ignore_index=True)
train_shuffled = (
    train
    .sample(frac=1, random_state=random_seed)
    .reset_index(drop=True)
)
train_shuffled.head(n=10)

Unnamed: 0,sequence,label
0,MLAAALLAVYLAVGALASPLQMRQGEARVITSCVEPNTAALTFDDG...,0
1,MKLITFAVPCYNSQDYMRRCIDTLLTGGEKVEIIIVNDGSKDRTGE...,0
2,MFSESDEAQEILGNNCLEDVTWLCSLSESELDMLISLKMLVVQRAK...,0
3,MVDFVTKNQILCRGHNVLWQDPNFTPSWVRNLTTSPDLLRQAAESR...,0
4,DAEMGRRIALEEKEKIKQILRGANLTCLVFGLGKGTGTGVSPVIGQ...,0
5,MAISLNYFGDIKSKEANATVQWSKSNNKVALVEWCSTGFKISLNEV...,0
6,MSIMSFILPLALLISGAVSSILPRQGASCSTLPAGYSPQQYAKLPD...,0
7,MCLFFLNQRIPNNIKSSIISVSPEDVSMSGSFVANTTAIKQVFQRI...,0
8,MKIITRSRHITAYNGHFSMRKSTPDKLMYMKTIQANCFCYLSGFIL...,0
9,MPIPVSAQFSGGFIPFSGKVAVSWENAPTLVLKKAVARFISRANAL...,0


In [21]:
# Stack the per-family test splits, then shuffle the rows with the shared seed
# and renumber the index from zero.
test = pd.concat(test_sets, ignore_index=True)
test_shuffled = (
    test
    .sample(frac=1, random_state=random_seed)
    .reset_index(drop=True)
)
test_shuffled.head(n=10)

Unnamed: 0,sequence,label
0,MRFSDITTAALVAPLVAAHGGIPGAPKVFGLPRDLQADFKAPITAR...,0
1,MTLTCSTSSSFPPSPAGSSLPIDGANRRVLYFSIVPPPSQLAEHRH...,0
2,MTLIWTAYIHDRYGGCMYMNNAYRIVQFTPTLDIALLVHDKQFDAI...,0
3,ARRLTLTSSTRARGRLGFRLAFTSLFRIKGEVTGGRLVEQLSNSHV...,0
4,MCTTTEDATKELFRLAARTPLNIAPEQGSKLQSEIFGSSKWNIKPS...,0
5,MAHASSISFSTKPAAFYLAGDSTTAAQSDEGGGWGVGFLKTLQDGA...,0
6,MASNMSIVRRAWPGAVQELKHLIVFGASYCDVGYDSRAPRPSPERP...,0
7,MSAIYKKLCEFVAAQGRTPQFWGDILEQEPELMRELPPRTVCLNWH...,0
8,GAVARGGAGGGRAAHGAPAARRRGGVPAPGLPGELHGRRRRHGEGA...,0
9,MTTHTQSYPGRRRAAATPSYPSTAAVTRAAKAAAKLAGRVLMVGIM...,0


In [22]:
# Stack the per-family validation splits, then shuffle the rows with the shared
# seed and renumber the index from zero.
valid = pd.concat(valid_sets, ignore_index=True)
valid_shuffled = (
    valid
    .sample(frac=1, random_state=random_seed)
    .reset_index(drop=True)
)
valid_shuffled.head(n=10)

Unnamed: 0,sequence,label
0,MPDDTPRPSRLDAAYLALACGIAFFWQLGNLGLVGPDEPRYAQVAR...,0
1,MNLVQKNRSSYRLVLPGQPSPEEKRAAEFLNRHLEKISGCTLPIIL...,0
2,IGSKFWEVVSDEHGVDPTGTYQGDSDLQLERINVYYNEATGGRYVP...,0
3,MTLTTSPFWVGLILWFLICAKPHLCGQFYDSSAYTECKMKPEAPLY...,0
4,MATTLRSLKIKTGTCKRIVKELHYYEKEVETEAAKTAKMKDNGADP...,0
5,MNRIPKIKVIGIGGAGVNALSRMAKCGFNYIELIAVNTDAQSLQFS...,0
6,MRHSIVRLGLAGLVGIVCASGGESPDVKDFDWTTITPSTKLEYHPC...,0
7,PFLHIGGDEAKGTSSTDFRAFVTRAMQLAAATGKRPIGWHEVGPAQ...,0
8,MTFKHYDVVRAASPSDLAEKLTHKLKEGWQPFGSPVAITPYTLMQA...,0
9,MLSPRLLALATSLLSVSVVSGIPWNATEYMFVFGDSYTTDGYNVSA...,0


# **Exporting Dataframes**

In [23]:
# Write each shuffled split to its own CSV file, without the index column.
csv_exports = (
    (train_shuffled, "Negative Samples/Data Augmentation/my_train_negative.csv"),
    (test_shuffled, "Negative Samples/Data Augmentation/my_test_negative.csv"),
    (valid_shuffled, "Negative Samples/Data Augmentation/my_valid_negative.csv"),
)
for split_frame, csv_path in csv_exports:
    split_frame.to_csv(csv_path, index=False)