### Strict dataset construction


In [27]:
import pandas as pd
# Load the positive (Chromosome, Plasmid) pairs; the first line of the CSV is
# the header row. The bare name on the last line displays the frame.
pos_pair = pd.read_csv(
    'positive_pairs.csv',
    header=0,
)
pos_pair

Unnamed: 0,Chromosome,Plasmid
0,NC_009674.1,NC_009673.1
1,NC_010581.1,NC_010580.1
2,NC_010581.1,NC_010578.1
3,NC_010556.1,NC_010549.1
4,NC_010556.1,NC_010550.1
...,...,...
10228,NZ_OX637961.1,NZ_OX637963.1
10229,NZ_OX637964.1,NZ_OX637965.1
10230,NZ_OX637964.1,NZ_OX637966.1
10231,NZ_OX637964.1,NZ_OX637967.1


In [None]:
import pandas as pd
import random

# Load the dataset
df = pd.read_csv("positive_pairs.csv")

# Obtain the unique Chromosome values, shuffle them, and split them in an
# 8:1:1 ratio. The shuffle uses a dedicated, seeded generator so the split is
# identical on every run; an unseeded random.shuffle gives a different split
# each time the cell is executed, which makes the saved CSVs irreproducible.
SPLIT_SEED = 42
unique_chromosomes = df["Chromosome"].unique().tolist()
random.Random(SPLIT_SEED).shuffle(unique_chromosomes)

n = len(unique_chromosomes)
train_end, val_end = int(n * 0.8), int(n * 0.9)
train_chroms = set(unique_chromosomes[:train_end])
val_chroms = set(unique_chromosomes[train_end:val_end])
test_chroms = set(unique_chromosomes[val_end:])

# Initial split: every row of a given chromosome lands in exactly one subset.
train_df = df[df["Chromosome"].isin(train_chroms)].copy()
val_df = df[df["Chromosome"].isin(val_chroms)].copy()
test_df = df[df["Chromosome"].isin(test_chroms)].copy()

# The three chromosome sets partition the unique values, so no row may be
# lost or duplicated by the split.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "split does not partition df"
print(train_df.shape, val_df.shape, test_df.shape)

(8271, 2) (982, 2) (980, 2)


In [48]:
# Every ID that occurs in the training split, chromosomes first, then plasmids.
train_all_ids = [
    *train_df["Chromosome"].tolist(),
    *train_df["Plasmid"].tolist(),
]

In [49]:
# Enforce strict separation from the training set: any validation/test row
# that shares a Chromosome or Plasmid ID with train_df is moved into train_df.
#
# The set of training IDs is extended after every move, because the rows that
# were just moved bring new IDs into the training set and can make further
# validation/test rows leak. Screening against a one-off snapshot of the
# training IDs misses those chained overlaps, so the scan repeats until no
# row moves any more (each pass moves at least one row, so it terminates).
#
# NOTE: train_df is reassigned here while val_df/test_df are left untouched,
# so re-running this cell without re-running the split cell appends the moved
# rows a second time.
# NOTE(review): overlap between the validation and test sets themselves is
# not checked here - confirm whether that is required for the "strict" split.
val_clean, test_clean = val_df, test_df
val_moved, test_moved = [], []
known_train_ids = set(train_df["Chromosome"]) | set(train_df["Plasmid"])

while True:
    val_leak = (
        val_clean["Chromosome"].isin(known_train_ids)
        | val_clean["Plasmid"].isin(known_train_ids)
    )
    test_leak = (
        test_clean["Chromosome"].isin(known_train_ids)
        | test_clean["Plasmid"].isin(known_train_ids)
    )
    if not (val_leak.any() or test_leak.any()):
        break

    val_rows, test_rows = val_clean[val_leak], test_clean[test_leak]
    for moved_rows in (val_rows, test_rows):
        known_train_ids.update(moved_rows["Chromosome"], moved_rows["Plasmid"])
    val_moved.append(val_rows)
    test_moved.append(test_rows)
    val_clean = val_clean[~val_leak]
    test_clean = test_clean[~test_leak]

# Moved validation rows are appended before moved test rows.
train_df = pd.concat([train_df, *val_moved, *test_moved], ignore_index=True)
val_clean = val_clean.reset_index(drop=True)
test_clean = test_clean.reset_index(drop=True)

In [50]:
# Report (rows, columns) of the final train / validation / test frames.
print(*(frame.shape for frame in (train_df, val_clean, test_clean)))

(8271, 2) (982, 2) (980, 2)


In [None]:
# Persist the three positive-pair splits as CSV files (suffix 5, without the
# index column) and report how many rows each split contains.
for _frame, _csv_name in (
    (train_df, "positive_train_5.csv"),
    (val_clean, "positive_val_5.csv"),
    (test_clean, "positive_test_5.csv"),
):
    _frame.to_csv(_csv_name, index=False)

print(f"Train: {len(train_df)}  |  Val: {len(val_clean)}  |  Test: {len(test_clean)}")


Train: 8271  |  Val: 982  |  Test: 980


### Create the positive and negative pairs for the dataset

In [52]:
import pandas as pd
import numpy as np

def generate_negative_pairs(pos_pair, seed):
    """Sample random (Chromosome, Plasmid) pairs that are not positive pairs.

    Chromosomes and plasmids are drawn independently and uniformly from the
    unique values in ``pos_pair``. A drawn pair is kept only if it is not a
    positive pair and has not been drawn before, until there are as many
    negatives as there are rows in ``pos_pair``.

    Args:
        pos_pair: DataFrame with at least the columns ``Chromosome`` and
            ``Plasmid``; each row is one positive pair.
        seed: Seed for the random draws. The same seed and input yield the
            same negatives in the same row order.

    Returns:
        DataFrame with columns ``Chromosome``, ``Plasmid`` and ``Label``
        (always 0), holding ``len(pos_pair)`` distinct negative pairs in the
        order in which they were drawn.

    Raises:
        ValueError: If fewer than ``len(pos_pair)`` distinct negative pairs
            exist, in which case the sampling loop could never finish.
    """
    # Private generator: same draw sequence as np.random.seed(seed) followed
    # by np.random.choice, but NumPy's global random state is left untouched.
    rng = np.random.RandomState(seed)

    pos_set = set(zip(pos_pair['Chromosome'], pos_pair['Plasmid']))
    unique_chromosomes = pos_pair['Chromosome'].unique()
    unique_plasmids = pos_pair['Plasmid'].unique()
    target_count = len(pos_pair)

    # Every positive pair lies inside the chromosome x plasmid grid, so the
    # number of distinct negatives is the grid size minus the positives.
    available = len(unique_chromosomes) * len(unique_plasmids) - len(pos_set)
    if target_count > available:
        raise ValueError(
            f"Cannot draw {target_count} distinct negative pairs: "
            f"only {available} non-positive combinations exist."
        )

    # A dict is used as an insertion-ordered set: it rejects duplicates like
    # set() does, but keeps the draw order. Iterating a set of string tuples
    # depends on per-process hash randomisation, which would make the row
    # order differ between kernel restarts even with a fixed seed.
    drawn = {}
    while len(drawn) < target_count:
        pair = (rng.choice(unique_chromosomes), rng.choice(unique_plasmids))
        if pair not in pos_set:
            drawn[pair] = None

    neg_pairs = pd.DataFrame(list(drawn), columns=['Chromosome', 'Plasmid'])
    neg_pairs['Label'] = 0
    return neg_pairs


In [None]:
# Build the labelled training set for dataset i+1 (= 5): the positive pairs
# get Label 1 and are stacked on top of an equal number of randomly sampled
# negative pairs (Label 0). This cell handles a single dataset; there is no loop.
i = 4
seed = 42 + i
pos_pair = pd.read_csv(f'positive_train_{i+1}.csv').assign(Label=1)
neg_pair = generate_negative_pairs(pos_pair, seed)
combined = pd.concat(
    [pos_pair, neg_pair],
    ignore_index=True,
)
combined.to_csv(f'pos_neg_random_train_{i+1}.csv', index=False)
print(f"✅ Saved: pos_neg_random_train_{i+1}.csv (Total: {len(combined)} rows)")
# Preview five rows, chosen reproducibly with the same seed.
print(combined.sample(n=5, random_state=seed))

✅ Saved: pos_neg_random_train_5.csv (Total: 16542 rows)
          Chromosome        Plasmid  Label
16242  NZ_AP024262.1  NZ_CP070013.1      0
14143  NZ_LS450958.2  NZ_CP042531.1      0
12937  NZ_CP123639.1  NZ_CP083789.1      0
933    NZ_CP024273.1  NZ_CP024274.1      1
870    NZ_CP023899.1  NZ_CP023902.1      1


In [None]:
# Build the labelled validation set for dataset i+1 (= 5): the positive pairs
# get Label 1 and are stacked on top of an equal number of randomly sampled
# negative pairs (Label 0). This cell handles a single dataset; there is no loop.
i = 4
seed = 42 + i
pos_pair = pd.read_csv(f'positive_val_{i+1}.csv').assign(Label=1)
neg_pair = generate_negative_pairs(pos_pair, seed)
combined = pd.concat(
    [pos_pair, neg_pair],
    ignore_index=True,
)
combined.to_csv(f'pos_neg_random_val_{i+1}.csv', index=False)
print(f"✅ Saved: pos_neg_random_val_{i+1}.csv (Total: {len(combined)} rows)")
# Preview five rows, chosen reproducibly with the same seed.
print(combined.sample(n=5, random_state=seed))

✅ Saved: pos_neg_random_val_5.csv (Total: 1964 rows)
         Chromosome        Plasmid  Label
363   NZ_CP056251.1  NZ_CP056255.1      1
1655  NZ_CP058164.1  NZ_LR890485.1      0
230   NZ_CP047171.1  NZ_CP047172.1      1
728   NZ_CP048389.1  NZ_CP048395.1      1
1339  NZ_CP084825.1  NZ_CP042558.1      0


In [None]:
# Build the labelled test set for dataset i+1 (= 5): the positive pairs get
# Label 1 and are stacked on top of an equal number of randomly sampled
# negative pairs (Label 0). This cell handles a single dataset; there is no loop.
i = 4
seed = 42 + i
pos_pair = pd.read_csv(f'positive_test_{i+1}.csv').assign(Label=1)
neg_pair = generate_negative_pairs(pos_pair, seed)
combined = pd.concat(
    [pos_pair, neg_pair],
    ignore_index=True,
)
combined.to_csv(f'pos_neg_random_test_{i+1}.csv', index=False)
print(f"✅ Saved: pos_neg_random_test_{i+1}.csv (Total: {len(combined)} rows)")
# Preview five rows, chosen reproducibly with the same seed.
print(combined.sample(n=5, random_state=seed))

✅ Saved: pos_neg_random_test_5.csv (Total: 1960 rows)
         Chromosome        Plasmid  Label
446   NZ_CP075649.1  NZ_CP075650.1      1
1301  NZ_CP065572.1  NZ_OW970564.1      0
1014  NZ_CP116123.1  NZ_OW970377.1      0
946   NZ_LR890356.1  NZ_LR890359.1      1
293   NZ_CP038351.1  NZ_CP038352.1      1
