In [1]:
import pandas as pd

# Load the subsequence-vs-main similarity table (path is relative to the notebook's working directory).
df = pd.read_csv('./sub_vs_main_similarity.csv')

# Preview only the first row to check which columns are present.
df.head(1)

Unnamed: 0,rowID,sequence,description,group,genus,species_epithet,similarity,elapsed_time
0,0,CAGCGGCGTCCCCACGCTACTGATGGCACGCACAGACGCACAGGCG...,NZ_AYSW01000002.1 Robbsia andropogonis Ba3549 ...,bacteria,Robbsia,andropogonis,0.778842,0.65778


In [2]:
# Pull the similarity column into a plain list, then print each score below 0.5.
similarities = list(df['similarity'])

for low_score in (value for value in similarities if value < 0.5):
    print(low_score)

In [3]:
# Load the genomic sequence table into `df1`, separate from the similarity frame `df`.
df1 = pd.read_csv('./genomic_species.csv')
# Preview only the first row to check which columns are present.
df1.head(1)

Unnamed: 0,rowID,sequence,description,start_pos,end_pos,fasta_url,original_url,group,genus,species_epithet,set
0,0,CAGCGGCGTCCCCACGCTACTGATGGCACGCACAGACGCACAGGCG...,NZ_AYSW01000002.1 Robbsia andropogonis Ba3549 ...,0,6200,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ba...,bacteria,Robbsia,andropogonis,train


In [4]:
# Character length of every entry in the 'sequence' column; the set shows the distinct lengths.
lengths = list(map(len, df1['sequence']))
set(lengths)

{6200}

In [16]:
import os
import re
import pandas as pd

def read_all_csvs(folder_path, sizes=None):
    """Load every size-tagged CSV found under ``folder_path`` (recursively).

    A file is loaded when its name ends in ``.csv`` (case-insensitive) and
    contains one of ``sizes`` as a standalone run of digits, i.e. not directly
    preceded or followed by another digit (so ``150`` is not found inside
    ``1500``).  The first such number in the file name is written to a new
    ``size`` column of the loaded frame.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Root directory to walk.  A missing directory yields an empty list,
        because ``os.walk`` ignores directory-listing errors by default.
    sizes : iterable of int or digit strings, optional
        Sizes to recognise in file names.  Defaults to the sizes that were
        previously hard-coded in this function.

    Returns
    -------
    list of pandas.DataFrame
        One frame per matching file.  Directories and file names are visited
        in sorted order, so repeated runs return the frames in the same order.

    Raises
    ------
    ValueError
        If an entry of ``sizes`` cannot be converted to an integer.
    """
    if sizes is None:
        sizes = (50, 100, 150, 200, 250, 300, 1000, 1500, 2000, 2500, 3000)

    # Normalise to canonical digit strings (also rejects non-numeric sizes early).
    # Longest-first keeps the alternation order of the original implementation;
    # the digit lookarounds below are what actually prevent partial matches.
    size_tokens = sorted({str(int(s)) for s in sizes}, key=lambda tok: (-len(tok), tok))
    if not size_tokens:
        # An empty alternation would match the empty string and make int('') fail below.
        return []

    size_pattern = re.compile(r'(?<!\d)(' + '|'.join(size_tokens) + r')(?!\d)')

    dataframes = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting `dirs` in place controls the order in which os.walk descends.
        dirs.sort()
        for fname in sorted(files):
            if not fname.lower().endswith('.csv'):
                continue
            m = size_pattern.search(fname)
            if m is None:
                continue
            frame = pd.read_csv(os.path.join(root, fname))
            frame['size'] = int(m.group(1))
            dataframes.append(frame)

    return dataframes

# Example usage: load every size-tagged CSV below the data1 folder (path relative to the working directory).
folder = './cmsc701_final-main/data1'
dfs = read_all_csvs(folder)
# dfs is now a list of pandas.DataFrame objects, one per matching CSV file,
# each with an added integer 'size' column parsed from its file name.


In [13]:
# Number of frames returned by read_all_csvs, i.e. how many CSV files matched.
len(dfs)

11

In [17]:
# Stack all per-file frames into one table; ignore_index=True replaces the per-file indices with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)
# Preview only the first row to check the combined column layout.
combined_df.head(1)

Unnamed: 0.1,Unnamed: 0,label,subsequence,description,group,genus,species_epithet,size
0,0,1,GGACCCTGCAAACGCCGTGCTCCGTGTTGCCGTGGCCGGCGTGTCA...,NZ_AYSW01000002.1 Robbsia andropogonis Ba3549 ...,bacteria,Robbsia,andropogonis,100


In [18]:
# Distinct values of the 'size' column, to see which sizes ended up in the combined table.
set(combined_df['size'])

{50, 100, 150, 200, 250, 300, 1000, 1500, 2000, 2500, 3000}

In [19]:
# Copy of the combined table without the "subsequence" and "description" columns; combined_df itself is not modified.
df2 = combined_df.drop(columns=["subsequence", "description"])

In [20]:
# index=False: the 0..n-1 index produced by pd.concat(..., ignore_index=True) carries no
# information, and writing it adds another "Unnamed: 0"-style column every time the file is reloaded.
df2.to_csv('rows.csv', index=False)

In [10]:
# Total number of rows across all concatenated frames.
len(combined_df)

93500

In [11]:
# index=False: the 0..n-1 index produced by pd.concat(..., ignore_index=True) carries no
# information, and writing it adds another "Unnamed: 0"-style column every time the file is reloaded.
combined_df.to_csv('./combined_data1.csv', index=False)

In [None]:
import pandas as pd
import numpy as np

# 1) parameters
input_csv   = "./genomic_species.csv"
output_csv  = "./genomic_species_subseq.csv"
lengths     = [50, 100, 150, 200, 250, 300, 400, 500, 1000, 2000]  # one random subsequence is sampled per length
SUBSEQ_SEED = 0  # fixed seed so re-running this cell reproduces the same subsequences

# 2) load original data
df = pd.read_csv(input_csv)
print(len(set(df['species_epithet'])))  # distinct species epithets
print(len(set(df['genus'])))            # distinct genera

# 3) for each seq, sample one random subseq per desired length
rng = np.random.default_rng(SUBSEQ_SEED)
out_rows = []
for seq, label, group, species_epithet in zip(
    df["sequence"], df["genus"], df["group"], df["species_epithet"]
):
    L = len(seq)
    for l in lengths:
        if l > L:
            # skip lengths longer than the sequence
            continue
        # Upper bound is exclusive, so the last valid start position is L - l.
        start = int(rng.integers(0, L - l + 1))
        sub   = seq[start : start + l]
        out_rows.append({"sequence": sub, "genus": label, "group": group, 'species_epithet': species_epithet})

# 4) save new training file
new_df = pd.DataFrame(out_rows)
new_df.to_csv(output_csv, index=False)
print(f"Wrote {len(new_df)} subsequences to {output_csv}")


712
848
Wrote 8500 subsequences to ./genomic_species_subseq.csv


In [6]:
import pandas as pd
import numpy as np

# 1) parameters
input_csv  = "./genomic_species.csv"
output_csv = "./genomic_species_subseq.csv"
lengths    = [50, 100, 150, 200, 250, 300, 400, 500, 1000, 2000]

AUGMENT_STEP    = 6     # extra subsequence lengths are multiples of this
AUGMENT_MIN_LEN = 50    # extra lengths shorter than this are not sampled
AUGMENT_MAX_LEN = 2000  # extra lengths never exceed this (nor the sequence length)
SUBSEQ_SEED     = 0     # fixed seed so re-running this cell reproduces the same subsequences


def _random_subsequence(seq, length, rng):
    """Return a slice of ``seq`` of exactly ``length`` characters at a uniformly random offset.

    The caller must ensure ``length <= len(seq)``.
    """
    # Upper bound is exclusive, so the last valid start position is len(seq) - length.
    start = int(rng.integers(0, len(seq) - length + 1))
    return seq[start : start + length]


# 2) load original data
df = pd.read_csv(input_csv)
print(f"# species_epithet: {df['species_epithet'].nunique()}")
print(f"# genus:          {df['genus'].nunique()}")

rng = np.random.default_rng(SUBSEQ_SEED)
out_rows = []

# Smallest multiple of AUGMENT_STEP that is >= AUGMENT_MIN_LEN (54 for step 6, minimum 50).
first_augment_len = -(-AUGMENT_MIN_LEN // AUGMENT_STEP) * AUGMENT_STEP

# 3) for each seq, sample one random subseq per fixed length…
#    …and one extra subseq for each multiple of AUGMENT_STEP between AUGMENT_MIN_LEN
#    and min(len(seq), AUGMENT_MAX_LEN)
for seq, genus, group, species_epithet in zip(
    df["sequence"], df["genus"], df["group"], df["species_epithet"]
):
    L = len(seq)

    # original lengths (those that fit in this sequence), followed by the extra lengths
    fixed_lengths = [l for l in lengths if l <= L]
    extra_lengths = range(first_augment_len, min(L, AUGMENT_MAX_LEN) + 1, AUGMENT_STEP)

    for l in [*fixed_lengths, *extra_lengths]:
        out_rows.append({
            "sequence":        _random_subsequence(seq, l, rng),
            "genus":           genus,
            "group":           group,
            "species_epithet": species_epithet,
        })

# 4) save augmented file
new_df = pd.DataFrame(out_rows)
new_df.to_csv(output_csv, index=False)
print(f"Wrote {len(new_df)} subsequences to {output_csv}")

# species_epithet: 712
# genus:          848
Wrote 284750 subsequences to ./genomic_species_subseq.csv


In [5]:
# Count the distinct species_epithet values present in the generated subsequence table.
len(set(new_df["species_epithet"]))

712