## **Raw FASTA file for the H37Rv reference genome**
Used for genome assembly, alignment, and sequence search.

In [5]:
# Relative path to the gzip-compressed FASTA of the H37Rv reference genome;
# the cells below pass it to the FASTA-loading helpers.
fasta_file = "raw/GCF_000195955.2_ASM19595v2_genomic.fna.gz"

In [6]:
# Raw Fasta

import gzip
import pandas as pd
from Bio import SeqIO
from io import StringIO

def fasta_to_dataframe(fasta_gz_path):
    """Load a gzip-compressed FASTA file into a DataFrame, one row per record.

    Parameters
    ----------
    fasta_gz_path : str or path-like
        Location of the gzip-compressed FASTA file.

    Returns
    -------
    pandas.DataFrame
        Columns ``sequence_id``, ``description``, ``sequence`` (the whole
        sequence as a string) and ``length`` (``len`` of the sequence).
        The four columns are present even when the file holds no records.
    """
    columns = ['sequence_id', 'description', 'sequence', 'length']

    # Hand the decompressed text stream straight to SeqIO so the file is
    # parsed record by record, instead of being read into one large string
    # and then copied a second time into a StringIO buffer.
    with gzip.open(fasta_gz_path, 'rt') as handle:
        # The rows are built inside the `with` block because the parser reads
        # from `handle`, which is closed as soon as the block exits.
        data = [
            {
                'sequence_id': record.id,
                'description': record.description,
                'sequence': str(record.seq),
                'length': len(record.seq),
            }
            for record in SeqIO.parse(handle, 'fasta')
        ]

    # Passing `columns` keeps the schema stable for an empty FASTA file.
    return pd.DataFrame(data, columns=columns)

# Parse the reference FASTA into a one-row-per-record frame and preview it.
df = fasta_to_dataframe(fasta_file)
df.head()

Unnamed: 0,sequence_id,description,sequence,length
0,NC_000962.3,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",TTGACCGATGACCCCGGTTCAGGCTTCACCACAGTGTGGAACGCGG...,4411532


In [8]:
# Persist the per-record frame to Parquet without writing the DataFrame index.
# NOTE(review): `df` is rebound by the database validation cell further down,
# so this cell must run before that one to save the FASTA frame.
df.to_parquet('staging/reference_linear_genome.parquet', index=False)

In [3]:
# Formatted Indexed Fasta
import pandas as pd

def fasta_to_positional_parquet(fasta_gz_path, output_parquet_path, genome_type="H37Rv reference"):
    """Explode a gzip-compressed FASTA into one row per base and save it as Parquet.

    Parameters
    ----------
    fasta_gz_path : str or path-like
        Gzip-compressed FASTA file, loaded through ``fasta_to_dataframe``.
    output_parquet_path : str or path-like
        Destination of the Parquet file.
    genome_type : str, default "H37Rv reference"
        Label written to the ``genome_type`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The frame that was written, with columns ``id`` (1-based running
        number across all records), ``genome_type``, ``chromosome`` (the
        record's FASTA description), ``position`` (1-based offset within its
        record) and ``base``.
    """
    columns = ['genome_type', 'chromosome', 'position', 'base']

    # Read the FASTA file into a DataFrame (one row per record)
    df_fasta = fasta_to_dataframe(fasta_gz_path)

    # Build each record's rows column-wise. Appending one dict per base would
    # allocate millions of small Python objects for a genome-sized sequence;
    # here only the list of single-character bases is materialised.
    frames = [
        pd.DataFrame({
            'genome_type': genome_type,
            'chromosome': row.description,
            'position': range(1, len(row.sequence) + 1),
            'base': list(row.sequence),
        })
        for row in df_fasta.itertuples(index=False)
        if row.sequence  # a zero-length record contributes no rows
    ]

    if frames:
        df_positional = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat rejects an empty list; keep the schema for an empty input.
        df_positional = pd.DataFrame(columns=columns)

    # Add an auto-incrementing ID starting at 1 (similar to SQLite) as the
    # first column.
    df_positional.insert(0, 'id', range(1, len(df_positional) + 1))

    # Save to Parquet file
    df_positional.to_parquet(output_parquet_path, index=False)

    return df_positional

# Usage example: explode the reference FASTA into one row per base, write the
# rows to the staging Parquet file, then preview the first few of them.
output_file = "staging/reference_genome.parquet"
result_df = fasta_to_positional_parquet(fasta_file, output_file)

result_df.head()

Unnamed: 0,id,genome_type,chromosome,position,base
0,1,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",1,T
1,2,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",2,T
2,3,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",3,G
3,4,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",4,A
4,5,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",5,C


In [4]:
# Database integration Validation

import sqlite3
import pandas as pd

# Connect to the database
conn = sqlite3.connect('mtb_warehouse.db')

# try/finally guarantees the connection is closed even when a query raises
# (for example if the reference_genome table does not exist); otherwise the
# open connection would linger in the kernel.
try:
    # Method 1: Using read_sql() - recommended approach
    query = "SELECT * FROM reference_genome LIMIT 10;"
    df = pd.read_sql(query, conn)

    # Method 2: Using fetchall() and converting to DataFrame
    cursor = conn.cursor()
    cursor.execute(query)  # same statement as Method 1
    rows = cursor.fetchall()
    # cursor.description holds one tuple per column; item 0 is the column name
    column_names = [column[0] for column in cursor.description]
    df2 = pd.DataFrame(rows, columns=column_names)
finally:
    # Close the connection
    conn.close()

# Display the DataFrame
display(df)

Unnamed: 0,id,genome_type,chromosome,position,base
0,1,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",1,T
1,2,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",2,T
2,3,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",3,G
3,4,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",4,A
4,5,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",5,C
5,6,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",6,C
6,7,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",7,G
7,8,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",8,A
8,9,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",9,T
9,10,H37Rv reference,"NC_000962.3 Mycobacterium tuberculosis H37Rv, ...",10,G
