In [40]:
# Setup project environment
import sys
from pathlib import Path

# Import setup_project
sys.path.append(str(Path.cwd()))
from setup_project import setup_environment

# Run setup
paths = setup_environment()

✅ Environment configured successfully!


In [41]:
# Now you can use the paths
db_path = paths["DB_PATH"]
raw_data_dir = paths["RAW_DATA_DIR"]

# Connect to the database
from sqlalchemy import create_engine, inspect, MetaData

# Create database file if missing
if not db_path.exists():
    print(f"Database not found at {db_path}, creating a new one...")
    db_path.parent.mkdir(parents=True, exist_ok=True)
    db_path.touch()

engine = create_engine(f"sqlite:///{db_path}")
connection = engine.connect()
inspector = inspect(engine)
metadata = MetaData()

print(f"✅ Connected to database at {db_path}")

✅ Connected to database at /home/falatfernando/Desktop/bdq_resistance_study/mtb_resistance_db/mtb_resistance.db


In [42]:
# Get all table names
table_names = inspector.get_table_names()
print("Tables in database:", table_names)

# Get schema information for a specific table
for table_name in table_names:
    columns = inspector.get_columns(table_name)
    print(f"\nColumns in {table_name}:")
    for column in columns:
        print(f"  {column['name']}: {column['type']}")

Tables in database: ['reference_genome']

Columns in reference_genome:
  id: INTEGER
  seq_id: VARCHAR
  source: VARCHAR
  feature: VARCHAR
  start: INTEGER
  end: INTEGER
  score: FLOAT
  strand: VARCHAR
  frame: VARCHAR
  attribute: VARCHAR


In [43]:
from sqlalchemy import Table, Column, Integer, String, Float, MetaData

metadata = MetaData()

reference_genome = Table(
    "reference_genome",
    metadata,
    Column("id", Integer, primary_key=True, autoincrement=True),
    Column("seq_id", String),
    Column("source", String),
    Column("feature", String),
    Column("start", Integer),
    Column("end", Integer),
    Column("score", Float),
    Column("strand", String),
    Column("frame", String),
    Column("attribute", String),
)

In [44]:
metadata.create_all(engine)

In [45]:
def parse_gtf(gtf_path):
    columns = [
        "seq_id", "source", "feature", 
        "start", "end", "score", 
        "strand", "frame", "attribute"
    ]
    gtf = pd.read_csv(
        gtf_path, 
        sep="\t", 
        comment="#", 
        names=columns,
        compression="gzip"
    )
    return gtf

# Load the GTF
gtf_df = parse_gtf(os.path.join(raw_data_dir, "GCF_000195955.2_ASM19595v2_genomic.gtf.gz"))

# Insert into database
gtf_df.to_sql("reference_genome", con=engine, if_exists="append", index=False)


15866

In [46]:
import pandas as pd
from sqlalchemy import create_engine

df = pd.read_sql("SELECT * FROM reference_genome LIMIT 10", engine)

In [47]:
df

Unnamed: 0,id,seq_id,source,feature,start,end,score,strand,frame,attribute
0,1,NC_000962.3,RefSeq,gene,1,1524,.,+,.,"gene_id ""Rv0001""; transcript_id """"; db_xref ""G..."
1,2,NC_000962.3,RefSeq,CDS,1,1521,.,+,0,"gene_id ""Rv0001""; transcript_id ""unassigned_tr..."
2,3,NC_000962.3,RefSeq,start_codon,1,3,.,+,0,"gene_id ""Rv0001""; transcript_id ""unassigned_tr..."
3,4,NC_000962.3,RefSeq,stop_codon,1522,1524,.,+,0,"gene_id ""Rv0001""; transcript_id ""unassigned_tr..."
4,5,NC_000962.3,RefSeq,gene,2052,3260,.,+,.,"gene_id ""Rv0002""; transcript_id """"; db_xref ""G..."
5,6,NC_000962.3,RefSeq,CDS,2052,3257,.,+,0,"gene_id ""Rv0002""; transcript_id ""unassigned_tr..."
6,7,NC_000962.3,RefSeq,start_codon,2052,2054,.,+,0,"gene_id ""Rv0002""; transcript_id ""unassigned_tr..."
7,8,NC_000962.3,RefSeq,stop_codon,3258,3260,.,+,0,"gene_id ""Rv0002""; transcript_id ""unassigned_tr..."
8,9,NC_000962.3,RefSeq,gene,3280,4437,.,+,.,"gene_id ""Rv0003""; transcript_id """"; db_xref ""G..."
9,10,NC_000962.3,RefSeq,CDS,3280,4434,.,+,0,"gene_id ""Rv0003""; transcript_id ""unassigned_tr..."
