In [None]:
def read_dna_file(file_path):
    """
    Reads an AncestryDNA raw data .txt file and extracts SNP records.
    Skips header lines (#) and the column header.
    Returns a list of dicts with keys: rsid, chromosome, position, allele1, allele2.
    """
    snps = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            # Skip comments and empty lines
            if line.startswith("#") or not line:
                continue
            # Skip the header row if present
            if line.startswith("rsid"):
                continue

            fields = line.split("\t")
            if len(fields) >= 5:
                rsid, chromosome, position, allele1, allele2 = fields[:5]
                snps.append({
                    'rsid': rsid,
                    'chromosome': chromosome,
                    'position': position,
                    'allele1': allele1,
                    'allele2': allele2
                })
    return snps


read_dna_file(r'C:\Users\greys\OneDrive\zando\AncestoryData.txt')

In [None]:
import pandas as pd

genome_file = r'C:\Users\greys\OneDrive\zando\AncestoryData.txt'
genome_data = pd.read_csv(
    genome_file,
    comment='#',
    sep='\t',
    header=None,
    names=['rsid', 'chromosome', 'al', 'genotype'],
    dtype={'rsid': str, 'chromosome': str,
            'position': int, 'genotype': str},
    low_memory=False
)

In [2]:
import pandas as pd

import os
def verify_and_read_txt(filepath):
    # Check if the file is a .txt file
    if not filepath.endswith('.txt'):
        raise ValueError("The file is not a .txt file")

    # Check if the file exists
    if not os.path.isfile(filepath):
        raise FileNotFoundError("The file does not exist")

    # List of valid columns to search for
    valid_columns = ['rsid', 'chromosome',
                     'position', 'allele1', 'allele2', 'genotype']
    found_columns = set()

    # Read the first 100 lines of the file
    with open(filepath, 'r') as file:
        for i, line in enumerate(file):
            if i >= 100:
                break
            # Check if any of the valid columns are in the line
            for column in valid_columns:
                if column in line:
                    found_columns.add(column)

    return list(found_columns)

def get_genome_data(genome_file):
    genome_data = pd.read_csv(
        genome_file,
        comment='#',
        sep='\t',
        header=None,
        # names=['rsid', 'chromosome', 'position', 'genotype'],
        names=['rsid','chromosome','position','allele1','allele2'],

        dtype={'rsid': str, 'chromosome': str,
               'position': str, 'allele1': str, 'allele2': str},
        low_memory=False
    )
    return genome_data.to_dict(orient='records')

df = verify_and_read_txt('__AncestoryData.txt')
df

['chromosome', 'allele1', 'position', 'allele2', 'genotype', 'rsid']