In [18]:
import time

import pandas as pd
import requests

def create_hgvs(chrom, pos, ref, alt):
    """Assemble the ``<chrom>:g.<pos><ref>><alt>`` search string for a variant.

    The four values are interpolated unchanged; nothing is validated or
    normalised.

    NOTE(review): the result carries no reference-sequence accession and the
    ``ref>alt`` form only matches HGVS substitution syntax, so indel records
    presumably produce strings ClinVar cannot resolve -- confirm.
    """
    return "{}:g.{}{}>{}".format(chrom, pos, ref, alt)

def search_variation_id(hgvs_str):
    """Search ClinVar through NCBI esearch and return the first matching ID.

    Args:
        hgvs_str: Search term sent as the ``term`` query parameter.

    Returns:
        The first entry of ``esearchresult.idlist`` in the JSON reply, or
        ``None`` when that list is empty or missing, or when anything goes
        wrong while sending the request or decoding the reply.
    """
    endpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query = {"db": "clinvar", "term": hgvs_str, "retmode": "json"}
    try:
        reply = requests.get(endpoint, params=query, timeout=10)
        reply.raise_for_status()
        hits = reply.json().get("esearchresult", {}).get("idlist", [])
        if not hits:
            return None
        return hits[0]
    except Exception:
        # Best effort: any failure is reported as "no ID found".
        return None

def parse_vcf(file_path):
    """Read the variant records of an uncompressed, tab-separated VCF file.

    Lines starting with ``#`` are skipped.  Blank lines and lines with fewer
    than five tab-separated columns are skipped as well, so a trailing empty
    line or a truncated record does not abort the parse with ``IndexError``.

    Args:
        file_path: Path of the VCF file to read.

    Returns:
        A list of ``(chrom, pos, ref, alt)`` string tuples in file order,
        taken from columns 1, 2, 4 and 5 of each record.
    """
    records = []
    with open(file_path, "r") as vcf:
        for line in vcf:
            if line.startswith("#"):
                continue  # skip header lines
            parts = line.strip().split("\t")
            if len(parts) < 5:
                continue  # blank or truncated line: no REF/ALT to read
            chrom, pos, ref, alt = parts[0], parts[1], parts[3], parts[4]
            records.append((chrom, pos, ref, alt))
    return records

# 📂 Read the VCF file
vcf_path = "clinvar.vcf"  # put your own file name here
variants = parse_vcf(vcf_path)

# 🔄 Look up the ClinVar ID of the first 100 variants
results = []
max_variants = 100  # only process the first 100 variants
request_pause = 0.34  # seconds between requests, to stay under 3 requests/second
for chrom, pos, ref, alt in variants[:max_variants]:
    hgvs = create_hgvs(chrom, pos, ref, alt)
    var_id = search_variation_id(hgvs)
    # Throttle the lookups: a failed request is reported as "not found"
    # by search_variation_id, so a rate-limit rejection would silently look
    # like a missing variant.
    time.sleep(request_pause)
    results.append({
        "CHROM": chrom,
        "POS": pos,
        "REF": ref,
        "ALT": alt,
        "HGVS": hgvs,
        "Variation ID": var_id or "Bulunamadı"
    })

# 📊 Write the results to Excel
df = pd.DataFrame(results)
df.to_excel("clinvar_variation_ids_first100.xlsx", index=False)
print("✅ İlk 100 varyant işlendi: clinvar_variation_ids_first100.xlsx")


✅ İlk 100 varyant işlendi: clinvar_variation_ids_first100.xlsx


In [20]:
import pandas as pd
import requests
import time

def _first_or_empty(items):
    """Return ``items[0]``, or an empty dict when ``items`` is empty or None."""
    return items[0] if items else {}


def get_variation_info(variation_id):
    """Fetch one ClinVar esummary record and flatten it into a single row.

    Args:
        variation_id: ClinVar UID sent as the ``id`` query parameter.

    Returns:
        A dict with the keys "Title", "Accession", "Gene", "Variant Type",
        "GRCh38 chr", "GRCh38 pos", "GRCh37 chr", "GRCh37 pos", "SPDI",
        "Clinical Significance", "Condition", "Review Status",
        "Last Evaluated", "Allele Frequencies" and "Molecular Consequences".
        A value missing from the record is reported as ``"Yok"``; this
        includes empty ``genes``, ``variation_set`` and ``trait_set`` lists,
        which no longer discard the rest of the record.  If the request or
        the decoding of the reply fails, every value is ``"HATA"``.
    """
    fields = (
        "Title", "Accession", "Gene", "Variant Type",
        "GRCh38 chr", "GRCh38 pos", "GRCh37 chr", "GRCh37 pos",
        "SPDI", "Clinical Significance", "Condition", "Review Status",
        "Last Evaluated", "Allele Frequencies", "Molecular Consequences",
    )
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    # Let requests build and escape the query string.
    params = {"db": "clinvar", "id": variation_id, "retmode": "json"}
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        vid = list(data["result"]["uids"])[0]
        info = data["result"][vid]

        # Extract the individual fields
        gene = _first_or_empty(info.get("genes")).get("symbol", "Yok")

        variation = _first_or_empty(info.get("variation_set"))
        locations = variation.get("variation_loc") or []
        grch38 = next((loc for loc in locations if loc.get("assembly_name") == "GRCh38"), {})
        grch37 = next((loc for loc in locations if loc.get("assembly_name") == "GRCh37"), {})

        clinical = info.get("germline_classification") or {}
        trait = _first_or_empty(clinical.get("trait_set")).get("trait_name", "Yok")

        allele_freqs = variation.get("allele_freq_set") or []
        freq_str = "\n".join(
            f"{freq.get('source', 'Yok')}: {freq.get('value', 'Yok')}" for freq in allele_freqs
        ) or "Yok"

        molecular = ", ".join(info.get("molecular_consequence_list") or []) or "Yok"

        return {
            "Title": info.get("title", "Yok"),
            "Accession": info.get("accession_version", "Yok"),
            "Gene": gene,
            "Variant Type": info.get("obj_type", "Yok"),
            "GRCh38 chr": grch38.get("chr", "Yok"),
            "GRCh38 pos": grch38.get("start", "Yok"),
            "GRCh37 chr": grch37.get("chr", "Yok"),
            "GRCh37 pos": grch37.get("start", "Yok"),
            "SPDI": variation.get("canonical_spdi", "Yok"),
            "Clinical Significance": clinical.get("description", "Yok"),
            "Condition": trait,
            "Review Status": clinical.get("review_status", "Yok"),
            "Last Evaluated": clinical.get("last_evaluated", "Yok"),
            "Allele Frequencies": freq_str,
            "Molecular Consequences": molecular
        }
    except Exception:
        # Best effort: a failed lookup yields a placeholder row.  Unlike a
        # bare ``except:``, this lets KeyboardInterrupt stop a long batch.
        return dict.fromkeys(fields, "HATA")

# 🔽 Load the Excel file produced by the ID lookup
input_path = "clinvar_variation_ids_first100.xlsx"  # you may need to change the file name
df = pd.read_excel(input_path)

# 🔍 Keep only rows whose Variation ID is valid (all digits)
is_numeric_id = df["Variation ID"].apply(lambda value: str(value).isdigit())
df_valid = df[is_numeric_id].copy()

# 🧠 Fetch the details for every ID
info_list = []
for var_id in df_valid["Variation ID"]:
    info = get_variation_info(str(var_id))
    info_list.append(info)
    time.sleep(0.34)  # to respect the 3 requests/second limit

# 🔗 Append the new columns next to the original ones
info_df = pd.DataFrame(info_list)
df_valid_renumbered = df_valid.reset_index(drop=True)
df_result = pd.concat([df_valid_renumbered, info_df], axis=1)

# 📁 Write the combined table to a new Excel file
output_path = "clinvar_detailed_results.xlsx"
df_result.to_excel(output_path, index=False)
print(f"✅ Bitti: {output_path}")


✅ Bitti: clinvar_detailed_results.xlsx


In [17]:
import gzip
import shutil

# Input and output file paths
input_path = "clinvar.vcf.gz"
output_path = "clinvar.vcf"

# Decompress the .gz file into a plain .vcf file.  Both files are opened in
# binary mode so the bytes are copied unchanged; a text-mode copy would pass
# through the platform's default encoding and newline translation.
with gzip.open(input_path, 'rb') as f_in, open(output_path, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print("VCF dosyası başarıyla oluşturuldu:", output_path)


VCF dosyası başarıyla oluşturuldu: clinvar.vcf


In [1]:
import requests
import pandas as pd
import time

# Build the HGVS-style search string
def create_hgvs(chrom, pos, ref, alt):
    """Return ``chrom:g.posref>alt`` assembled from the four variant fields."""
    pieces = [str(chrom), ":g.", str(pos), str(ref), ">", str(alt)]
    return "".join(pieces)

# Get the Variation ID from ClinVar
def search_variation_id(hgvs_str):
    """Search ClinVar through NCBI esearch and return the first matching ID.

    Args:
        hgvs_str: Search term sent as the ``term`` query parameter.

    Returns:
        The first entry of ``esearchresult.idlist`` in the JSON reply, or
        ``None`` when the list is empty or missing, when the server answers
        with an HTTP error status, or when the request or JSON decoding
        fails.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {"db": "clinvar", "term": hgvs_str, "retmode": "json"}
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        idlist = data.get("esearchresult", {}).get("idlist", [])
        # An empty hit list is the normal "not found" reply; handle it
        # explicitly instead of relying on an IndexError being swallowed.
        return idlist[0] if idlist else None
    except Exception:
        # Best effort for request/decoding failures only; KeyboardInterrupt
        # and SystemExit propagate so a long batch can be stopped.
        return None

# Read a window of records from the VCF file
def parse_vcf(file_path, start=100, end=200):
    """Read the variant records whose data-line index is in ``[start, end)``.

    Lines starting with ``#`` are skipped and not counted.  Blank lines are
    ignored and not counted either.  Every other line advances a zero-based
    counter; a counted line inside the window that has fewer than five
    tab-separated columns is skipped instead of raising ``IndexError``.
    Reading stops as soon as the counter reaches ``end``.

    Args:
        file_path: Path of an uncompressed, tab-separated VCF file.
        start: Index of the first data line to return (inclusive).
        end: Index at which to stop (exclusive).

    Returns:
        A list of ``(chrom, pos, ref, alt)`` string tuples in file order.
    """
    records = []
    counter = 0
    with open(file_path, "r") as vcf:
        for line in vcf:
            if line.startswith("#"):
                continue
            stripped = line.strip()
            if not stripped:
                continue  # blank line: not a record, does not advance the counter
            if start <= counter < end:
                parts = stripped.split("\t")
                if len(parts) >= 5:  # truncated records have no REF/ALT
                    records.append((parts[0], parts[1], parts[3], parts[4]))
            counter += 1
            if counter >= end:
                break
    return records

# Get detailed variant information from ClinVar
def get_variation_info(variation_id):
    """Fetch one ClinVar esummary record and return its key fields.

    Args:
        variation_id: ClinVar UID sent as the ``id`` query parameter.

    Returns:
        A dict with the keys "Title", "Gene", "Variant Type",
        "Clinical Significance", "Condition", "Review Status" and
        "Last Evaluated".  A value missing from the record is reported as
        ``"Yok"``; an empty ``genes`` list now yields ``"Yok"`` for "Gene"
        instead of discarding the whole record.  If the request, an HTTP
        error status or the decoding of the reply fails, every value is
        ``"HATA"``.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {"db": "clinvar", "id": variation_id, "retmode": "json"}
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        vid = data["result"]["uids"][0]
        info = data["result"][vid]

        # ``or`` also covers keys that are present but empty/None, which a
        # ``.get(key, default)`` lookup would not replace.
        genes = info.get("genes") or [{}]
        classification = info.get("germline_classification") or {}

        trait_list = [t.get("trait_name", "Yok") for t in classification.get("trait_set") or []]
        trait = ", ".join(trait_list) if trait_list else "Yok"

        return {
            "Title": info.get("title", "Yok"),
            "Gene": genes[0].get("symbol", "Yok"),
            "Variant Type": info.get("obj_type", "Yok"),
            "Clinical Significance": classification.get("description", "Yok"),
            "Condition": trait,
            "Review Status": classification.get("review_status", "Yok"),
            "Last Evaluated": classification.get("last_evaluated", "Yok")
        }
    except Exception:
        # Best effort: a failed lookup yields a placeholder row.  Unlike a
        # bare ``except:``, this lets KeyboardInterrupt stop a long batch.
        return {
            "Title": "HATA", "Gene": "HATA", "Variant Type": "HATA",
            "Clinical Significance": "HATA", "Condition": "HATA",
            "Review Status": "HATA", "Last Evaluated": "HATA"
        }

# 🔁 Main flow
def process_vcf(vcf_path, output_path="clinvar_detailed_results.xlsx", start=0, end=100):
    """Look up a window of VCF variants in ClinVar and write them to Excel.

    For each variant returned by ``parse_vcf`` a search string is built, its
    Variation ID is searched, and, when an ID is found, the detail record is
    fetched.  Every request is followed by a pause, including the search
    request, so the loop never issues back-to-back calls.

    Args:
        vcf_path: Path of the VCF file, forwarded to ``parse_vcf``.
        output_path: Path of the Excel file to write.
        start: Forwarded to ``parse_vcf`` as ``start``.
        end: Forwarded to ``parse_vcf`` as ``end``.

    Returns:
        None.  One row per variant is written to ``output_path`` and a
        completion message is printed.
    """
    request_pause = 0.34  # seconds between requests (API rate limit)
    # Detail columns used when no Variation ID was found for a variant.
    missing_detail = {
        "Title": "Yok", "Gene": "Yok", "Variant Type": "Yok",
        "Clinical Significance": "Yok", "Condition": "Yok",
        "Review Status": "Yok", "Last Evaluated": "Yok"
    }

    variants = parse_vcf(vcf_path, start=start, end=end)
    result_rows = []

    for chrom, pos, ref, alt in variants:
        hgvs = create_hgvs(chrom, pos, ref, alt)
        var_id = search_variation_id(hgvs)
        time.sleep(request_pause)  # the search request counts against the limit too
        basic_info = {
            "CHROM": chrom,
            "POS": pos,
            "REF": ref,
            "ALT": alt,
            "HGVS": hgvs,
            "Variation ID": var_id if var_id else "Bulunamadı"
        }

        if var_id:
            detail_info = get_variation_info(var_id)
            time.sleep(request_pause)
        else:
            detail_info = dict(missing_detail)

        result_rows.append({**basic_info, **detail_info})

    df = pd.DataFrame(result_rows)
    df.to_excel(output_path, index=False)
    print(f"✅ İşlem tamamlandı: {output_path}")

# Process the variant window start=100, end=200 and write it to Excel
process_vcf(
    vcf_path="clinvar.vcf",
    output_path="clinvar_100_200.xlsx",
    start=100,
    end=200,
)


✅ İşlem tamamlandı: clinvar_100_200.xlsx


In [5]:
import pandas as pd

def read_clinvar_vcf(vcf_path, max_rows=20):
    """Load the first records of a VCF file into a DataFrame of strings.

    Lines starting with ``#`` and lines with fewer than eight tab-separated
    columns are skipped.  Only the first eight columns of a record are kept.

    Args:
        vcf_path: Path of an uncompressed, tab-separated VCF file.
        max_rows: Maximum number of records to return.  ``0`` returns an
            empty frame; ``None`` reads the whole file.

    Returns:
        A ``pandas.DataFrame`` with the columns CHROM, POS, ID, REF, ALT,
        QUAL, FILTER and INFO (present even when no record was read).
    """
    columns = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    rows = []
    with open(vcf_path, 'r') as f:
        for line in f:
            # Check the limit before reading a record so max_rows=0 yields
            # no rows instead of one.
            if max_rows is not None and len(rows) >= max_rows:
                break
            if line.startswith("#"):
                continue  # skip header lines
            parts = line.strip().split('\t')
            if len(parts) >= len(columns):
                rows.append(dict(zip(columns, parts)))
    return pd.DataFrame(rows, columns=columns)

# Usage: load at most 20 records from the ClinVar VCF
df = read_clinvar_vcf(vcf_path="clinvar.vcf", max_rows=20)


In [6]:
# Show the DataFrame as this cell's output.
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,1,66926,3385321,AG,A,.,.,ALLELEID=3544463;CLNDISDB=Human_Phenotype_Onto...
1,1,69134,2205837,A,G,.,.,ALLELEID=2193183;CLNDISDB=MedGen:CN169374;CLND...
2,1,69314,3205580,T,G,.,.,ALLELEID=3374047;CLNDISDB=MedGen:CN169374;CLND...
3,1,69423,3205581,G,A,.,.,ALLELEID=3374048;CLNDISDB=MedGen:CN169374;CLND...
4,1,69581,2252161,C,G,.,.,ALLELEID=2238986;CLNDISDB=MedGen:CN169374;CLND...
5,1,69682,2396347,G,A,.,.,AF_EXAC=0.00007;ALLELEID=2386655;CLNDISDB=MedG...
6,1,69731,3205582,T,C,.,.,ALLELEID=3374049;CLNDISDB=MedGen:CN169374;CLND...
7,1,69769,2288999,T,C,.,.,ALLELEID=2278803;CLNDISDB=MedGen:CN169374;CLND...
8,1,69995,2351346,G,C,.,.,ALLELEID=2333177;CLNDISDB=MedGen:CN169374;CLND...
9,1,809284,3892489,T,TGGTCAATCA,.,.,"ALLELEID=4008375;CLNDISDB=MONDO:MONDO:0024537,..."
