In [20]:
import urllib.request
import requests
from bs4 import BeautifulSoup
import re
import os

In [5]:
def get_sample_names(url):
    """Collect sample name prefixes from the links of an HTML index page.

    The page at ``url`` is fetched and every ``<a>`` element is inspected.
    For each link whose ``href`` ends with ``_inter_30.hic`` the part of the
    ``href`` preceding that suffix is kept as a sample prefix.

    Parameters
    ----------
    url : str
        Address of the HTML page listing the ``*_inter_30.hic`` files.

    Returns
    -------
    list of str
        Prefixes in the order the links appear on the page (duplicates are
        not removed).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error page is
        never silently parsed as an empty listing.
    """
    hic_suffix = '_inter_30.hic'

    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    prefixes = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href attribute yield None; skip them instead of
        # failing on None.endswith(...).
        if href and href.endswith(hic_suffix):
            prefixes.append(href.split(hic_suffix)[0])
    return prefixes



In [27]:
def _read_listing_lines(url):
    """Return the body of the page at ``url`` decoded as UTF-8, split into lines."""
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8').splitlines()


def _first_href(line):
    """Return the target of the first ``<a href="...">`` in ``line``, or None."""
    marker = '<a href="'
    if marker not in line:
        return None
    return line.split(marker)[1].split('"')[0]


def _download_if_missing(file_url, local_dir, local_name):
    """Fetch ``file_url`` into ``local_dir/local_name`` unless that file exists."""
    local_filepath = os.path.join(local_dir, local_name)
    if os.path.exists(local_filepath):
        print(f"{local_name} exists")
        return

    # Download under a temporary name and rename only on success, so that an
    # interrupted transfer never leaves a truncated file that a later run
    # would mistake for a finished download.
    partial_path = local_filepath + '.part'
    try:
        urllib.request.urlretrieve(file_url, partial_path)
        os.replace(partial_path, local_filepath)
    except Exception as e:
        print(f"Error downloading {local_name} from {file_url}: {e}")
        return
    finally:
        # After a successful rename the partial file is already gone.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {local_name} successfully!")


def download_files(sample_names, base_url, file_type, out_dir):
    """Download the files of one type for every sample.

    For each sample, the listing at ``{base_url}/{sample}/`` is read and every
    sub-directory linked from it is listed in turn. Each file whose listing
    line contains the pattern for ``file_type`` is saved in
    ``out_dir/<file_type>``. The last ``_``/``-`` separated token of the
    sub-directory name is inserted after the second ``_``-separated token of
    the file name to build the local file name. Files that already exist
    locally are skipped. Problems with a single sample, sub-directory or file
    are printed and do not stop the remaining downloads.

    Parameters
    ----------
    sample_names : iterable of str
        Names of the sample directories below ``base_url``.
    base_url : str
        URL of the directory containing the sample directories (no trailing
        slash).
    file_type : {'hic', 'merged_nodups'}
        Which kind of file to download.
    out_dir : str
        Local directory under which the ``hic`` / ``merged_nodups`` folder is
        created.

    Raises
    ------
    ValueError
        If ``file_type`` is neither ``'hic'`` nor ``'merged_nodups'``.
    """
    # Choose the file name pattern and the matching destination folder.
    if file_type == 'hic':
        file_pattern = 'inter_30.hic'
        local_dir = os.path.join(out_dir, 'hic')
    elif file_type == 'merged_nodups':
        file_pattern = '_merged_nodups.txt.gz'
        local_dir = os.path.join(out_dir, 'merged_nodups')
    else:
        raise ValueError("Invalid file type. Use 'hic' or 'merged_nodups'.")

    # Create the local folder if it does not exist yet.
    os.makedirs(local_dir, exist_ok=True)

    for sample in sample_names:
        sample_url = f"{base_url}/{sample}/"

        try:
            sample_lines = _read_listing_lines(sample_url)
        except Exception as e:
            print(f"Error accessing sample directory {sample}: {e}")
            continue

        for line in sample_lines:
            href = _first_href(line)
            # Only links containing a '/' can name a sub-directory; plain file
            # links would otherwise turn into garbage URLs.
            if href is None or '/' not in href:
                continue
            dir_name = href.split('/')[0]
            # An empty name comes from an absolute link (e.g. the parent
            # directory); '?' marks sort links and '..' the parent directory.
            if not dir_name or dir_name.startswith('?') or dir_name.startswith('..'):
                continue

            subdir_url = f"{sample_url}{dir_name}/"
            suffix = re.split('[_-]', dir_name)[-1]

            try:
                sub_lines = _read_listing_lines(subdir_url)
            except Exception as e:
                print(f"Error accessing subdirectory {dir_name} in {sample}: {e}")
                continue

            for sub_line in sub_lines:
                if file_pattern not in sub_line:
                    continue
                file_name = _first_href(sub_line)
                if file_name is None:
                    # The pattern occurs in plain text, not in a link.
                    continue

                file_url = f"{subdir_url}{file_name}"
                name_parts = file_name.split('_')
                file_name_new = (
                    '_'.join(name_parts[:2]) + '_' + suffix + '_' + '_'.join(name_parts[2:])
                )
                _download_if_missing(file_url, local_dir, file_name_new)



In [None]:
# Page whose *_inter_30.hic links provide the sample names.
url = "https://genedev.bionet.nsc.ru/ftp/by_Project/LowInHIC_for_BCA/Translocations_Andrei/"
# Remote directory holding one sub-directory per sample, and the local
# directory that receives the downloads.
base_url = "https://genedev.bionet.nsc.ru/ftp/by_Project/LowInHIC_for_BCA/Samples"
out_dir = "/media/eternus1/nfs/projects/dpanchenko/test_embrio/data/"

prefixes = get_sample_names(url)

# Fetch both supported file types for every sample, .hic files first.
for file_type in ('hic', 'merged_nodups'):
    download_files(prefixes, base_url, file_type, out_dir)

In [None]:
### Download genome size files (optional)
# These files are stored in the data/genome directory

In [None]:
# Local directory that receives the genome files, and the two remote files.
local_filepath = '/media/eternus1/nfs/projects/dpanchenko/test_embrio/data/genome/'
t2t_genome = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt'
hg19_genome = 'https://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/referenceSequences/male.hg19.chrom.sizes'

# urlretrieve does not create missing parent directories, so make sure the
# target directory exists before downloading into it.
os.makedirs(local_filepath, exist_ok=True)

urllib.request.urlretrieve(t2t_genome, os.path.join(local_filepath, 'GCF_009914755.1_T2T-CHM13v2.0_assembly_report.txt'))
urllib.request.urlretrieve(hg19_genome, os.path.join(local_filepath, 'male.hg19.chrom.sizes'))