In [6]:
import pandas as pd
import openpyxl
from itertools import chain
import re

structure_columns = ['Helix', 'Turn', 'Beta strand']

def extract_positions(cell, col_name):
    """Return the position strings of one feature type found in a feature cell.

    Parameters
    ----------
    cell : str or NaN
        Raw cell text made of '; '-separated items, e.g.
        ``'HELIX 17..20; /evidence="ECO:0007829|PDB:6Y7F"; HELIX 23..25; ...'``.
    col_name : str
        Name of the column the cell came from ('Helix', 'Turn' or
        'Beta strand'); it selects which feature keyword to look for.

    Returns
    -------
    list of str
        The text following the feature keyword of every matching item
        (e.g. ``['17..20', '23..25']``), in order of appearance. Empty for a
        missing cell or for a column name with no known keyword.
    """
    if pd.isna(cell):
        return []

    prefix_map = {
        'Helix': 'HELIX',
        'Turn': 'TURN',
        'Beta strand': 'STRAND'
    }

    prefix = prefix_map.get(col_name)
    if prefix is None:
        # An empty fallback prefix would match every item (evidence strings
        # included) and strip all of their spaces, so bail out instead.
        return []

    # Match the keyword together with its separating space and cut it off the
    # front only; str.replace would also touch later occurrences in the item.
    marker = f'{prefix} '
    return [item[len(marker):] for item in cell.split('; ') if item.startswith(marker)]

def extract_evidence_codes(cell):
    """Return the numeric part of every ECO evidence code mentioned in a cell.

    Parameters
    ----------
    cell : str or NaN
        Raw feature cell text, e.g.
        ``'HELIX 780..782; /evidence="ECO:0007829|PDB:2MIO"'``.

    Returns
    -------
    list of str
        One entry per ``ECO:<digits>`` occurrence, digits only (leading zeros
        kept, e.g. ``'0007829'``), in order of appearance. Empty for a missing
        cell.
    """
    if pd.isna(cell):
        return []

    # Capture only the digits. Matching "everything up to the next '|'" breaks
    # when an evidence has no '|source' part: the match then runs through the
    # closing quote and the following features, swallowing their codes.
    return re.findall(r'ECO:(\d+)', cell)

def extract_subsequences(sequence, positions):
    """Cut `sequence` into one piece per (start, end) pair.

    Each pair is treated as 1-based with an inclusive end: the piece is
    ``sequence[start - 1:end]``. Returns the pieces as a list, in the order
    the pairs were given.
    """
    pieces = []
    for first, last in positions:
        # Shift the start down by one for Python's 0-based slicing; the
        # inclusive end already equals the exclusive slice stop.
        pieces.append(sequence[first - 1:last])
    return pieces

def autoscale_excel(excel_path):
    """Widen every column of the workbook's active sheet to fit its content.

    The workbook at `excel_path` is loaded, each column's width is set to the
    length of its longest ``str(cell.value)`` plus 2, and the workbook is
    saved back to the same path.
    """
    workbook = openpyxl.load_workbook(excel_path)
    worksheet = workbook.active

    for cells in worksheet.columns:
        text_lengths = [len(str(entry.value)) for entry in cells]
        widest = max(text_lengths)
        # Column dimensions are keyed by letter, so translate the numeric
        # column index of the first cell.
        letter = openpyxl.utils.get_column_letter(cells[0].column)
        worksheet.column_dimensions[letter].width = widest + 2

    workbook.save(excel_path)

def fetch_data(url, timeout=10):
    """GET `url` and return the response body as text.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : int or float, default 10
        Passed straight to the request as its timeout.

    Returns
    -------
    str or None
        The response text when the status code is exactly 200. ``None`` when
        the status is anything else or when a ``RequestException`` was raised
        (the exception is printed, not re-raised).
    """
    import requests
    from requests.exceptions import RequestException

    body = None
    with requests.Session() as http:
        try:
            reply = http.get(url, timeout=timeout)
            # Turns 4xx/5xx answers into an HTTPError handled below.
            reply.raise_for_status()
            if reply.status_code == 200:
                body = reply.text
        except RequestException as err:
            print(f"An error occurred: {err}")
    return body

def read_url(url):
    """Download tab-separated text from `url` and parse it into a DataFrame.

    Returns the parsed DataFrame, or ``None`` (after printing a message) when
    `fetch_data` yields nothing — i.e. ``None`` or an empty string.
    """
    from io import StringIO

    payload = fetch_data(url)
    if not payload:
        print("Failed to retrieve or parse the data.")
        return None

    # Wrap the text in a file-like object so read_csv can consume it.
    return pd.read_csv(StringIO(payload), sep='\t')

In [7]:
# Local tab-separated input; the export cell also derives the .xlsx name from it.
file = 'example.tsv'
# UniProt REST stream query: fields accession, id, sequence, ft_helix, ft_turn
# and ft_strand, TSV format, restricted to reviewed entries.
url = 'https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Csequence%2Cft_helix%2Cft_turn%2Cft_strand&format=tsv&query=%28reviewed%3Atrue%29'

# Read from the local file by default; swap the two lines below to download
# the table from `url` instead.
data = pd.read_csv(file, sep='\t')
# data = read_url(url)
data

Unnamed: 0,Entry,Helix,Turn,Beta strand,Sequence,Entry Name
0,A1A4S6,"HELIX 780..782; /evidence=""ECO:0007829|PDB:2MIO""",,"STRAND 728..730; /evidence=""ECO:0007829|PDB:2M...",MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...,RHG10_HUMAN
1,A1L3X0,"HELIX 17..20; /evidence=""ECO:0007829|PDB:6Y7F""...","TURN 89..94; /evidence=""ECO:0007829|PDB:6Y7F""","STRAND 97..99; /evidence=""ECO:0007829|PDB:6Y7F""",MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,ELOV7_HUMAN
2,A2RUC4,"HELIX 16..22; /evidence=""ECO:0007829|PDB:3AL5""...","TURN 70..72; /evidence=""ECO:0007829|PDB:3AL6"";...","STRAND 10..13; /evidence=""ECO:0007829|PDB:3AL5...",MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTV...,TYW5_HUMAN
3,A4GXA9,"HELIX 82..86; /evidence=""ECO:0007829|PDB:7F6L""...","TURN 70..72; /evidence=""ECO:0007829|PDB:7F6L"";...","STRAND 73..80; /evidence=""ECO:0007829|PDB:7F6L...",MARVGPGRAGVSCQGRGRGRGGSGQRRPPTWEISDSDAEDSAGSEA...,EME2_HUMAN
4,A6H8Y1,"HELIX 305..317; /evidence=""ECO:0007829|PDB:5N9...",,,MFRRARLSVKPNVRPGVGARGSTASNPQRGRESPRPPDPATDSASK...,BDP1_HUMAN
5,A6NGG8,"HELIX 835..838; /evidence=""ECO:0007829|PDB:7LX...",,,MGCTPSHSDLVNSVAKSGIQFLKKPKAIRPGCQGGSERGSIPLLVK...,PCARE_HUMAN


In [8]:
# Step 1: per structure column, pull out the raw position strings
# ('17..20', ...). The column name selects which feature keyword to match.
positions_df = data[structure_columns].apply(lambda col: col.apply(lambda cell: extract_positions(cell, col.name)))

# Step 2: turn each position string into an integer (start, end) tuple.
# partition() instead of split()[0]/[1] so a position without '..' becomes
# (n, n) rather than raising IndexError.
# NOTE(review): assumes single-residue features are written as a bare number
# — confirm against the UniProt TSV format.
# NOTE(review): applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map; kept here because the notebook's pandas version is unknown.
positions_df = positions_df.applymap(
    lambda pos_list: [
        (int(start), int(end or start))
        for start, _, end in (p.partition('..') for p in pos_list)
    ]
)
positions_df

Unnamed: 0,Helix,Turn,Beta strand
0,"[(780, 782)]",[],"[(728, 730), (732, 737), (743, 746), (754, 760..."
1,"[(17, 20), (23, 25), (35, 49), (51, 56), (65, ...","[(89, 94)]","[(97, 99)]"
2,"[(16, 22), (24, 26), (40, 43), (46, 53), (82, ...","[(70, 72), (114, 116), (205, 207)]","[(10, 13), (30, 34), (57, 66), (76, 81), (106,..."
3,"[(82, 86), (88, 90), (91, 97), (147, 156), (18...","[(70, 72), (336, 340)]","[(73, 80), (102, 106), (110, 119), (141, 145),..."
4,"[(305, 317), (322, 328), (334, 347), (349, 357...",[],[]
5,"[(835, 838), (842, 845)]",[],[]


In [9]:
# Element-wise over the structure columns: replace every cell with the list
# returned by extract_evidence_codes (an empty list for missing cells).
evidence_df = data[structure_columns].applymap(extract_evidence_codes)
evidence_df

Unnamed: 0,Helix,Turn,Beta strand
0,[0007829],[],"[0007829, 0007829, 0007829, 0007829, 0007829, ..."
1,"[0007829, 0007829, 0007829, 0007829, 0007829, ...",[0007829],[0007829]
2,"[0007829, 0007829, 0007829, 0007829, 0007829, ...","[0007829, 0007829, 0007829]","[0007829, 0007829, 0007829, 0007829, 0007829, ..."
3,"[0007829, 0007829, 0007829, 0007829, 0007829, ...","[0007829, 0007829]","[0007829, 0007829, 0007829, 0007829, 0007829, ..."
4,"[0007829, 0007829, 0007829, 0007829, 0007829]",[],[]
5,"[0007829, 0007829]",[],[]


In [10]:
# For every row, slice the row's 'Sequence' at the parsed positions of each
# structure column. `row.name` is the row's index label and is used to look up
# the matching row of positions_df.
# NOTE(review): despite the `_df` suffix, the result shown below is a Series
# of dicts keyed by structure column (dtype: object), not a DataFrame.
subsequences_df = data.apply(lambda row: {col: extract_subsequences(row['Sequence'], positions_df.loc[row.name, col]) for col in structure_columns}, axis=1)
subsequences_df.tail()

1    {'Helix': ['WIKD', 'PRV', 'LPQTILLGFYVYFVT', '...
2    {'Helix': ['REQFMQH', 'YPQ', 'CTSK', 'VDYLSQVG...
3    {'Helix': ['TAILE', 'AGA', 'DVLMEAL', 'PEEFLQG...
4    {'Helix': ['NKETDMFFLAISM', 'FSMIGQL', 'RIEIKN...
5    {'Helix': ['MEVL', 'SFAS'], 'Turn': [], 'Beta ...
dtype: object

In [12]:
# Flatten the per-entry lists into one record per structural element:
# (entry, subsequence, start, end, structure type, evidence code).
excel_data = []

for idx, row in data.iterrows():
    entry = row['Entry']
    for structure_type in structure_columns:
        pos_list = positions_df.loc[idx, structure_type]
        subseq_list = subsequences_df.loc[idx][structure_type]
        evi_list = evidence_df.loc[idx, structure_type]

        # All three lists must line up one-to-one before zipping. The chained
        # form `a != b != c` means `a != b and b != c`; positions and
        # subsequences always have equal length, so that test could never
        # fire and zip() would silently drop the unmatched tail.
        if not (len(pos_list) == len(subseq_list) == len(evi_list)):
            print(f"Warning: Mismatch in number of positions, subsequences or evidences for Entry {entry}, Structure {structure_type}")
            continue

        for (start_pos, end_pos), subseq, evi in zip(pos_list, subseq_list, evi_list):
            excel_data.append({
                'Entry': entry,
                'Sequence': subseq,
                'Start_position': start_pos,
                'End_position': end_pos,
                'Type': structure_type,
                'ECO': evi
            })

# Build the frame once from the collected records.
excel_df = pd.DataFrame(excel_data)
excel_df

Unnamed: 0,Entry,Sequence,Start_position,End_position,Type,ECO
0,A1A4S6,QNY,780,782,Helix,0007829
1,A1A4S6,IRS,728,730,Beta strand,0007829
2,A1A4S6,KARAVY,732,737,Beta strand,0007829
3,A1A4S6,HSSE,743,746,Beta strand,0007829
4,A1A4S6,IFEDVQT,754,760,Beta strand,0007829
...,...,...,...,...,...,...
80,A6H8Y1,RIEIKNKFKREEKT,334,347,Helix,0007829
81,A6H8Y1,GWRIDKAFQ,349,357,Helix,0007829
82,A6H8Y1,FDFFAHLLQKVLAEEEKR,364,381,Helix,0007829
83,A6NGG8,MEVL,835,838,Helix,0007829


In [6]:
# Name the workbook after the input file ('.tsv' removed, '.xlsx' appended);
# fall back to a fixed name when no input file is set.
if file:
    excel_path = f"./{file.replace('.tsv', '')}.xlsx"
else:
    excel_path = 'result.xlsx'

# Write the flattened table without the index column, then reopen the file to
# fit the column widths to their content.
excel_df.to_excel(excel_path, index=False)
autoscale_excel(excel_path)