In [107]:
import pandas as pd
import openpyxl
from itertools import chain
import re

structure_columns = ['Helix', 'Turn', 'Beta strand']

def extract_positions(cell, col_name):
    """Pull the location strings of one structure type out of a feature cell.

    The cell is split on '; ' and only the items that begin with the marker
    for ``col_name`` ('HELIX', 'TURN' or 'STRAND', followed by a space) are
    kept; the marker itself is stripped from each of them.

    Args:
        cell: Raw cell value. Anything that is not a string (NaN, None, ...)
            is treated as "no features".
        col_name: Column the cell came from: 'Helix', 'Turn' or 'Beta strand'.

    Returns:
        list[str]: The text following the marker for every matching item,
        e.g. ['164..174', '204..211']. Empty when the cell is not a string or
        when ``col_name`` is not one of the known columns.
    """
    prefix_map = {
        'Helix': 'HELIX',
        'Turn': 'TURN',
        'Beta strand': 'STRAND'
    }

    prefix = prefix_map.get(col_name)
    # An unknown column must not fall back to an empty prefix: every item
    # starts with '' and the qualifier items would leak into the result.
    if prefix is None or not isinstance(cell, str):
        return []

    marker = f'{prefix} '
    # Slice the leading marker off instead of str.replace, which would also
    # remove any later occurrence of the marker inside the item.
    return [item[len(marker):] for item in cell.split('; ') if item.startswith(marker)]

def extract_subsequences(sequence, positions):
    """Cut one slice out of ``sequence`` for every (start, end) pair.

    Each pair is applied as ``sequence[start-1:end]``, i.e. ``start`` is
    shifted down by one and ``end`` is used as the exclusive slice bound.

    Args:
        sequence: The sliceable sequence (a string in this notebook).
        positions: Iterable of (start, end) pairs.

    Returns:
        list: One slice per pair, in the order of ``positions``.
    """
    pieces = []
    for first, last in positions:
        pieces.append(sequence[first - 1:last])
    return pieces

def extract_evidence(cell):
    """Collect every ``ECO:<digits>[|<source>:<id>]`` reference found in a cell.

    Args:
        cell: Raw cell value. Non-string values produce an empty result.

    Returns:
        pandas.DataFrame: Columns 'ECO', 'Source' and 'ID', one row per match
        in order of appearance. 'Source' and 'ID' hold the string 'NA' when
        the optional ``|source:id`` part is absent.
    """
    pattern = r"ECO:([0-9]+)(?:\|(\w+):([\w\d]+))?"
    hits = re.findall(pattern, cell) if isinstance(cell, str) else []

    table = {'ECO': [], 'Source': [], 'ID': []}
    for eco_code, origin, origin_id in hits:
        table['ECO'].append(eco_code)
        # Groups that did not participate in the match come back as ''.
        table['Source'].append(origin or 'NA')
        table['ID'].append(origin_id or 'NA')

    return pd.DataFrame(table)

def autoscale_excel(excel_path):
    """Resize the columns of a workbook's active sheet to fit their content.

    The workbook at ``excel_path`` is loaded, every column of the active sheet
    gets a width of (longest cell text + 2), and the workbook is saved back to
    the same path.

    Args:
        excel_path: Path of the .xlsx file to adjust in place.
    """
    # Excel does not accept column widths above 255 characters.
    max_excel_width = 255

    book = openpyxl.load_workbook(excel_path)
    sheet = book.active

    for column in sheet.columns:
        # Empty cells have value None; measuring str(None) would count them
        # as the four characters of 'None', so they are skipped instead.
        longest = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        letter = openpyxl.utils.get_column_letter(column[0].column)
        sheet.column_dimensions[letter].width = min(longest + 2, max_excel_width)

    book.save(excel_path)

def fetch_data(url, timeout=10):
    """GET ``url`` and return the response body as text.

    Args:
        url: Address to request.
        timeout: Timeout passed to the request (default 10).

    Returns:
        str | None: The response text when the status code is exactly 200.
        None when the request raises a ``RequestException`` (the error is
        printed, not re-raised) or when the status code is anything else.
    """
    import requests
    from requests.exceptions import RequestException

    body = None
    with requests.Session() as http:
        try:
            reply = http.get(url, timeout=timeout)
            reply.raise_for_status()
            if reply.status_code == 200:
                body = reply.text
        except RequestException as e:
            print(f"An error occurred: {e}")
    return body

def read_url(url):
    """Download tab-separated text from ``url`` and parse it into a DataFrame.

    Args:
        url: Address handed to ``fetch_data``.

    Returns:
        pandas.DataFrame | None: The parsed table, or None (after printing a
        message) when ``fetch_data`` returned nothing or an empty string.
    """
    from io import StringIO
    payload = fetch_data(url)

    # Guard clause: both None and '' count as "no data".
    if not payload:
        print("Failed to retrieve or parse the data.")
        return None

    return pd.read_csv(StringIO(payload), sep='\t')

In [110]:
# Local tab-separated input, and the UniProt REST query (reviewed entries with
# the sequence, helix, turn and strand fields) that can be fetched instead.
file = 'human_reviewed.tsv'
url = 'https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Csequence%2Cft_helix%2Cft_turn%2Cft_strand&format=tsv&query=%28reviewed%3Atrue%29'

# Alternative source: data = read_url(url)
# Rows with no value in any of the three structure columns are discarded.
data = (
    pd.read_csv(file, sep='\t')
    .dropna(subset=['Helix', 'Turn', 'Beta strand'], how='all')
)

data

Unnamed: 0,Entry,Sequence,Helix,Turn,Beta strand
8,A0AV96,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...,"HELIX 164..174; /evidence=""ECO:0007829|PDB:2DI...","TURN 189..193; /evidence=""ECO:0007829|PDB:2DIS...","STRAND 150..156; /evidence=""ECO:0007829|PDB:2D..."
11,A0AVK6,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...,"HELIX 114..116; /evidence=""ECO:0007829|PDB:4YO...",,"STRAND 135..137; /evidence=""ECO:0007829|PDB:4Y..."
12,A0AVT1,MEGSEPVAAHQGEEASCSSWGTGSTNKNLPIMSTASVEIDDALYSR...,"HELIX 41..51; /evidence=""ECO:0007829|PDB:7PVN""...","TURN 483..485; /evidence=""ECO:0007829|PDB:7PVN...","STRAND 63..67; /evidence=""ECO:0007829|PDB:7PVN..."
13,A0FGR8,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...,"HELIX 196..219; /evidence=""ECO:0007829|PDB:4P4...","TURN 283..286; /evidence=""ECO:0007829|PDB:4P42...","STRAND 235..241; /evidence=""ECO:0007829|PDB:4P..."
15,A0JLT2,MENFTALFGAQADPPPPPTALGFGPGKPPPPPPPPAGGGPGTAPPP...,"HELIX 83..86; /evidence=""ECO:0007829|PDB:7EMF""...","TURN 117..119; /evidence=""ECO:0007829|PDB:7EMF""","STRAND 79..81; /evidence=""ECO:0007829|PDB:7EMF"""
...,...,...,...,...,...
19876,Q96I45,MVNLGLSRVDDAVAAKHPGLGEYAACQSHAFMKGVFTFVTGTGMAF...,"HELIX 6..16; /evidence=""ECO:0007829|PDB:2LOR"";...",,"STRAND 3..5; /evidence=""ECO:0007829|PDB:2LOR"";..."
19954,Q9H0W7,MPTNCAAAGCATTYNKHINISFHRFPLDPKRRKEWVRLVRRKNFVP...,"HELIX 29..38; /evidence=""ECO:0007829|PDB:2D8R""",,"STRAND 7..9; /evidence=""ECO:0007829|PDB:2D8R"";..."
20009,Q9P1F3,MNVDHEVNLLVEEIHRLGSKNADGKLSVKFGVLFRDDKCANLFEAL...,"HELIX 3..17; /evidence=""ECO:0007829|PDB:2L2O"";...",,"STRAND 24..29; /evidence=""ECO:0007829|PDB:2L2O..."
20013,Q9P298,MSANRRWWVPPDDEDCVSEKLLRKTRESPLVPIGLGGCLVVAAYRI...,"HELIX 18..24; /evidence=""ECO:0007829|PDB:2LON""...",,"STRAND 11..14; /evidence=""ECO:0007829|PDB:2LON..."


In [111]:
# For every structure cell: first pull out the 'start..end' strings that
# belong to the cell's own column, then turn each one into an (int, int) pair.
positions_df = (
    data[structure_columns]
    .apply(lambda column: column.apply(lambda cell: extract_positions(cell, column.name)))
    .applymap(
        lambda tokens: [
            (int(bounds[0]), int(bounds[1]))
            for bounds in (token.split('..') for token in tokens)
        ]
    )
)
positions_df

Unnamed: 0,Helix,Turn,Beta strand
8,"[(164, 174), (204, 211)]","[(189, 193), (212, 216), (237, 239)]","[(150, 156), (178, 182), (185, 187), (196, 203..."
11,"[(114, 116), (118, 128), (142, 148), (154, 166...",[],"[(135, 137), (169, 174), (177, 179), (305, 307..."
12,"[(41, 51), (53, 59), (71, 83), (98, 103), (109...","[(483, 485), (488, 490), (550, 552), (605, 607...","[(63, 67), (86, 91), (131, 138), (144, 146), (..."
13,"[(196, 219), (221, 226), (230, 232), (334, 337...","[(283, 286), (341, 345), (405, 407), (595, 597...","[(235, 241), (248, 255), (264, 282), (287, 304..."
15,"[(83, 86), (90, 96), (112, 116), (128, 138), (...","[(117, 119)]","[(79, 81)]"
...,...,...,...
19876,"[(6, 16), (20, 39), (40, 43), (44, 53), (62, 95)]",[],"[(3, 5), (56, 60)]"
19954,"[(29, 38)]",[],"[(7, 9), (47, 49)]"
20009,"[(3, 17), (30, 40), (46, 55), (69, 71)]",[],"[(24, 29), (58, 60), (75, 78)]"
20013,"[(18, 24), (30, 49), (66, 87)]",[],"[(11, 14), (89, 92)]"


In [112]:
# One dict per row of `data`, mapping each structure column to the list of
# subsequences cut from that row's 'Sequence' at the positions stored in
# positions_df under the same index label.
subsequences_df = data.apply(
    lambda record: {
        kind: extract_subsequences(record['Sequence'], positions_df.loc[record.name, kind])
        for kind in structure_columns
    },
    axis=1,
)
subsequences_df

8        {'Helix': ['REEILEEIAKV', 'HRAAAMAR'], 'Turn':...
11       {'Helix': ['KEK', 'LGLLCHKFLAR', 'CLDEVAE', 'R...
12       {'Helix': ['DALYSRQRYVL', 'DTAMQKM', 'GLGLEIAK...
13       {'Helix': ['EWLNKTVKHMWPFICQFIEKLFRE', 'IEPAVR...
15       {'Helix': ['LITH', 'EQAYNKF', 'LPGMI', 'RSLIEK...
                               ...                        
19876    {'Helix': ['LSRVDDAVAAK', 'LGEYAACQSHAFMKGVFTF...
19954    {'Helix': ['PKRRKEWVRL'], 'Turn': [], 'Beta st...
20009    {'Helix': ['VDHEVNLLVEEIHRL', 'FGVLFRDDKCA', '...
20013    {'Helix': ['SEKLLRK', 'LVPIGLGGCLVVAAYRIYRL', ...
20024    {'Helix': ['VKESLQLQLLEMEMLFSM', 'VNA', 'TNIKR...
Length: 7159, dtype: object

In [113]:
# Run extract_evidence on every cell of the structure columns. Each element of
# the resulting frame is itself a DataFrame with 'ECO', 'Source' and 'ID'
# columns (an empty one for cells that are not strings).
evidence_df = data[structure_columns].applymap(extract_evidence)
evidence_df

Unnamed: 0,Helix,Turn,Beta strand
8,ECO Source ID 0 0007829 PDB 2DI...,ECO Source ID 0 0007829 PDB 2DI...,ECO Source ID 0 0007829 PDB 2DI...
11,ECO Source ID 0 0007829 PDB 4YO...,"Empty DataFrame Columns: [ECO, Source, ID] Ind...",ECO Source ID 0 0007829 PDB 4YO...
12,ECO Source ID 0 0007829 PDB 7...,ECO Source ID 0 0007829 PDB 7PV...,ECO Source ID 0 0007829 PDB 7...
13,ECO Source ID 0 0007829 PDB 4...,ECO Source ID 0 0007829 PDB 4P4...,ECO Source ID 0 0007829 PDB 4...
15,ECO Source ID 0 0007829 PDB 7EM...,ECO Source ID 0 0007829 PDB 7EMF,ECO Source ID 0 0007829 PDB 7EMF
...,...,...,...
19876,ECO Source ID 0 0007829 PDB 2LO...,"Empty DataFrame Columns: [ECO, Source, ID] Ind...",ECO Source ID 0 0007829 PDB 2LO...
19954,ECO Source ID 0 0007829 PDB 2D8R,"Empty DataFrame Columns: [ECO, Source, ID] Ind...",ECO Source ID 0 0007829 PDB 2D8...
20009,ECO Source ID 0 0007829 PDB 2L2...,"Empty DataFrame Columns: [ECO, Source, ID] Ind...",ECO Source ID 0 0007829 PDB 2L2...
20013,ECO Source ID 0 0007829 PDB 2LO...,"Empty DataFrame Columns: [ECO, Source, ID] Ind...",ECO Source ID 0 0007829 PDB 2LO...


In [114]:
# Flatten the per-entry tables into one record per structural element:
# entry, subsequence, start/end, structure type and, when available, the
# ECO / Source / ID evidence values.
excel_data = []

for idx, row in data.iterrows():
    entry = row['Entry']
    for structure_type in structure_columns:
        pos_list = positions_df.loc[idx, structure_type]
        subseq_list = subsequences_df[idx][structure_type]
        evidence_list = evidence_df.loc[idx, structure_type]

        if len(pos_list) != len(subseq_list):
            print(f"Warning: Mismatch in number of positions and subsequences for Entry {entry}, Structure {structure_type}")
            continue

        # Evidence rows are matched to features purely by their ordinal
        # position in the cell. That pairing is only meaningful when the cell
        # holds exactly one evidence row per feature; with any other count the
        # i-th evidence row may belong to a different feature, so nothing is
        # attached rather than attaching possibly wrong values.
        # NOTE(review): equal counts do not prove a one-to-one pairing (one
        # feature with two codes plus one with none would go unnoticed).
        evidence_records = []
        if isinstance(evidence_list, pd.DataFrame) and not evidence_list.empty:
            if len(evidence_list) == len(pos_list):
                # Converted once per cell instead of once per feature.
                evidence_records = evidence_list.to_dict('records')
            else:
                print(f"Warning: {len(evidence_list)} evidence records for {len(pos_list)} positions in Entry {entry}, Structure {structure_type}; evidence left blank")

        for i, ((start_pos, end_pos), subseq) in enumerate(zip(pos_list, subseq_list)):
            excel_row = {
                'Entry': entry,
                'Sequence': subseq,
                'Start_position': start_pos,
                'End_position': end_pos,
                'Type': structure_type,
            }

            if evidence_records:
                excel_row.update(evidence_records[i])

            excel_data.append(excel_row)

excel_df = pd.DataFrame(excel_data)
excel_df

Unnamed: 0,Entry,Sequence,Start_position,End_position,Type,ECO,Source,ID
0,A0AV96,REEILEEIAKV,164,174,Helix,0007829,PDB,2DIS
1,A0AV96,HRAAAMAR,204,211,Helix,0007829,PDB,2DIS
2,A0AV96,DKMKN,189,193,Turn,0007829,PDB,2DIS
3,A0AV96,RKLMP,212,216,Turn,0007829,PDB,2DIS
4,A0AV96,VDE,237,239,Turn,0007829,PDB,2DIS
...,...,...,...,...,...,...,...,...
184797,Q9UIY3,EVKLE,28,32,Beta strand,0007829,PDB,2DAW
184798,Q9UIY3,IEFVITLQI,55,63,Beta strand,0007829,PDB,2DAW
184799,Q9UIY3,VKIDLQVTM,68,76,Beta strand,0007829,PDB,2DAW
184800,Q9UIY3,QLFGR,86,90,Beta strand,0007829,PDB,2DAW


In [115]:
# Sanity check: list the distinct values found in each evidence column.
for column_name in ('ECO', 'Source', 'ID'):
    print(excel_df[column_name].unique())

['0007829']
['PDB']
['2DIS' '4YO2' '7PVN' ... '2L2O' '2LON' '2DAW']


In [116]:
# Name the workbook after the input file (with '.tsv' removed), or fall back
# to a fixed name when no input file name is set.
if file:
    excel_path = f"./{file.replace('.tsv', '')}.xlsx"
else:
    excel_path = 'result.xlsx'

# Write the table without the index column, then widen the columns to fit.
excel_df.to_excel(excel_path, index=False)
autoscale_excel(excel_path)