In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Lookup table: one-letter amino acid code -> three-letter residue name
# for the 20 standard amino acids.
amino_d = {
    'C': 'CYS',
    'D': 'ASP',
    'S': 'SER',
    'Q': 'GLN',
    'K': 'LYS',
    'I': 'ILE',
    'P': 'PRO',
    'T': 'THR',
    'F': 'PHE',
    'N': 'ASN',
    'G': 'GLY',
    'H': 'HIS',
    'L': 'LEU',
    'R': 'ARG',
    'W': 'TRP',
    'A': 'ALA',
    'V': 'VAL',
    'E': 'GLU',
    'Y': 'TYR',
    'M': 'MET',
}

In [3]:
# Location of the input table on disk.
INPUT_FILE_PATH = '/Users/evabertalan/Documents/cgrap_test2/SarsCov2_variants_v8_all_coral_new.text'

# The extension (including the leading dot) selects the loader;
# it is expected to be '.txt', '.text' or '.csv'.
file_extension = os.path.splitext(INPUT_FILE_PATH)[1]
# Everything before the extension, i.e. the path without its suffix.
filename = INPUT_FILE_PATH[:len(INPUT_FILE_PATH) - len(file_extension)]
print(file_extension)

.text


In [4]:
# Load the input table into a 2-D array of strings so that the positional
# slicing used further down (input_data[SKIP_LINES:, col]) works for every
# supported file format.
if file_extension in ('.txt', '.text'):
    # Whitespace-separated text; the header line is kept as row 0.
    input_data = np.loadtxt(INPUT_FILE_PATH, dtype=str)
elif file_extension == '.csv':
    # header=None keeps the header line as row 0, matching the text branch,
    # so SKIP_LINES has the same meaning for both formats. to_numpy() turns
    # the DataFrame into an array that supports [rows, column] indexing.
    input_data = pd.read_csv(INPUT_FILE_PATH, dtype=str, header=None).to_numpy()
else:
    # Fail here instead of leaving input_data undefined for later cells.
    raise ValueError(
        f'Input file should be .txt, .text or .csv, got {file_extension!r}'
    )

In [5]:
# Print the loaded table to check its header row and column order against
# the column-index settings configured for the conversion.
print(input_data)

[['chain_ID' 'residue_name' 'residue_number' 'value' 'ref_seq']
 ['6VXX_A' 'A' '27' '100' 'A']
 ['6VXX_A' 'Y' '28' '100' 'Y']
 ...
 ['6VXX_A' 'L' '1145' '100' 'L']
 ['6VXX_A' 'D' '1146' '100' 'D']
 ['6VXX_A' 'S' '1147' '100' 'S']]


In [29]:
# Number of lines to skip from the beginning of the input file; e.g: SKIP_LINES = 1
SKIP_LINES = 1
# True if the amino acid residue name is given by 1 letter in the input file; e.g: ONE_LETTER_INPUT = True
ONE_LETTER_INPUT = True
# If the chain or seg_id is in a different form than in the pdb, options to rewrite it:
# - to any letter of the seg_id, index starts with 0; e.g: REWRITE_SEG_ID = -1
# - to a fixed string, e.g: REWRITE_SEG_ID = "A"
# - to keep the original, use: REWRITE_SEG_ID = False
REWRITE_SEG_ID = -1

# Define the column mapping to the input file, numbering starts with 0
# NOTE(review): in the printed header, index 4 is the 'ref_seq' column and
# 'residue_name' is index 1 — confirm that 'ref_seq' is the intended source.
RES_NAME_col_index = 4
RES_ID_col_index = 2
SEG_ID_col_index = 0
VALUE_col_index = 3

In [30]:
# Build the segment/chain id column for the output according to REWRITE_SEG_ID:
#   False / None / ''  -> keep the ids exactly as they appear in the input
#   str                -> replace every id with that fixed string
#   int                -> take the character at that index from every id
# Any other type raises TypeError so seg_id is never silently left undefined.
if REWRITE_SEG_ID is False or REWRITE_SEG_ID is None or REWRITE_SEG_ID == '':
    seg_id = input_data[SKIP_LINES:, SEG_ID_col_index]
elif isinstance(REWRITE_SEG_ID, str):
    seg_id = [REWRITE_SEG_ID for seg in input_data[SKIP_LINES:, SEG_ID_col_index]]
elif isinstance(REWRITE_SEG_ID, int) and not isinstance(REWRITE_SEG_ID, bool):
    # Index 0 is a valid choice, so it must not be handled by a truthiness test.
    seg_id = [seg[REWRITE_SEG_ID] for seg in input_data[SKIP_LINES:, SEG_ID_col_index]]
else:
    raise TypeError(
        'Unsupported type for REWRITE_SEG_ID: '
        f'{type(REWRITE_SEG_ID).__name__}; use an int index, a str or False'
    )

In [32]:
# Residue names for the output: translate one-letter codes through amino_d
# when the input uses them, otherwise take the input column unchanged.
if ONE_LETTER_INPUT:
    res_name = [amino_d[letter] for letter in input_data[SKIP_LINES:, RES_NAME_col_index]]
else:
    res_name = input_data[SKIP_LINES:, RES_NAME_col_index]

In [33]:
# Stack the four output fields as rows; once transposed, the column order
# is: ['res_name', 'res_id', 'seg_name', 'value'].
output_data = np.array([
    res_name,
    input_data[SKIP_LINES:, RES_ID_col_index],
    seg_id,
    input_data[SKIP_LINES:, VALUE_col_index],
])

In [36]:
# PDB identifier used to name the output file.
PDB_ID = '6vxx'
OUTPUT_PATH = f'/Users/evabertalan/Documents/cgrap_test2/{PDB_ID}_data.txt'
# Transpose so that each written line holds the fields of one residue.
np.savetxt(OUTPUT_PATH, np.transpose(output_data), fmt='%s')