# Calculate chemical shifts (CS)

In [1]:
import MDAnalysis as mda
import sys
import shutil
sys.path.append("..")
import data_process
from fetch_data import gsheet_to_csv
from src.features.build_features import get_sparta_plus_chemical_shifts
from spc_imports import *
# Plot configuration helper; not defined in this notebook — presumably
# provided by the star import from spc_imports (TODO confirm).
set_up_plt()
# IPython magic: executes an external script inside this kernel.
# NOTE(review): absolute, user-specific path — the notebook will not run
# as-is on another machine.
%run /home/sperez/bin/PYTHON_SCRIPTS/barker.py

  plt.rcParams["text.latex.preamble"] = [


In [2]:
# Data folders used throughout the notebook, relative to the working
# directory. Every value keeps its trailing slash because later cells build
# file paths by plain string concatenation.
_data_root = '../data/'
raw_data_dir, interim_data_dir, processed_data_dir, external_data_dir = (
    _data_root + subfolder
    for subfolder in ('raw/', 'interim/', 'processed/', 'external/')
)

## Get the Data

### Get experimental data from GSheets.

In [3]:
# Export the open-state assignments to a CSV in the external data folder.
# Arguments are presumably (spreadsheet, worksheet, output path) — confirm
# against fetch_data.gsheet_to_csv.
# NOTE(review): this worksheet is 'open_clean' (underscore) while the closed
# one is 'closed clean' (space) — verify both names match the spreadsheet.
gsheet_to_csv(
    'KcsA_assignments',
    'open_clean',
    f'{external_data_dir}open_CS.csv',
)

In [4]:
# Export the closed-state assignments to a CSV in the external data folder.
# Arguments are presumably (spreadsheet, worksheet, output path) — confirm
# against fetch_data.gsheet_to_csv.
closed_csv_path = f'{external_data_dir}closed_CS.csv'
gsheet_to_csv('KcsA_assignments', 'closed clean', closed_csv_path)

### Reshape data 

In [5]:
# Load the closed-state table from the CSV in the external data folder.
closed_data = pd.read_csv(f'{external_data_dir}closed_CS.csv')

In [6]:
# Load the open-state table from the CSV in the external data folder.
open_data = pd.read_csv(f'{external_data_dir}open_CS.csv')

In [7]:
# Reshape each experimental chemical-shift table into a single-row DataFrame
# whose columns are a (resid, nuclei) MultiIndex, then pickle it to the
# processed data folder.
nuclei = ['N', 'C', 'CA', 'CB']

# Target column layout shared by both states: residues 26..120 inclusive, each
# with every nucleus. Built once because it does not depend on the state.
full_index = pd.MultiIndex.from_product(
    [list(range(26, 121)), nuclei], names=['resid', 'nuclei'])

for name in ['open_CS', 'closed_CS']:
    data = pd.read_csv(external_data_dir + name + '.csv')
    # Drop the first character of each 'Residue' label and keep the number
    # (presumably a one-letter amino-acid prefix, e.g. 'A26' — confirm
    # against the sheet).
    resids = [int(resid[1:]) for resid in data['Residue']]
    measured_index = pd.MultiIndex.from_product(
        [resids, nuclei], names=['resid', 'nuclei'])
    # Row-major flattening of the (n_residues, n_nuclei) block yields
    # residue-major / nucleus-minor order, which is exactly the order
    # from_product generates. Using -1 keeps the reshape tied to the nuclei
    # list instead of a hard-coded column count.
    flat_shifts = data[nuclei].to_numpy().reshape(1, -1)
    data = pd.DataFrame(flat_shifts, columns=measured_index)
    # Align to the common layout; (resid, nucleus) pairs absent from the
    # sheet become NaN columns.
    data = data.reindex(full_index, axis=1)
    data.to_pickle(processed_data_dir + name + '.pkl')

## Make diff data

In [8]:
# Reload the reshaped closed-state shifts (one row, (resid, nuclei) columns).
closed_data = pd.read_pickle(f'{processed_data_dir}closed_CS.pkl')

In [9]:
# Reload the reshaped open-state shifts (one row, (resid, nuclei) columns).
open_data = pd.read_pickle(f'{processed_data_dir}open_CS.pkl')

In [10]:
# Open-minus-closed difference per column; a column is NaN wherever either
# state has no value for it.
diff_data = open_data - closed_data

In [11]:
# Persist the open-minus-closed differences next to the per-state pickles.
diff_data.to_pickle(f'{processed_data_dir}diff_CS.pkl')

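Trick for selecting from the MultiIndex data: keep only the (resid, nuclei) columns that have a value in both the open and the closed state.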

In [12]:
# Columns of diff_data without any NaN, i.e. those with a value in both states.
shared_columns = diff_data.dropna(axis=1).columns
open_data.loc[:, shared_columns]

resid,33,33,33,38,38,38,42,42,42,43,...,102,103,107,108,111,111,112,112,112,113
nuclei,N,CA,CB,N,CA,CB,C,CA,CB,N,...,CB,N,CA,N,CA,CB,N,CA,CB,N
0,119.827,67.129,67.896,122.486,65.871,37.39,179.549,54.072,18.849,114.07,...,62.268,125.1621,68.0238,126.492,55.19,16.2839,117.91,67.274,68.108,129.219


### Process trajectories

In [13]:
# Simulation states to process, keyed by the name of their folder under the
# raw data directory. 'begin'/'end' are forwarded as the begin/end arguments
# of data_process.process_trajectory when the trajectory is processed.
# Currently disabled states (kept for reference):
#   '3FB5_lb': begin 400000., end 1000000.
#   '5VKE_lb': begin 0, end 1000000.
states = {
    '5VK6_lb': dict(begin=0., end=350000.),
    '5VKH_lb': dict(begin=0, end=1.e+20),
}

In [4]:
# For every state: run process_trajectory on the raw trajectory and on the
# reference structure (protein group only), then move each output file from
# the state's raw folder into its interim folder.
for key, window in states.items():
    print(key)
    raw_dir = f'{raw_data_dir}{key}'
    dest_dir = f'{interim_data_dir}{key}'
    os.makedirs(dest_dir, exist_ok=True)

    # (input file, output extension, extra keyword arguments). Only the
    # trajectory is restricted to the state's begin/end window; the structure
    # file is processed with the function's defaults.
    jobs = [
        ('traj_comp.xtc', 'xtc',
         {'begin': window['begin'], 'end': window['end']}),
        ('charmm_final.pdb', 'pdb', {}),
    ]
    for input_file, ext, extra_kwargs in jobs:
        data_process.process_trajectory(input_file, raw_dir,
                                        output_group='protein',
                                        **extra_kwargs)
        output_name = f'protein_sk1_pbc.{ext}'
        stale_copy = f'{dest_dir}/{output_name}'
        # shutil.move into a directory raises if a file with the same name
        # already exists there, so remove any copy left by a previous run.
        if os.path.isfile(stale_copy):
            os.remove(stale_copy)
        shutil.move(f'{raw_dir}/{output_name}', dest_dir)

5VKE_lb


### Get Chemical Shifts

In [None]:
# Predict chemical shifts for every state from its processed structure and
# trajectory, and pickle the result in the state's interim folder.
for key in states:
    state_dir = f'{interim_data_dir}{key}'
    univ = mda.Universe(f'{state_dir}/protein_sk1_pbc.pdb',
                        f'{state_dir}/protein_sk1_pbc.xtc')
    # The second argument used to be the literal '../data/interim/'; passing
    # the constant keeps it in sync with the other paths in this cell.
    # Presumably a working directory for the predictor — confirm against
    # src.features.build_features.
    df = get_sparta_plus_chemical_shifts(univ, interim_data_dir)
    df.to_pickle(f'{state_dir}/CS_sparta_plus_{key}.pkl')

t(A):   3%|▎         | 6/176 [02:47<1:18:09, 27.59s/it]

### Reformat interim data

In [None]:
# Bring the predicted shifts into the same layout as the experimental data:
# (resid, nuclei) MultiIndex on the columns, one row per column of the
# original frame.
for key in states:
    pickle_name = f'{key}/CS_sparta_plus_{key}.pkl'
    df = pd.read_pickle(interim_data_dir + pickle_name)

    # Full product of the two index levels, so every (resid, nuclei) pair gets
    # a column even if it is absent from df (filled with NaN by reindex).
    full_index = pd.MultiIndex.from_product(
        [df.index.levels[0], df.index.levels[1]],
        names=['resid', 'nuclei'],
    )

    # Transpose the values so df's row index becomes the column index.
    data = pd.DataFrame(df.to_numpy().T, columns=df.index)
    # Keep only column labels matching the regex (any label containing C or N).
    # NOTE(review): a label such as 'HN' would also match — confirm that is
    # intended.
    data = data.reindex(full_index, axis=1).filter(regex="(C|N)")

    os.makedirs(f'{processed_data_dir}{key}', exist_ok=True)
    data.to_pickle(processed_data_dir + pickle_name)