In [1]:
%matplotlib inline
# Secondary structure assignments were computed with the STRIDE web server:
#   http://webclu.bio.wzw.tum.de/cgi-bin/stride/stridecgi.py

import csv
import StringIO
import pandas as pd
import numpy as np

In [2]:
def stride_to_df(stride_fname, struct_id):
    """Load the ``ASG`` records of a STRIDE report into a DataFrame.

    Only lines starting with ``ASG`` are read; every other line is skipped.
    Each record is split on whitespace and the six fields that follow the
    ``ASG`` tag are kept, in this order: residue name, chain, PDB residue
    number, ordinal residue number, short secondary-structure code and full
    secondary-structure name.

    Args:
        stride_fname: Path of the STRIDE text report to read.
        struct_id: Label appended to the residue, ordinal-number and
            secondary-structure column names, so frames built for different
            structures do not collide when placed side by side.

    Returns:
        A pandas DataFrame indexed by ``PDBResidueNum`` (int) with columns
        ``Residue_<struct_id>``, ``Chain``, ``OrdinalResidueNum_<struct_id>``
        (int), ``ShortSecondaryStructure_<struct_id>`` and
        ``SecondaryStructure_<struct_id>``.

    Raises:
        ValueError: If either residue-number field cannot be parsed by
            ``int()``.
    """
    assignment_rows = []
    with open(stride_fname) as f:
        for line in f:
            if not line.startswith('ASG'):
                continue
            fields = line.split()
            # Select by position from the start of the record instead of
            # counting back from the end: the six wanted fields always follow
            # the tag directly, whereas a missing or extra trailing field
            # would shift an end-anchored slice.
            row = fields[1:7]
            row[2] = int(row[2])
            row[3] = int(row[3])
            assignment_rows.append(row)

    cols = ['Residue_%s' % struct_id, 'Chain',
            'PDBResidueNum', 'OrdinalResidueNum_%s' % struct_id,
            'ShortSecondaryStructure_%s' % struct_id,
            'SecondaryStructure_%s' % struct_id]
    df = pd.DataFrame(assignment_rows, columns=cols)
    # Return the re-indexed frame rather than mutating in place.
    return df.set_index(['PDBResidueNum'])
        

In [3]:
# Parse the STRIDE report of each structure into its own DataFrame.
stride_reports = [
    ('../data/4CMP_stride.txt', '4CMP'),
    ('../data/4OO8_stride.txt', '4OO8'),
    ('../data/4UN3_stride.txt', '4UN3'),
    ('../data/4TZ0_stride.txt', '4TZ0'),
]
sec_4CMP, sec_4OO8, sec_4UN3, sec_4TZ0 = [
    stride_to_df(report_path, pdb_id) for report_path, pdb_id in stride_reports
]

# Write every parsed table out as CSV, in the same order it was read.
csv_outputs = [
    (sec_4CMP, '../data/4CMP_stride.csv'),
    (sec_4OO8, '../data/4OO8_stride.csv'),
    (sec_4UN3, '../data/4UN3_stride.csv'),
    (sec_4TZ0, '../data/4TZ0_stride.csv'),
]
for parsed_frame, csv_path in csv_outputs:
    parsed_frame.to_csv(csv_path)

In [4]:
# Keep one chain per structure (chain B for 4UN3, chain A for the others) and
# drop the Chain column, which is constant after the filter.
sec_4CMP_singlechain = sec_4CMP.loc[sec_4CMP['Chain'] == 'A'].drop('Chain', axis=1)

sec_4OO8_singlechain = sec_4OO8.loc[sec_4OO8['Chain'] == 'A'].drop('Chain', axis=1)

sec_4UN3_singlechain = sec_4UN3.loc[sec_4UN3['Chain'] == 'B'].drop('Chain', axis=1)

sec_4TZ0_singlechain = sec_4TZ0.loc[sec_4TZ0['Chain'] == 'A'].drop('Chain', axis=1)

In [6]:
# Place the four single-chain tables side by side, aligned on PDBResidueNum;
# residue numbers absent from a structure show up as NaN in its columns.
singlechain_frames = [
    sec_4CMP_singlechain,
    sec_4OO8_singlechain,
    sec_4UN3_singlechain,
    sec_4TZ0_singlechain,
]
merged_df = pd.concat(singlechain_frames, axis=1)
merged_df

Unnamed: 0_level_0,Residue_4CMP,OrdinalResidueNum_4CMP,ShortSecondaryStructure_4CMP,SecondaryStructure_4CMP,Residue_4OO8,OrdinalResidueNum_4OO8,ShortSecondaryStructure_4OO8,SecondaryStructure_4OO8,Residue_4UN3,OrdinalResidueNum_4UN3,ShortSecondaryStructure_4UN3,SecondaryStructure_4UN3,Residue_4TZ0,OrdinalResidueNum_4TZ0,ShortSecondaryStructure_4TZ0,SecondaryStructure_4TZ0
PDBResidueNum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
3,,,,,LYS,1,C,Coil,,,,,LYS,1,C,Coil
4,LYS,1,C,Coil,LYS,2,C,Coil,LYS,1,C,Coil,LYS,2,C,Coil
5,TYR,2,C,Coil,TYR,3,C,Coil,TYR,2,C,Coil,TYR,3,C,Coil
6,SER,3,E,Strand,SER,4,E,Strand,SER,3,E,Strand,SER,4,E,Strand
7,ILE,4,E,Strand,ILE,5,E,Strand,ILE,4,E,Strand,ILE,5,E,Strand
8,GLY,5,E,Strand,GLY,6,E,Strand,GLY,5,E,Strand,GLY,6,E,Strand
9,LEU,6,E,Strand,LEU,7,E,Strand,LEU,6,E,Strand,LEU,7,E,Strand
10,ASP,7,E,Strand,ALA,8,E,Strand,ASP,7,E,Strand,ASP,8,E,Strand
11,ILE,8,E,Strand,ILE,9,E,Strand,ILE,8,E,Strand,ILE,9,E,Strand
12,GLY,9,T,Turn,GLY,10,T,Turn,GLY,9,T,Turn,GLY,10,T,Turn


In [13]:
# Map each structure's full secondary-structure column to the name of the
# boolean "locally consistent" column derived from it.
ss_names = {'SecondaryStructure_4UN3': 'LocallyConsistent_4UN3',
            'SecondaryStructure_4CMP': 'LocallyConsistent_4CMP',
            'SecondaryStructure_4OO8': 'LocallyConsistent_4OO8',
            'SecondaryStructure_4TZ0': 'LocallyConsistent_4TZ0',}

# One list of flags per output column, appended in merged_df row order.
cols_d = {}

for aa, row in merged_df.iterrows():
    # Rows for the residue numbers directly before and after this one; each is
    # an empty frame when that residue number is not present in the index.
    prev_data = merged_df[merged_df.index == aa - 1]
    next_data = merged_df[merged_df.index == aa + 1]

    if not prev_data.size or not next_data.size:
        # Without both neighbours, every structure is flagged False here.
        # .values() (not the Python-2-only .itervalues()) runs on 2 and 3.
        for out_name in ss_names.values():
            cols_d.setdefault(out_name, []).append(False)
        continue

    # .items() (not the Python-2-only .iteritems()) runs on 2 and 3.
    for ss_name, out_name in ss_names.items():
        # True only when the previous, current and next assignments all agree.
        # A missing (NaN) assignment never compares equal, so it gives False.
        eq = prev_data[ss_name].values[0] == row[ss_name]
        eq = eq and next_data[ss_name].values[0] == row[ss_name]
        cols_d.setdefault(out_name, []).append(eq)

# Attach the computed flags to the merged table as new columns.
for col_name, values in cols_d.items():
    merged_df[col_name] = np.array(values)

merged_df.to_csv('../data/Cas9_merged_stride.csv')
merged_df

Unnamed: 0_level_0,Residue_4CMP,OrdinalResidueNum_4CMP,ShortSecondaryStructure_4CMP,SecondaryStructure_4CMP,Residue_4OO8,OrdinalResidueNum_4OO8,ShortSecondaryStructure_4OO8,SecondaryStructure_4OO8,Residue_4UN3,OrdinalResidueNum_4UN3,ShortSecondaryStructure_4UN3,SecondaryStructure_4UN3,Residue_4TZ0,OrdinalResidueNum_4TZ0,ShortSecondaryStructure_4TZ0,SecondaryStructure_4TZ0,LocallyConsistent_4UN3,LocallyConsistent_4OO8,LocallyConsistent_4CMP,LocallyConsistent_4TZ0
PDBResidueNum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3,,,,,LYS,1,C,Coil,,,,,LYS,1,C,Coil,False,False,False,False
4,LYS,1,C,Coil,LYS,2,C,Coil,LYS,1,C,Coil,LYS,2,C,Coil,False,True,False,True
5,TYR,2,C,Coil,TYR,3,C,Coil,TYR,2,C,Coil,TYR,3,C,Coil,False,False,False,False
6,SER,3,E,Strand,SER,4,E,Strand,SER,3,E,Strand,SER,4,E,Strand,False,False,False,False
7,ILE,4,E,Strand,ILE,5,E,Strand,ILE,4,E,Strand,ILE,5,E,Strand,True,True,True,True
8,GLY,5,E,Strand,GLY,6,E,Strand,GLY,5,E,Strand,GLY,6,E,Strand,True,True,True,True
9,LEU,6,E,Strand,LEU,7,E,Strand,LEU,6,E,Strand,LEU,7,E,Strand,True,True,True,True
10,ASP,7,E,Strand,ALA,8,E,Strand,ASP,7,E,Strand,ASP,8,E,Strand,True,True,True,True
11,ILE,8,E,Strand,ILE,9,E,Strand,ILE,8,E,Strand,ILE,9,E,Strand,False,False,False,False
12,GLY,9,T,Turn,GLY,10,T,Turn,GLY,9,T,Turn,GLY,10,T,Turn,False,False,False,False
