In [1]:
import pandas as pd
import numpy as np

In [5]:
# Load the secondary-structure table.
# The first 9 lines of the file are skipped, the remaining lines are split on
# single spaces, and the five columns are labelled explicitly (header=None
# means no line of the file is treated as a header).
data = pd.read_csv(
    "secondarystructure.txt",
    names=["a", "b", "c", "Frame", "value"],
    header=None,
    skiprows=9,
    sep=" ",
)
data

Unnamed: 0,a,b,c,Frame,value
0,1,H,HA,0,T
1,2,H,HA,0,T
2,3,H,HA,0,T
3,4,H,HA,0,T
4,5,H,HA,0,T
...,...,...,...,...,...
2620095,390,H,HB,3324,E
2620096,391,H,HB,3324,E
2620097,392,H,HB,3324,E
2620098,393,H,HB,3324,E


In [6]:
# Header of the summary table: "Frame" first, followed by every distinct
# entry of the "value" column in order of first appearance.
colnames = ["Frame", *data["value"].unique()]
colnames

['Frame', 'T', 'C', 'E', 'B', 'G', 'H']

In [7]:
# The summary holds one row per distinct frame number
n_rows = data["Frame"].unique().size

n_rows

3325

In [8]:
# Create the initial summary dataframe: one row per frame, one column per
# entry of colnames, every cell starting at zero.
# "Frame" stays integer, but the remaining columns are allocated as float
# because they later receive fractional percentages. Writing a float into an
# integer column relies on silent upcasting, which pandas has deprecated.
summary = pd.DataFrame(
    {
        name: np.zeros(n_rows, dtype=int if name == "Frame" else float)
        for name in colnames
    }
)

summary

Unnamed: 0,Frame,T,C,E,B,G,H
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
3320,0,0,0,0,0,0,0
3321,0,0,0,0,0,0,0
3322,0,0,0,0,0,0,0
3323,0,0,0,0,0,0,0


In [9]:
# Fill the "Frame" column with the distinct frame numbers, in the order in
# which they first occur in the raw data
summary["Frame"] = data["Frame"].unique()

summary

Unnamed: 0,Frame,T,C,E,B,G,H
0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,2,0,0,0,0,0,0
3,3,0,0,0,0,0,0
4,4,0,0,0,0,0,0
...,...,...,...,...,...,...,...
3320,3320,0,0,0,0,0,0
3321,3321,0,0,0,0,0,0
3322,3322,0,0,0,0,0,0
3323,3323,0,0,0,0,0,0


In [10]:
# Per-frame percentage of rows falling into each "value" class.
#
# A single cross-tabulation replaces the nested per-frame / per-class loops:
#   * rows are matched by frame *value*, so the result is correct even when
#     frame numbers are not exactly 0..n-1 (the loop used the frame number as
#     a row label of the summary frame);
#   * the summary is rebuilt from `data` rather than incremented with `+=`,
#     so re-running this cell cannot double the percentages;
#   * no float is written into an integer column;
#   * the data is scanned once instead of building one boolean mask over the
#     whole table for every frame.
frame_order = data["Frame"].unique()
class_order = data["value"].unique()

percentages = (
    # normalize="index" divides every count by its row (frame) total
    pd.crosstab(data["Frame"], data["value"], normalize="index")
    .mul(100)
    # crosstab sorts both axes; restore first-appearance order so the
    # columns line up with ["Frame", *data.value.unique()]
    .reindex(index=frame_order, columns=class_order, fill_value=0.0)
)

# Turn the frame index back into an ordinary "Frame" column with a 0..n-1 index
summary = percentages.rename_axis(index="Frame", columns=None).reset_index()
summary

Unnamed: 0,Frame,T,C,E,B,G,H
0,0,31.218274,18.908629,44.289340,1.269036,1.903553,2.411168
1,1,29.822335,17.512690,46.827411,1.395939,2.030457,2.411168
2,2,28.172589,17.893401,46.954315,1.522843,2.918782,2.538071
3,3,32.106599,17.893401,44.670051,1.395939,1.395939,2.538071
4,4,27.791878,19.162437,44.796954,2.284264,3.680203,2.284264
...,...,...,...,...,...,...,...
3320,3320,30.329949,17.639594,45.558376,1.522843,2.411168,2.538071
3321,3321,29.187817,19.923858,44.796954,1.395939,2.411168,2.284264
3322,3322,29.949239,18.020305,45.939086,1.142132,3.299492,1.649746
3323,3323,28.807107,15.228426,47.842640,1.395939,2.918782,3.807107


In [12]:
# Write the complete summary (header row included, index omitted) as
# tab-separated text

summary.to_csv(path_or_buf="summary.txt", index=False, sep="\t")

In [19]:
# Write one two-column file (frame, percentage) per structure class:
# tab-separated, no header, no index, named "<class> summary.txt".
#
# The loop runs over the columns actually present in `summary` instead of a
# hard-coded range(1, 7), so it neither skips extra classes nor fails when
# fewer than six are present. The slice is bound to its own name so that the
# raw `data` frame loaded at the top of the notebook is not overwritten.
frame_col, *class_cols = summary.columns
for st_type in class_cols:
    per_class = summary[[frame_col, st_type]]
    per_class.to_csv(f"{st_type} summary.txt", sep="\t", index=False, header=False)