In [1]:
import pandas as pd
import json

In [2]:
# Load the raw table, decode the JSON-encoded `reads` strings into Python
# objects, and make sure the transcript position is an integer column.
df = pd.read_csv('../data/data.csv').assign(
    reads=lambda raw: raw['reads'].map(json.loads),
    transcript_position=lambda raw: raw['transcript_position'].astype(int),
)

In [3]:
# Spread each row's `reads` list across integer-labelled columns (0, 1, 2, ...)
# and attach those columns to the right of the original table.
df1 = pd.DataFrame(df['reads'].tolist())
df2 = pd.concat([df, df1], axis='columns')
df2.head()

Unnamed: 0,transcript_id,transcript_position,fivemers,reads,0,1,2,3,4,5,6,7,8
0,ENST00000000233,244,AAGACCA,"[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0....",0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,10.9,84.1
1,ENST00000000233,244,AAGACCA,"[0.0063100000000000005, 2.5300000000000002, 12...",0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,6.3,80.9
2,ENST00000000233,244,AAGACCA,"[0.0046500000000000005, 3.92, 109.0, 0.0136000...",0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,2.13,79.6
3,ENST00000000233,244,AAGACCA,"[0.00398, 2.06, 125.0, 0.0083, 5.01, 130.0, 0....",0.00398,2.06,125.0,0.0083,5.01,130.0,0.00498,3.78,80.4
4,ENST00000000233,244,AAGACCA,"[0.006640000000000001, 2.92, 120.0, 0.00266, 3...",0.00664,2.92,120.0,0.00266,3.94,129.0,0.013,7.15,82.2


In [6]:
def rearrange(df):
    """Reshape the wide per-read table into one row per read and segment.

    Each input row holds three consecutive segments. Every segment has three
    features: dwelling time of the 5-mer nucleotides, standard deviation of
    the direct RNA current, and mean of the direct RNA current. The features
    are read from the integer-labelled columns 0-2 (first segment), 3-5
    (second segment) and 6-8 (third segment).

    For each segment the position and k-mer are adjusted accordingly:

    * first segment:  ``transcript_position - 1``, characters ``[0:5]``
    * second segment: ``transcript_position``,     characters ``[1:6]``
    * third segment:  ``transcript_position + 1``, characters ``[2:]``

    input: df (i.e. df2 as built earlier in the notebook) with columns
        ``transcript_id``, ``transcript_position``, ``fivemers`` and 0..8.
        ``fivemers`` is a 7-character string in the data shown.
    output: df with columns ``transcript_id``, ``transcript_position``,
        ``fivemers``, ``dwelling_time``, ``sd_current``, ``mean_current``.
        All first-segment rows come first, then all second-segment rows, then
        all third-segment rows. The input's index labels are kept, so each
        label appears three times in the result. The input frame is not
        modified.
    """
    key_cols = ['transcript_id', 'transcript_position', 'fivemers']
    feature_names = ['dwelling_time', 'sd_current', 'mean_current']
    # (first feature column, position offset, k-mer window start, k-mer window stop)
    segment_layout = [
        (0, -1, 0, 5),     # first segment
        (3, 0, 1, 6),      # second segment
        (6, 1, 2, None),   # third segment
    ]

    segments = []
    for first_col, offset, start, stop in segment_layout:
        raw_cols = [first_col, first_col + 1, first_col + 2]
        # Selecting then renaming yields a fresh frame, so the column
        # assignments below never touch the caller's data.
        segment = df[key_cols + raw_cols].rename(
            columns=dict(zip(raw_cols, feature_names))
        )
        # Vectorised arithmetic / string slicing instead of per-row lambdas.
        segment['transcript_position'] = segment['transcript_position'] + offset
        segment['fivemers'] = segment['fivemers'].str.slice(start, stop)
        segments.append(segment)

    return pd.concat(segments)

In [7]:
# Reshape the wide per-read table (df2) into one row per read and segment.
final_df = rearrange(df2)
final_df

Unnamed: 0,transcript_id,transcript_position,fivemers,dwelling_time,sd_current,mean_current
0,ENST00000000233,243,AAGAC,0.00299,2.06,125.0
1,ENST00000000233,243,AAGAC,0.00631,2.53,125.0
2,ENST00000000233,243,AAGAC,0.00465,3.92,109.0
3,ENST00000000233,243,AAGAC,0.00398,2.06,125.0
4,ENST00000000233,243,AAGAC,0.00664,2.92,120.0
...,...,...,...,...,...,...
11027101,ENST00000641834,1694,GACAT,0.01000,2.01,76.4
11027102,ENST00000641834,1694,GACAT,0.01760,2.61,74.6
11027103,ENST00000641834,1694,GACAT,0.00316,2.28,85.3
11027104,ENST00000641834,1694,GACAT,0.00664,4.44,76.8


In [None]:
# aggregate and find min, max, mean, median, std, skew for each feature
# Statistics are named by string: newer pandas releases deprecate passing the
# builtins `min`/`max` to `agg` and ask for the string names instead.
summary_stats = ['min', 'max', 'mean', 'median', 'std', 'skew']
feature_cols = ['dwelling_time', 'sd_current', 'mean_current']
grouped_df = final_df.groupby(by = ['transcript_id', 'transcript_position', 'fivemers']).agg(
    {feature: summary_stats for feature in feature_cols}
)

# rename the columns
# Flatten the (feature, statistic) column pairs to 'feature_statistic' while
# the grouping keys are still in the index, then bring the keys back as
# ordinary columns. This way the key columns keep their original names.
grouped_df.columns = ["_".join(pair) for pair in grouped_df.columns]
grouped_df = grouped_df.reset_index()

In [None]:
# Display the aggregated summary table (one row per transcript_id / transcript_position / fivemers group).
grouped_df

In [None]:
# split the fivemers into five columns, one per character position
# Each character is taken by explicit position. `order_df` is built from
# `grouped_df`'s own column, so it shares `grouped_df`'s index and the
# column-wise concat below lines up row for row.
order_cols = ['order_1', 'order_2', 'order_3', 'order_4', 'order_5']
order_df = pd.DataFrame(
    {col: grouped_df['fivemers'].str[pos] for pos, col in enumerate(order_cols)}
)
grouped_df2 = pd.concat([grouped_df, order_df], axis = 1)
grouped_df2

In [None]:
def find(word, letter):
    """Count how many items of `word` compare equal to `letter`.

    `word` is walked one item at a time (one character at a time for a
    string), and every item equal to `letter` adds one to the total.

    input: str, str
    output: int
    """
    return sum(1 for char in word if char == letter)

In [None]:
# count the A,C,G,T in the fivemers: one new column per nucleotide,
# added in the order A, C, G, T
base_count_cols = {'count_A': 'A', 'count_C': 'C', 'count_G': 'G', 'count_T': 'T'}
for col_name, base in base_count_cols.items():
    grouped_df2[col_name] = grouped_df2['fivemers'].map(lambda kmer: find(kmer, base))
grouped_df2

# Export to CSV

In [None]:
# Write the full feature table to disk; index=False leaves the row index out of the file.
grouped_df2.to_csv('../data/grouped_data.csv', index=False)

In [None]:
# Keep the first quarter of the rows (in the table's current order, rounded
# down; this is not a random sample) and save it as a smaller file.
small_grouped_df = grouped_df2.iloc[: len(grouped_df2) // 4]
small_grouped_df.to_csv('../data/small_grouped_data.csv', index=False)