In [1]:
import pandas as pd
import json

In [3]:
# Load the raw table, decode the JSON-encoded `reads` strings and cast `position` to int.
df = pd.read_csv('../data/data.csv')
df = df.assign(
    reads=df['reads'].map(json.loads),
    position=df['position'].astype(int),
)

In [4]:
# Expand every parsed `reads` list into integer-labelled columns and
# append those columns to the right of the original frame.
df1 = pd.DataFrame(df['reads'].tolist())
df2 = pd.concat((df, df1), axis='columns')
df2.head()

Unnamed: 0,gene_id,position,fivemers,reads,first,second,third,0,1,2,3,4,5,6,7,8
0,ENST00000000233,244,AAGACCA,"[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0....","[0.00299, 2.06, 125.0]","[0.0177, 10.4, 122.0]","[0.009300000000000001, 10.9, 84.1]",0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,10.9,84.1
1,ENST00000000233,244,AAGACCA,"[0.0063100000000000005, 2.5300000000000002, 12...","[0.0063100000000000005, 2.5300000000000002, 12...","[0.008440000000000001, 4.67, 126.0]","[0.0103, 6.3, 80.9]",0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,6.3,80.9
2,ENST00000000233,244,AAGACCA,"[0.0046500000000000005, 3.92, 109.0, 0.0136000...","[0.0046500000000000005, 3.92, 109.0]","[0.013600000000000001, 12.0, 124.0]","[0.00498, 2.13, 79.6]",0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,2.13,79.6
3,ENST00000000233,244,AAGACCA,"[0.00398, 2.06, 125.0, 0.0083, 5.01, 130.0, 0....","[0.00398, 2.06, 125.0]","[0.0083, 5.01, 130.0]","[0.00498, 3.7800000000000002, 80.4]",0.00398,2.06,125.0,0.0083,5.01,130.0,0.00498,3.78,80.4
4,ENST00000000233,244,AAGACCA,"[0.006640000000000001, 2.92, 120.0, 0.00266, 3...","[0.006640000000000001, 2.92, 120.0]","[0.00266, 3.94, 129.0]","[0.013000000000000001, 7.15, 82.2]",0.00664,2.92,120.0,0.00266,3.94,129.0,0.013,7.15,82.2


In [5]:
# Each segment has 3 features - dwelling time of the 5-mer nucleotides, standard deviation of the direct RNA current, and mean of the direct RNA current. 
# Separate the dataframe into 1 segment per row and change the position & 5-mers accordingly 
# input: df (i.e. df2 as shown above)
# output: df
def rearrange(df):
    """Reshape a wide reads frame into one row per segment.

    Every input row holds three consecutive segments: integer-labelled
    columns 0-2, 3-5 and 6-8 carry (dwelling_time, sd_current, mean_current)
    for the first, second and third segment respectively. The ``fivemers``
    string (expected to be 7 characters long) spans all three segments and
    ``position`` refers to the middle one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gene_id``, ``position``, ``fivemers`` and
        the integer-labelled columns 0 through 8.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene_id``, ``position``, ``fivemers``, ``dwelling_time``,
        ``sd_current``, ``mean_current``. All first-segment rows come first,
        then the second-segment rows, then the third-segment rows. The
        original index labels are kept, so each label appears three times.
        The input frame is not modified.
    """
    key_cols = ['gene_id', 'position', 'fivemers']
    feature_names = ['dwelling_time', 'sd_current', 'mean_current']

    segments = []
    # Segment k (0, 1, 2) sits at position offset k - 1 and covers the
    # 5-character window starting at character k of the 7-mer.
    for seg_idx in range(3):
        first_col = 3 * seg_idx
        feat_cols = [first_col, first_col + 1, first_col + 2]

        # rename() returns a new frame, so the assignments below never
        # write back into the caller's DataFrame.
        seg_df = df[key_cols + feat_cols].rename(
            columns=dict(zip(feat_cols, feature_names))
        )
        seg_df['position'] = seg_df['position'] + (seg_idx - 1)
        seg_df['fivemers'] = seg_df['fivemers'].str[seg_idx:seg_idx + 5]
        segments.append(seg_df)

    return pd.concat(segments)

In [6]:
# Reshape the wide frame so that each of the three segments of a read gets its own row.
final_df = rearrange(df2)
final_df

Unnamed: 0,gene_id,position,fivemers,dwelling_time,sd_current,mean_current
0,ENST00000000233,243,AAGAC,0.00299,2.06,125.0
1,ENST00000000233,243,AAGAC,0.00631,2.53,125.0
2,ENST00000000233,243,AAGAC,0.00465,3.92,109.0
3,ENST00000000233,243,AAGAC,0.00398,2.06,125.0
4,ENST00000000233,243,AAGAC,0.00664,2.92,120.0
...,...,...,...,...,...,...
11027101,ENST00000641834,1694,GACAT,0.01000,2.01,76.4
11027102,ENST00000641834,1694,GACAT,0.01760,2.61,74.6
11027103,ENST00000641834,1694,GACAT,0.00316,2.28,85.3
11027104,ENST00000641834,1694,GACAT,0.00664,4.44,76.8


In [7]:
# aggregate and find min, max, mean, median, std, skew for each feature
# String names are used for every statistic: passing the builtins `min`/`max`
# to agg() is deprecated in favour of the equivalent string aliases.
summary_stats = ['min', 'max', 'mean', 'median', 'std', 'skew']
feature_cols = ['dwelling_time', 'sd_current', 'mean_current']
grouped_df = (
    final_df
    .groupby(by=['gene_id', 'position', 'fivemers'])
    .agg({col: summary_stats for col in feature_cols})
    .reset_index()
)

# rename the columns
# Flatten the two-level column index into "<feature>_<stat>". Iterating the
# MultiIndex directly yields tuples (no deprecated .ravel() call), and skipping
# empty levels keeps the key columns as plain 'gene_id', 'position', 'fivemers'.
grouped_df.columns = ["_".join(part for part in col if part) for col in grouped_df.columns]

  grouped_df.columns = ["_".join(x) for x in grouped_df.columns.ravel()]


In [8]:
grouped_df

Unnamed: 0,gene_id,position,fivemers,dwelling_time_min,dwelling_time_max,dwelling_time_mean,dwelling_time_median,dwelling_time_std,dwelling_time_skew,sd_current_min,...,sd_current_mean,sd_current_median,sd_current_std,sd_current_skew,mean_current_min,mean_current_max,mean_current_mean,mean_current_median,mean_current_std,mean_current_skew
0,ENST00000000233,243,AAGAC,0.00199,0.0339,0.008264,0.006970,0.005399,1.683065,1.770,...,4.223784,3.730,1.848027,1.839794,102.0,132.0,123.702703,125.00,4.957783,-2.467387
1,ENST00000000233,244,AGACC,0.00232,0.0296,0.009373,0.007970,0.005379,1.187657,1.040,...,7.382162,6.650,3.311633,0.411969,111.0,133.0,125.913514,126.00,2.772748,-1.284248
2,ENST00000000233,245,GACCA,0.00232,0.0329,0.007345,0.005980,0.004578,1.859260,0.773,...,4.386989,3.440,2.914112,1.183484,73.1,88.3,80.570270,80.50,2.529013,0.121925
3,ENST00000000233,260,CAAAC,0.00199,0.0222,0.006609,0.005640,0.003599,1.423941,0.919,...,3.216424,2.880,1.694099,4.216090,98.3,115.0,109.681395,110.00,2.989886,-1.108753
4,ENST00000000233,261,AAACT,0.00166,0.0267,0.006813,0.005885,0.003778,1.697649,0.789,...,3.226535,3.000,1.240656,2.622497,96.1,116.0,107.889535,108.00,3.536825,-0.437179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365509,ENST00000641834,1537,TGACC,0.00232,0.0260,0.007419,0.006810,0.003895,2.122487,2.380,...,6.552982,5.790,2.346295,0.568151,114.0,133.0,123.263158,124.00,3.720245,-0.022975
365510,ENST00000641834,1538,GACCA,0.00258,0.0144,0.006472,0.006310,0.002648,0.937447,1.190,...,2.540877,2.330,1.119528,1.374889,77.6,87.4,82.289474,82.00,2.285332,0.218421
365511,ENST00000641834,1692,TTGAC,0.00232,0.0215,0.008788,0.007090,0.005528,0.789954,1.260,...,4.090577,3.160,2.468300,1.427842,92.0,111.0,105.807692,107.00,4.018382,-1.933418
365512,ENST00000641834,1693,TGACA,0.00232,0.0166,0.006908,0.006705,0.003246,0.693597,2.990,...,8.702885,8.675,2.986582,-0.042227,103.0,122.0,113.134615,113.00,3.575674,-0.216095


In [9]:
# split the fivemers into five columns
# Index each character position directly instead of splitting on an empty
# pattern, which depends on regex empty-match behaviour and on dropping the
# empty leading/trailing pieces. order_df keeps grouped_df's index, so the
# column-wise concat below aligns row for row.
order_df = pd.DataFrame(
    {f'order_{i + 1}': grouped_df['fivemers'].str[i] for i in range(5)}
)
grouped_df2 = pd.concat([grouped_df, order_df], axis=1)
grouped_df2

Unnamed: 0,gene_id,position,fivemers,dwelling_time_min,dwelling_time_max,dwelling_time_mean,dwelling_time_median,dwelling_time_std,dwelling_time_skew,sd_current_min,...,mean_current_max,mean_current_mean,mean_current_median,mean_current_std,mean_current_skew,order_1,order_2,order_3,order_4,order_5
0,ENST00000000233,243,AAGAC,0.00199,0.0339,0.008264,0.006970,0.005399,1.683065,1.770,...,132.0,123.702703,125.00,4.957783,-2.467387,A,A,G,A,C
1,ENST00000000233,244,AGACC,0.00232,0.0296,0.009373,0.007970,0.005379,1.187657,1.040,...,133.0,125.913514,126.00,2.772748,-1.284248,A,G,A,C,C
2,ENST00000000233,245,GACCA,0.00232,0.0329,0.007345,0.005980,0.004578,1.859260,0.773,...,88.3,80.570270,80.50,2.529013,0.121925,G,A,C,C,A
3,ENST00000000233,260,CAAAC,0.00199,0.0222,0.006609,0.005640,0.003599,1.423941,0.919,...,115.0,109.681395,110.00,2.989886,-1.108753,C,A,A,A,C
4,ENST00000000233,261,AAACT,0.00166,0.0267,0.006813,0.005885,0.003778,1.697649,0.789,...,116.0,107.889535,108.00,3.536825,-0.437179,A,A,A,C,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365509,ENST00000641834,1537,TGACC,0.00232,0.0260,0.007419,0.006810,0.003895,2.122487,2.380,...,133.0,123.263158,124.00,3.720245,-0.022975,T,G,A,C,C
365510,ENST00000641834,1538,GACCA,0.00258,0.0144,0.006472,0.006310,0.002648,0.937447,1.190,...,87.4,82.289474,82.00,2.285332,0.218421,G,A,C,C,A
365511,ENST00000641834,1692,TTGAC,0.00232,0.0215,0.008788,0.007090,0.005528,0.789954,1.260,...,111.0,105.807692,107.00,4.018382,-1.933418,T,T,G,A,C
365512,ENST00000641834,1693,TGACA,0.00232,0.0166,0.006908,0.006705,0.003246,0.693597,2.990,...,122.0,113.134615,113.00,3.575674,-0.216095,T,G,A,C,A


In [10]:
# count the number of occurrences of a letter in a word
# input: str, str
# output: int
def find(word, letter):
    """Return how many elements of ``word`` compare equal to ``letter``.

    ``word`` is iterated element by element (character by character for a
    string), so a multi-character ``letter`` never matches a string's
    individual characters and yields 0.
    """
    return sum(1 for symbol in word if symbol == letter)

In [11]:
# count the A,C,G,T in the fivemers
# One count_<base> column per nucleotide, added in the order A, C, G, T.
for nucleotide in 'ACGT':
    grouped_df2['count_' + nucleotide] = grouped_df2['fivemers'].map(
        lambda kmer: find(kmer, nucleotide)
    )
grouped_df2

Unnamed: 0,gene_id,position,fivemers,dwelling_time_min,dwelling_time_max,dwelling_time_mean,dwelling_time_median,dwelling_time_std,dwelling_time_skew,sd_current_min,...,mean_current_skew,order_1,order_2,order_3,order_4,order_5,count_A,count_C,count_G,count_T
0,ENST00000000233,243,AAGAC,0.00199,0.0339,0.008264,0.006970,0.005399,1.683065,1.770,...,-2.467387,A,A,G,A,C,3,1,1,0
1,ENST00000000233,244,AGACC,0.00232,0.0296,0.009373,0.007970,0.005379,1.187657,1.040,...,-1.284248,A,G,A,C,C,2,2,1,0
2,ENST00000000233,245,GACCA,0.00232,0.0329,0.007345,0.005980,0.004578,1.859260,0.773,...,0.121925,G,A,C,C,A,2,2,1,0
3,ENST00000000233,260,CAAAC,0.00199,0.0222,0.006609,0.005640,0.003599,1.423941,0.919,...,-1.108753,C,A,A,A,C,3,2,0,0
4,ENST00000000233,261,AAACT,0.00166,0.0267,0.006813,0.005885,0.003778,1.697649,0.789,...,-0.437179,A,A,A,C,T,3,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365509,ENST00000641834,1537,TGACC,0.00232,0.0260,0.007419,0.006810,0.003895,2.122487,2.380,...,-0.022975,T,G,A,C,C,1,2,1,1
365510,ENST00000641834,1538,GACCA,0.00258,0.0144,0.006472,0.006310,0.002648,0.937447,1.190,...,0.218421,G,A,C,C,A,2,2,1,0
365511,ENST00000641834,1692,TTGAC,0.00232,0.0215,0.008788,0.007090,0.005528,0.789954,1.260,...,-1.933418,T,T,G,A,C,1,1,1,2
365512,ENST00000641834,1693,TGACA,0.00232,0.0166,0.006908,0.006705,0.003246,0.693597,2.990,...,-0.216095,T,G,A,C,A,2,1,1,1


In [12]:
# Write the full feature table to CSV; index=False leaves the row index out of the file.
grouped_df2.to_csv('../data/grouped_data.csv', index=False)

In [13]:
# Number of rows in grouped_df2.
len(grouped_df2)

365514

In [16]:
# Keep only the first quarter of the rows (rounded down) as a smaller subset.
quarter_len = len(grouped_df2) // 4
small_grouped_df = grouped_df2.iloc[:quarter_len]
small_grouped_df

Unnamed: 0,gene_id,position,fivemers,dwelling_time_min,dwelling_time_max,dwelling_time_mean,dwelling_time_median,dwelling_time_std,dwelling_time_skew,sd_current_min,...,mean_current_skew,order_1,order_2,order_3,order_4,order_5,count_A,count_C,count_G,count_T
0,ENST00000000233,243,AAGAC,0.00199,0.0339,0.008264,0.006970,0.005399,1.683065,1.770,...,-2.467387,A,A,G,A,C,3,1,1,0
1,ENST00000000233,244,AGACC,0.00232,0.0296,0.009373,0.007970,0.005379,1.187657,1.040,...,-1.284248,A,G,A,C,C,2,2,1,0
2,ENST00000000233,245,GACCA,0.00232,0.0329,0.007345,0.005980,0.004578,1.859260,0.773,...,0.121925,G,A,C,C,A,2,2,1,0
3,ENST00000000233,260,CAAAC,0.00199,0.0222,0.006609,0.005640,0.003599,1.423941,0.919,...,-1.108753,C,A,A,A,C,3,2,0,0
4,ENST00000000233,261,AAACT,0.00166,0.0267,0.006813,0.005885,0.003778,1.697649,0.789,...,-0.437179,A,A,A,C,T,3,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91373,ENST00000290158,4258,AACCA,0.00199,0.0209,0.005729,0.004975,0.002932,1.703257,0.795,...,0.577244,A,A,C,C,A,3,2,0,0
91374,ENST00000290158,4267,GAAAC,0.00199,0.0332,0.008698,0.007140,0.005692,1.319556,1.030,...,-0.657472,G,A,A,A,C,3,1,1,0
91375,ENST00000290158,4268,AAACT,0.00232,0.0515,0.011279,0.008400,0.008445,2.277755,1.240,...,-0.761313,A,A,A,C,T,3,1,0,1
91376,ENST00000290158,4269,AACTG,0.00217,0.0398,0.008962,0.006320,0.007318,2.120694,1.070,...,0.397059,A,A,C,T,G,2,1,1,1


In [17]:
# Write the first-quarter subset to CSV; index=False leaves the row index out of the file.
small_grouped_df.to_csv('../data/small_grouped_data.csv', index=False)