In [23]:
import pandas as pd
import json

# Join with Labels

In [24]:
# Load the raw reads table; the 'reads' column stores one JSON-encoded list per row,
# so decode it into a Python list and make sure the position is an integer.
df = pd.read_csv('../data/data.csv')
df['reads'] = df['reads'].map(json.loads)
df = df.astype({'transcript_position': int})

In [25]:
# Expand every decoded 'reads' list into its own numbered columns (0, 1, 2, ...)
# and place them next to the original columns. Both frames share the default
# positional index, so the column-wise concat lines rows up one-to-one.
df1 = pd.DataFrame(df['reads'].tolist())
df2 = pd.concat(objs=[df, df1], axis='columns')
df2.head()

Unnamed: 0,transcript_id,transcript_position,fivemers,reads,0,1,2,3,4,5,6,7,8
0,ENST00000000233,244,AAGACCA,"[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0....",0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,10.9,84.1
1,ENST00000000233,244,AAGACCA,"[0.0063100000000000005, 2.5300000000000002, 12...",0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,6.3,80.9
2,ENST00000000233,244,AAGACCA,"[0.0046500000000000005, 3.92, 109.0, 0.0136000...",0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,2.13,79.6
3,ENST00000000233,244,AAGACCA,"[0.00398, 2.06, 125.0, 0.0083, 5.01, 130.0, 0....",0.00398,2.06,125.0,0.0083,5.01,130.0,0.00498,3.78,80.4
4,ENST00000000233,244,AAGACCA,"[0.006640000000000001, 2.92, 120.0, 0.00266, 3...",0.00664,2.92,120.0,0.00266,3.94,129.0,0.013,7.15,82.2


In [26]:
# Load the companion table from data.info (parsed as CSV). It is merged onto the
# reads in the next cell using transcript_id and transcript_position as keys.
data = pd.read_csv('../data/data.info')

In [34]:
# Attach the columns of data.info to every read row. A left join keeps all read
# rows even when no matching (transcript_id, transcript_position) entry exists.
# validate='many_to_one' makes pandas raise a MergeError if data.info contains
# duplicate keys, which would otherwise silently duplicate read rows and skew
# the per-site statistics computed later.
final_df = df2.merge(
    data,
    how='left',
    on=['transcript_id', 'transcript_position'],
    validate='many_to_one',
)

In [35]:
# Display the merged frame to check that the columns from data.info were joined on.
final_df

Unnamed: 0,transcript_id,transcript_position,fivemers,reads,0,1,2,3,4,5,6,7,8,gene_id,label
0,ENST00000000233,244,AAGACCA,"[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0....",0.00299,2.06,125.0,0.01770,10.40,122.0,0.00930,10.90,84.1,ENSG00000004059,0
1,ENST00000000233,244,AAGACCA,"[0.0063100000000000005, 2.5300000000000002, 12...",0.00631,2.53,125.0,0.00844,4.67,126.0,0.01030,6.30,80.9,ENSG00000004059,0
2,ENST00000000233,244,AAGACCA,"[0.0046500000000000005, 3.92, 109.0, 0.0136000...",0.00465,3.92,109.0,0.01360,12.00,124.0,0.00498,2.13,79.6,ENSG00000004059,0
3,ENST00000000233,244,AAGACCA,"[0.00398, 2.06, 125.0, 0.0083, 5.01, 130.0, 0....",0.00398,2.06,125.0,0.00830,5.01,130.0,0.00498,3.78,80.4,ENSG00000004059,0
4,ENST00000000233,244,AAGACCA,"[0.006640000000000001, 2.92, 120.0, 0.00266, 3...",0.00664,2.92,120.0,0.00266,3.94,129.0,0.01300,7.15,82.2,ENSG00000004059,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11027101,ENST00000641834,1693,TTGACAT,"[0.0041800000000000006, 7.49, 108.0, 0.00564, ...",0.00418,7.49,108.0,0.00564,10.20,116.0,0.01000,2.01,76.4,ENSG00000167747,0
11027102,ENST00000641834,1693,TTGACAT,"[0.006640000000000001, 1.9100000000000001, 109...",0.00664,1.91,109.0,0.00598,12.30,110.0,0.01760,2.61,74.6,ENSG00000167747,0
11027103,ENST00000641834,1693,TTGACAT,"[0.00721, 4.58, 105.0, 0.00398, 6.58, 113.0, 0...",0.00721,4.58,105.0,0.00398,6.58,113.0,0.00316,2.28,85.3,ENSG00000167747,0
11027104,ENST00000641834,1693,TTGACAT,"[0.00266, 2.33, 109.0, 0.009130000000000001, 1...",0.00266,2.33,109.0,0.00913,10.40,108.0,0.00664,4.44,76.8,ENSG00000167747,0


In [39]:
def rearrange(df):
    """Reshape one-row-per-read data into one row per (read, segment).

    Each read carries three consecutive segments, and every segment has three
    features: dwelling time of the 5-mer nucleotides, standard deviation of the
    direct RNA current, and mean of the direct RNA current. They are stored in
    the integer-named columns 0-2 (first segment), 3-5 (second) and 6-8 (third).

    For each segment the function emits a copy of the rows in which:
      * ``transcript_position`` is shifted by -1 / 0 / +1,
      * ``fivemers`` is cut down to the 5 characters belonging to that segment
        (offsets 0, 1 and 2 of the sequence stored in the input column),
      * ``relative_position`` is 1, 2 or 3,
      * ``label`` is kept for the middle segment and forced to 0 for the two
        flanking segments.
        NOTE(review): presumably the label only describes the central
        position - confirm that zeroing the neighbours is intended.

    input: df - the merged frame (reads joined with gene_id and label); it must
        contain gene_id, transcript_id, transcript_position, fivemers, label
        and the columns 0..8. The input frame is not modified.
    output: df - the three per-segment frames stacked in segment order. The
        original row index is kept, so every index value appears three times.
    """
    id_cols = ['gene_id', 'transcript_id', 'transcript_position', 'fivemers']
    feature_names = ['dwelling_time', 'sd_current', 'mean_current']

    segments = []
    for seg in range(3):
        raw_cols = [3 * seg + k for k in range(3)]
        seg_df = df[id_cols + raw_cols + ['label']].rename(
            columns=dict(zip(raw_cols, feature_names))
        )

        # -1 for the first segment, 0 for the middle one, +1 for the last.
        offset = seg - 1
        if offset != 0:
            # Vectorized arithmetic instead of a Python lambda per row.
            seg_df['transcript_position'] = seg_df['transcript_position'] + offset

        seg_df['relative_position'] = seg + 1
        # Vectorized slice: the 5 characters starting at this segment's offset.
        seg_df['fivemers'] = seg_df['fivemers'].str[seg:seg + 5]

        if offset != 0:
            # Flanking segments never carry the label of the central position.
            seg_df['label'] = 0

        segments.append(seg_df)

    return pd.concat(segments)

In [40]:
# Reshape the merged reads into one row per (read, 5-mer segment) and display the result.
final_df2 = rearrange(final_df)
final_df2

Unnamed: 0,gene_id,transcript_id,transcript_position,fivemers,dwelling_time,sd_current,mean_current,label,relative_position
0,ENSG00000004059,ENST00000000233,243,AAGAC,0.00299,2.06,125.0,0,1
1,ENSG00000004059,ENST00000000233,243,AAGAC,0.00631,2.53,125.0,0,1
2,ENSG00000004059,ENST00000000233,243,AAGAC,0.00465,3.92,109.0,0,1
3,ENSG00000004059,ENST00000000233,243,AAGAC,0.00398,2.06,125.0,0,1
4,ENSG00000004059,ENST00000000233,243,AAGAC,0.00664,2.92,120.0,0,1
...,...,...,...,...,...,...,...,...,...
11027101,ENSG00000167747,ENST00000641834,1694,GACAT,0.01000,2.01,76.4,0,3
11027102,ENSG00000167747,ENST00000641834,1694,GACAT,0.01760,2.61,74.6,0,3
11027103,ENSG00000167747,ENST00000641834,1694,GACAT,0.00316,2.28,85.3,0,3
11027104,ENSG00000167747,ENST00000641834,1694,GACAT,0.00664,4.44,76.8,0,3


In [43]:
# Aggregate the per-read features of every segment and compute
# min, max, mean, median, std and skew for each feature.
# NOTE(review): groupby drops rows whose key is NaN by default, so reads that found
# no match in the left merge (missing gene_id/label) would vanish here - confirm
# that this is acceptable.

lst = ['gene_id', 'transcript_id', 'transcript_position', 'relative_position', 'fivemers', 'label']

# String names select the optimized groupby implementations; passing the builtins
# min/max is deprecated in recent pandas versions.
agg_funcs = ['min', 'max', 'mean', 'median', 'std', 'skew']
feature_cols = ['dwelling_time', 'sd_current', 'mean_current']

grouped_df = (
    final_df2
    .groupby(by=lst)
    .agg({feature: agg_funcs for feature in feature_cols})
    .reset_index()
)

# Flatten the two-level column index into '<feature>_<stat>' names. The key columns
# have an empty second level, so empty parts are skipped and they keep their plain
# names (no trailing underscore, hence no follow-up rename needed).
grouped_df.columns = [
    "_".join(part for part in col if part) for col in grouped_df.columns
]

  grouped_df.columns = ["_".join(x) for x in grouped_df.columns.ravel()]


In [44]:
grouped_df

Unnamed: 0,gene_id,transcript_id,transcript_position,relative_position,fivemers,label,dwelling_time_min,dwelling_time_max,dwelling_time_mean,dwelling_time_median,...,sd_current_mean,sd_current_median,sd_current_std,sd_current_skew,mean_current_min,mean_current_max,mean_current_mean,mean_current_median,mean_current_std,mean_current_skew
0,ENSG00000000003,ENST00000373020,511,1,ATAAC,0,0.00266,0.0169,0.007248,0.005990,...,2.359500,2.325,0.755704,0.476160,83.5,91.2,86.795000,87.05,1.726717,0.591063
1,ENSG00000000003,ENST00000373020,512,2,TAACT,0,0.00232,0.0461,0.011177,0.007735,...,2.496500,2.375,0.654557,1.323767,94.7,101.0,97.965000,98.05,1.469792,-0.177996
2,ENSG00000000003,ENST00000373020,513,3,AACTC,0,0.00266,0.0219,0.008500,0.006870,...,1.588750,1.570,0.366750,-0.081245,83.5,94.5,92.330000,92.55,2.258574,-3.353925
3,ENSG00000000003,ENST00000373020,688,1,TAAAC,0,0.00232,0.0279,0.009868,0.007640,...,1.972857,1.980,0.433280,-0.045798,96.5,106.0,102.495238,103.00,2.526158,-1.322034
4,ENSG00000000003,ENST00000373020,689,2,AAACA,0,0.00232,0.0179,0.007332,0.006420,...,2.551905,2.430,0.752301,0.517212,91.1,102.0,97.928571,98.70,2.900542,-0.931646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365509,ENSG00000284707,ENST00000641784,3243,2,GAACA,0,0.00232,0.0222,0.007648,0.006580,...,2.998125,2.675,1.378389,3.059431,91.6,105.0,97.346875,97.60,2.678188,0.392733
365510,ENSG00000284707,ENST00000641784,3244,3,AACAA,0,0.00199,0.0136,0.005190,0.004660,...,2.203750,2.130,0.688065,1.758215,80.3,97.4,88.439063,88.70,3.335648,-0.153579
365511,ENSG00000284707,ENST00000641784,3265,1,CTAAC,0,0.00232,0.0187,0.005972,0.005065,...,1.874516,1.760,0.540193,0.531989,86.2,99.7,94.209677,94.80,2.611055,-1.303836
365512,ENSG00000284707,ENST00000641784,3266,2,TAACT,0,0.00232,0.0203,0.006831,0.005785,...,2.194032,2.170,0.414924,0.459611,89.8,103.0,99.730645,99.75,2.094848,-2.010306


In [45]:
# Split the fivemers into five columns, one per base (order_1 .. order_5).
# Positional .str[i] indexing is used rather than str.split(''), which only works
# through empty-regex splitting. assign() overwrites same-named columns, so
# re-running this cell does not append a second set of order_* columns.
order_df = pd.DataFrame(
    {'order_{}'.format(pos + 1): grouped_df['fivemers'].str[pos] for pos in range(5)}
)
grouped_df = grouped_df.assign(**{name: order_df[name] for name in order_df.columns})
grouped_df

Unnamed: 0,gene_id,transcript_id,transcript_position,relative_position,fivemers,label,dwelling_time_min,dwelling_time_max,dwelling_time_mean,dwelling_time_median,...,mean_current_max,mean_current_mean,mean_current_median,mean_current_std,mean_current_skew,order_1,order_2,order_3,order_4,order_5
0,ENSG00000000003,ENST00000373020,511,1,ATAAC,0,0.00266,0.0169,0.007248,0.005990,...,91.2,86.795000,87.05,1.726717,0.591063,A,T,A,A,C
1,ENSG00000000003,ENST00000373020,512,2,TAACT,0,0.00232,0.0461,0.011177,0.007735,...,101.0,97.965000,98.05,1.469792,-0.177996,T,A,A,C,T
2,ENSG00000000003,ENST00000373020,513,3,AACTC,0,0.00266,0.0219,0.008500,0.006870,...,94.5,92.330000,92.55,2.258574,-3.353925,A,A,C,T,C
3,ENSG00000000003,ENST00000373020,688,1,TAAAC,0,0.00232,0.0279,0.009868,0.007640,...,106.0,102.495238,103.00,2.526158,-1.322034,T,A,A,A,C
4,ENSG00000000003,ENST00000373020,689,2,AAACA,0,0.00232,0.0179,0.007332,0.006420,...,102.0,97.928571,98.70,2.900542,-0.931646,A,A,A,C,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365509,ENSG00000284707,ENST00000641784,3243,2,GAACA,0,0.00232,0.0222,0.007648,0.006580,...,105.0,97.346875,97.60,2.678188,0.392733,G,A,A,C,A
365510,ENSG00000284707,ENST00000641784,3244,3,AACAA,0,0.00199,0.0136,0.005190,0.004660,...,97.4,88.439063,88.70,3.335648,-0.153579,A,A,C,A,A
365511,ENSG00000284707,ENST00000641784,3265,1,CTAAC,0,0.00232,0.0187,0.005972,0.005065,...,99.7,94.209677,94.80,2.611055,-1.303836,C,T,A,A,C
365512,ENSG00000284707,ENST00000641784,3266,2,TAACT,0,0.00232,0.0203,0.006831,0.005785,...,103.0,99.730645,99.75,2.094848,-2.010306,T,A,A,C,T


In [46]:
def find(word, letter):
    """Return the number of occurrences of a letter in a word.

    Every element of ``word`` is compared with ``letter`` for equality, one
    at a time.

    input: str, str
    output: int
    """
    return sum(1 for symbol in word if symbol == letter)

In [47]:
# Count the A, C, G, T in the fivemers: one count_<base> column per nucleotide.
# Series.str.count is vectorized and avoids calling a Python-level counter per row;
# each single-letter pattern matches only that literal character.
for base in 'ACGT':
    grouped_df['count_' + base] = grouped_df['fivemers'].str.count(base)
grouped_df

Unnamed: 0,gene_id,transcript_id,transcript_position,relative_position,fivemers,label,dwelling_time_min,dwelling_time_max,dwelling_time_mean,dwelling_time_median,...,mean_current_skew,order_1,order_2,order_3,order_4,order_5,count_A,count_C,count_G,count_T
0,ENSG00000000003,ENST00000373020,511,1,ATAAC,0,0.00266,0.0169,0.007248,0.005990,...,0.591063,A,T,A,A,C,3,1,0,1
1,ENSG00000000003,ENST00000373020,512,2,TAACT,0,0.00232,0.0461,0.011177,0.007735,...,-0.177996,T,A,A,C,T,2,1,0,2
2,ENSG00000000003,ENST00000373020,513,3,AACTC,0,0.00266,0.0219,0.008500,0.006870,...,-3.353925,A,A,C,T,C,2,2,0,1
3,ENSG00000000003,ENST00000373020,688,1,TAAAC,0,0.00232,0.0279,0.009868,0.007640,...,-1.322034,T,A,A,A,C,3,1,0,1
4,ENSG00000000003,ENST00000373020,689,2,AAACA,0,0.00232,0.0179,0.007332,0.006420,...,-0.931646,A,A,A,C,A,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365509,ENSG00000284707,ENST00000641784,3243,2,GAACA,0,0.00232,0.0222,0.007648,0.006580,...,0.392733,G,A,A,C,A,3,1,1,0
365510,ENSG00000284707,ENST00000641784,3244,3,AACAA,0,0.00199,0.0136,0.005190,0.004660,...,-0.153579,A,A,C,A,A,4,1,0,0
365511,ENSG00000284707,ENST00000641784,3265,1,CTAAC,0,0.00232,0.0187,0.005972,0.005065,...,-1.303836,C,T,A,A,C,2,2,0,1
365512,ENSG00000284707,ENST00000641784,3266,2,TAACT,0,0.00232,0.0203,0.006831,0.005785,...,-2.010306,T,A,A,C,T,2,1,0,2


# Export to CSV

In [50]:
# Write the full aggregated feature table to disk, without the index column.
grouped_df.to_csv('../data/grouped_data.csv', index=False)

In [51]:
# Also save a smaller file holding a quarter of the original rows.
# NOTE(review): head() takes the leading rows, not a random sample - confirm that
# the first quarter is representative enough for its intended use.
quarter_rows = len(grouped_df) // 4
small_grouped_df = grouped_df.head(quarter_rows)
small_grouped_df.to_csv('../data/small_grouped_data.csv', index=False)