## Import Libraries

In [1]:
import json
import pandas as pd

## Load Files for Parsing
#### If the files have already been parsed, skip ahead to the "Read Parsed Files" section

In [3]:
json_data_path = "/Users/claudia/Downloads/Project 2/data.json"
info_data_path = "/Users/claudia/Downloads/Project 2/data.info"

## data.json is parsed line by line: every non-blank line is one JSON document.
## The context manager closes the file handle as soon as parsing is finished
## (a bare open() inside the comprehension would leave it open), and blank
## lines are skipped so a trailing empty line cannot crash json.loads.
with open(json_data_path, 'r') as f:
    data = [json.loads(line) for line in f if line.strip()]

## data.info is kept as a list of raw text lines (without line endings)
with open(info_data_path, 'r') as f:
    info = f.read().splitlines()

In [32]:
def get_key(dictionary):
    """Return the first key of `dictionary` (in its iteration order).

    Raises IndexError when the dictionary has no keys.
    """
    return [*dictionary.keys()][0]

def concat_col(transcript, position, nucleotide, reads, n):
    """Prefix a block of read measurements with three constant columns.

    Parameters
    ----------
    transcript, position, nucleotide : values repeated on each of the `n` rows
    reads : DataFrame whose columns become 'dwelling_time', 'std' and 'mean'
    n : number of times each constant value is repeated

    Returns
    -------
    DataFrame with columns
    ['transcript', 'position', 'nucleotides', 'dwelling_time', 'std', 'mean'].
    """
    ## one single-column dataframe per constant value, each with n rows
    constant_cols = [
        pd.DataFrame([value] * n)
        for value in (transcript, position, nucleotide)
    ]

    ## place the constant columns to the left of the reads (aligned on the index)
    combined = pd.concat(constant_cols + [reads], axis = 1)
    combined.columns = ['transcript', 'position', 'nucleotides', 'dwelling_time', 'std', 'mean']
    return combined

def parse_line(line):
    """Expand one parsed JSON record into a long-format dataframe of reads.

    `line` is a nested dict of the form {transcript: {position: {sequence: reads}}};
    only the first key at each level is used. Each read contributes three output
    rows: one for position - 1, one for the position itself and one for position + 1.

    Returns a DataFrame with columns
    ['transcript', 'position', 'nucleotides', 'dwelling_time', 'std', 'mean'].
    """
    ## walk down the nesting: transcript -> centre position -> nucleotide sequence
    transcript = get_key(line)
    centre = get_key(line[transcript])
    ## neighbouring positions are kept as strings, like the centre position
    before = str(int(centre) - 1)
    after = str(int(centre) + 1)

    ## assumes the sequence is 7 characters long, so that each slice below
    ## is a 5-character window - TODO confirm against the data
    sequence = get_key(line[transcript][centre])
    read_rows = line[transcript][centre][sequence]
    n_reads = len(read_rows)

    ## assumes every read holds 9 values (3 per position) - TODO confirm
    reads = pd.DataFrame(read_rows)

    ## (position, 5-nucleotide window, matching read columns) for -1, 0 and +1
    windows = [
        (before, sequence[0:5], reads.iloc[:, :3]),
        (centre, sequence[1:6], reads.iloc[:, 3:6]),
        (after, sequence[2:], reads.iloc[:, 6:]),
    ]
    frames = [
        concat_col(transcript, pos, window, values, n_reads)
        for pos, window, values in windows
    ]

    ## stack the three per-position dataframes on top of each other
    return pd.concat(frames, axis = 0)

In [35]:
## parse every record into its own dataframe
reads_df = [parse_line(record) for record in data]

## stack the per-record dataframes into a single dataframe
data_df = pd.concat(reads_df, axis = 0)

In [46]:
## number of rows in data_df; used as the end of the last saved slice
total_rows = data_df.shape[0]

def save_file(dataframe, start, stop, filename):
    """Write the rows at positions start..stop-1 of `dataframe` to `filename` as parquet."""
    row_slice = dataframe.iloc[start:stop, :]
    pd.DataFrame(row_slice).to_parquet(filename)

## split data into 7 files to save and upload:
## files 1-6 hold 5,000,000 rows each, file 7 holds everything from row 30,000,000 onwards
chunk_edges = [0, 5000000, 10000000, 15000000, 20000000, 25000000, 30000000, total_rows]
for part_no, (row_start, row_stop) in enumerate(zip(chunk_edges[:-1], chunk_edges[1:]), start = 1):
    save_file(data_df, row_start, row_stop, f"data_{part_no}.parquet")

In [4]:
## transform info data into dataframe: the first line holds the column names,
## every remaining line is one comma-separated record
info_list = [row.split(",") for row in info]
header, records = info_list[0], info_list[1:]
info_df = pd.DataFrame(records)
info_df.columns = header
## values are kept as the strings produced by split(); nothing here converts them
info_df.to_parquet("info.parquet")

## Read Parsed Files

In [5]:
## NOTE(review): the parsing section above only writes data_1.parquet ... data_7.parquet;
## nothing visible in this notebook creates data.parquet, so it is presumably
## assembled elsewhere - confirm
parsed_data_path = "/Users/claudia/Downloads/Project 2/data.parquet"
data_df = pd.read_parquet(parsed_data_path)

## Left Join DataFrames

In [6]:
## left join: keep every row of data_df and attach the info_df row whose
## (transcript_id, transcript_position) equals (transcript, position)
complete_df = pd.merge(
    data_df,
    info_df,
    how = "left",
    left_on = ["transcript", "position"],
    right_on = ["transcript_id", "transcript_position"],
)

In [10]:
## keep only the read columns plus the label brought in by the join
complete_df = complete_df.loc[
    :,
    ['transcript', 'position', 'nucleotides', 'dwelling_time', 'std', 'mean', 'label'],
]

In [11]:
## preview the first five rows of the merged dataframe
complete_df.head(5)

Unnamed: 0,transcript,position,nucleotides,dwelling_time,std,mean,label
0,ENST00000000233,243,AAGAC,0.00299,2.06,125.0,
1,ENST00000000233,243,AAGAC,0.00631,2.53,125.0,
2,ENST00000000233,243,AAGAC,0.00465,3.92,109.0,
3,ENST00000000233,243,AAGAC,0.00398,2.06,125.0,
4,ENST00000000233,243,AAGAC,0.00664,2.92,120.0,


In [15]:
## split merged data into 7 files to save and upload:
## files 1-6 hold 5,000,000 rows each, file 7 holds everything from row 30,000,000 onwards
## (save_file is defined in the parsing section above, so that cell has to be run first)
merged_edges = [0, 5000000, 10000000, 15000000, 20000000, 25000000, 30000000, len(complete_df)]
for part_no, (row_start, row_stop) in enumerate(zip(merged_edges[:-1], merged_edges[1:]), start = 1):
    save_file(complete_df, row_start, row_stop, f"merged_data_{part_no}.parquet")