In [None]:
%pip install nbimporter
import nbimporter
import main


## Access SG-NEx data through AWS

In [None]:
# List every sample folder that has m6Anet-processed data for RNA modification detection.
# --no-sign-request makes the AWS CLI skip request signing, so no AWS credentials are needed.
!aws s3 ls --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/  

# Mirror those sample folders into ../data/sg-nex-data/raw.
# `aws s3 sync` only copies files that are missing or different locally, so re-running resumes a partial download.
!aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/ ../data/sg-nex-data/raw

In [None]:
import os
import json
import pandas as pd
import shutil

def process_sg_nex_json_files(data_directory, output_directory):
    """Convert every folder of line-delimited JSON files into one CSV, then delete the folder.

    For each directory under ``data_directory`` that contains at least one
    ``.json`` file, all of its JSON files are read (one JSON object per line),
    the records are passed to ``main.flatten_json``, and the result is written
    to ``<output_directory>/<folder name>.csv``. The source folder is removed
    only after its CSV has been written.

    Args:
        data_directory: Root directory that is searched recursively.
        output_directory: Directory receiving the CSV files; created if missing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The folder
            is left in place in that case.

    Warning:
        This function is destructive: a converted folder is deleted together
        with everything else it contains. If ``data_directory`` itself holds
        ``.json`` files it is removed as well (after all nested folders have
        been converted).
    """
    os.makedirs(output_directory, exist_ok=True)

    # Walk bottom-up: nested folders are converted before their parent is
    # visited, so removing a parent can never discard unconverted JSON files.
    for root, _subdirs, files in os.walk(data_directory, topdown=False):
        json_files = sorted(name for name in files if name.endswith('.json'))
        if not json_files:
            continue

        # Gather the records of *all* JSON files in this folder before touching
        # the folder, so a folder with several files yields one complete CSV.
        records = []
        for file in json_files:
            file_path = os.path.join(root, file)
            print(f"Processing: {file_path}")

            with open(file_path, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line:  # tolerate blank lines, e.g. a trailing newline
                        records.append(json.loads(line))

        # NOTE(review): main.flatten_json is assumed to return something that
        # pd.DataFrame accepts (e.g. a list of flat dicts) -- confirm in main.
        flattened_data = main.flatten_json(records)
        df = pd.DataFrame(flattened_data)

        # normpath drops a trailing separator so basename is never empty.
        folder_name = os.path.basename(os.path.normpath(root))
        output_path = os.path.join(output_directory, f"{folder_name}.csv")

        df.to_csv(output_path, index=False)
        print(f"Saved: {output_path}")

        # Only reached when the export above succeeded.
        shutil.rmtree(root)
        print(f"Removed folder: {root}")

# Example usage: convert the raw folders downloaded into ../data/sg-nex-data/raw
# into one CSV per folder under ../data/sg-nex-data/processed.
# NOTE: this is destructive -- process_sg_nex_json_files deletes each raw folder
# once its CSV has been written, so a re-run needs the raw data downloaded again.
data_directory = '../data/sg-nex-data/raw'  
output_directory = '../data/sg-nex-data/processed'
process_sg_nex_json_files(data_directory, output_directory)


## Prediction of SG-NEx data using Random Forest

In [None]:
from joblib import load

# Load the serialized classifier from rf_classifier.joblib (path is relative to the
# current working directory). It is read as a global by the prediction code below.
# NOTE(review): joblib files are pickle-based and can execute code on load --
# only load a model file that comes from a trusted source.
RF_classifier = load('rf_classifier.joblib')  # Load the model


In [None]:
import os
import pandas as pd

# Columns handed to the classifier, in exactly this order: for each of the
# three positions (-1, 0, +1) the dwelling-time, standard-deviation and
# mean-current aggregates.
PREDICTION_FEATURE_COLUMNS = [
    '-1_dwelling_time_mean', '-1_dwelling_time_min', '-1_dwelling_time_max',
    '-1_standard_dev_mean',
    '-1_mean_current_mean', '-1_mean_current_min', '-1_mean_current_max',
    '0_dwelling_time_mean', '0_dwelling_time_min', '0_dwelling_time_max',
    '0_standard_dev_mean',
    '0_mean_current_mean', '0_mean_current_min', '0_mean_current_max',
    '+1_dwelling_time_mean', '+1_dwelling_time_min', '+1_dwelling_time_max',
    '+1_standard_dev_mean',
    '+1_mean_current_mean', '+1_mean_current_min', '+1_mean_current_max',
]


def predict_processed_csv_files(processed_directory, prediction_directory, classifier=None):
    """Score every processed CSV file and write one prediction CSV per input.

    Each ``*.csv`` file in ``processed_directory`` is read, aggregated with
    ``main.aggregate_by_transcript_position`` and scored with ``main.predict``.
    The result is sorted by descending score and written to
    ``<prediction_directory>/<input name without .csv>_prediction.csv`` with
    the columns ``transcript_id``, ``transcript_position`` and ``score``.

    Args:
        processed_directory: Directory containing the feature CSV files.
            Only its direct children are considered (no recursion).
        prediction_directory: Directory receiving the prediction CSV files;
            created if missing.
        classifier: Model passed to ``main.predict``. Defaults to the
            notebook-level ``RF_classifier`` when omitted.

    Raises:
        KeyError: If the aggregated frame lacks one of
            ``PREDICTION_FEATURE_COLUMNS``, ``transcript_id`` or
            ``transcript_position``.
    """
    # Resolve the model once; falling back to the global keeps existing
    # two-argument calls working.
    model = RF_classifier if classifier is None else classifier

    os.makedirs(prediction_directory, exist_ok=True)

    # Sorted so files are processed in a reproducible order
    # (os.listdir returns entries in arbitrary order).
    for file_name in sorted(os.listdir(processed_directory)):
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(processed_directory, file_name)
        print(f"Processing: {file_path}")

        features = pd.read_csv(file_path)

        # NOTE(review): assumes the aggregation yields one row per
        # (transcript_id, transcript_position) with the columns listed in
        # PREDICTION_FEATURE_COLUMNS -- confirm against main.
        features_agg = main.aggregate_by_transcript_position(features)

        X_test = features_agg[PREDICTION_FEATURE_COLUMNS]
        id_test = features_agg[['transcript_id', 'transcript_position']]

        # main.predict is expected to return the id columns plus 'probability'.
        result = main.predict(model, id_test, X_test)
        result = result.sort_values(by='probability', ascending=False)
        result = result.rename(columns={'probability': 'score'})
        result = result[['transcript_id', 'transcript_position', 'score']]

        # Strip only the trailing extension; str.replace would also rewrite a
        # '.csv' that happens to occur in the middle of the file name.
        stem = file_name[:-len('.csv')]
        output_csv_path = os.path.join(prediction_directory, f"{stem}_prediction.csv")

        result.to_csv(output_csv_path, index=False)
        print(f"Results exported to {output_csv_path}")

# Example usage: score every CSV in ../data/sg-nex-data/processed and write the
# matching *_prediction.csv files to ../data/sg-nex-data/predictions.
# Requires RF_classifier to have been loaded by an earlier cell.
processed_directory = '../data/sg-nex-data/processed'
prediction_directory = '../data/sg-nex-data/predictions'
predict_processed_csv_files(processed_directory, prediction_directory)


In [None]:
import os
import pandas as pd

# Directory containing the per-sample prediction CSV files
prediction_directory = '../data/sg-nex-data/predictions'

# One list of per-sample dataframes for each cell line. The keys double as the
# substrings searched for in the file names, so supporting another cell line
# only requires adding a key here.
cell_line_dataframes = {
    'A549': [],
    'Hct116': [],
    'K562': [],
    'HepG2': [],
    'MCF7': []
}

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated dataframes reproducible between runs and machines.
for file_name in sorted(os.listdir(prediction_directory)):
    if not file_name.endswith('.csv'):
        continue

    # First cell line (in dictionary order) whose name occurs in the file name.
    # The match is a case-sensitive substring test.
    cell_line = next(
        (name for name in cell_line_dataframes if name in file_name),
        None,
    )
    if cell_line is None:
        # File does not belong to a tracked cell line; skip it without reading.
        continue

    file_path = os.path.join(prediction_directory, file_name)
    cell_line_dataframes[cell_line].append(pd.read_csv(file_path))

# Collapse each list into a single dataframe. A cell line without any files
# gets an empty dataframe, because pd.concat raises on an empty list.
cell_line_dataframes = {
    cell_line: pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
    for cell_line, dfs in cell_line_dataframes.items()
}

# Now, cell_line_dataframes maps each cell line to one concatenated dataframe

# Print the first few rows of each cell line dataframe to verify the grouping
for cell_line, df in cell_line_dataframes.items():
    print(f"First few rows of {cell_line} dataframe:")
    print(df.head())
    print("\n")
