In [1]:
# Import functions for analysis of SPLASH np.arrays
import io_ops as iops
import find_interactions as fi
import pandas as pd

In [2]:
# Project locations and viral features used by every cell below.
# NOTE: DIRECTORY is an absolute, machine-specific path; adjust it when running elsewhere.
DIRECTORY = "/home/ru27wav/Projects/gl_iav-splash_freiburg"
INPUT = DIRECTORY + "/data/arrays_IAV_wt_vs_mut"
RESULT = DIRECTORY + "/results/IAV_wt_vs_mut"

# Segment names passed to iops.read_arrays.
iav_segments = [
    "PB2",
    "PB1",
    "PA",
    "HA",
    "NP",
    "NA",
    "M",
    "NS",
]

# Strain labels; they are also the prefixes of the per-strain result files.
strains = ["wt", "mut"]

In [3]:
# Load the arrays of both strains from their sub-directories of INPUT.
# iops.read_arrays returns a pair per strain, which is unpacked into two
# dictionaries (replicate directory -> combinations, combination -> arrays,
# going by the variable names).
wt_input_dir = f"{INPUT}/wt"
mut_input_dir = f"{INPUT}/mut"

wt_d_repDir2Combinations, wt_d_combinations2arrays = iops.read_arrays(wt_input_dir, iav_segments)
mut_d_repDir2Combinations, mut_d_combinations2arrays = iops.read_arrays(mut_input_dir, iav_segments)

In [4]:
# Unpack the arrays and filter the regions with readcounts greater than the
# mean of all values (as done by fi.mean_filter).
def _mean_filter_all(d_combinations2arrays):
    """Apply fi.mean_filter to every array of every combination.

    Parameters
    ----------
    d_combinations2arrays : dict
        Maps a segment combination to a list of arrays.

    Returns
    -------
    dict
        Same keys; each value is the list of filtered arrays in the same
        order as the input list. A combination with an empty list maps to
        an empty list (the previous index-based loop silently dropped it).
    """
    return {
        combination: [fi.mean_filter(array) for array in arrays]
        for combination, arrays in d_combinations2arrays.items()
    }


# One helper call per strain replaces the two copy-pasted loops.
wt_d_combinations2arrays_filtered = _mean_filter_all(wt_d_combinations2arrays)
mut_d_combinations2arrays_filtered = _mean_filter_all(mut_d_combinations2arrays)

In [5]:
# Merge the binary arrays
# This is kind of a hack (ugly hack), it doesn't generalize very well:
# only the FIRST filtered mut array of each combination is merged with the
# filtered wt arrays.

d_combinations2arrays_combined = {}
for combination, wt_arrays in wt_d_combinations2arrays_filtered.items():
    # Build a fresh list instead of appending to the wt list. Appending
    # mutated wt_d_combinations2arrays_filtered in place, so every re-run of
    # this cell added one more mut array to the merge input.
    arrays_to_merge = [*wt_arrays, mut_d_combinations2arrays_filtered[combination][0]]
    d_combinations2arrays_combined[combination] = fi.combine_filters(arrays_to_merge)


In [6]:
# Coordinates per combination, extracted from the merged array with
# fi.extract_coordinates.
d_combinations2coordinates = {
    combination: fi.extract_coordinates(array)
    for combination, array in d_combinations2arrays_combined.items()
}

In [7]:
# Regions per combination, derived from the coordinates with
# fi.extract_regions.
d_combinations2regions = {
    combination: fi.extract_regions(coordinates)
    for combination, coordinates in d_combinations2coordinates.items()
}

In [8]:
# Save the regions of each combination to its own CSV file in RESULT.
# The previous loop ignored the loop value and wrote the ENTIRE
# d_combinations2regions dictionary into every file; now each file holds
# only the entry of its own combination (same row-per-combination layout).
# NOTE(review): if one row per region is wanted instead, and the regions
# object is a dict, use pd.DataFrame.from_dict(regions, orient="index").
for combination, regions in d_combinations2regions.items():
    pd.DataFrame.from_dict({combination: regions}, orient="index").to_csv(
        f"{RESULT}/{combination}.csv"
    )

In [9]:
# Mean count value of every region, computed with fi.readcounts_to_means
# once per (unfiltered) array, so each combination maps to a list with one
# entry per array of that strain.
wt_d_combinations2means = {
    combination: [
        fi.readcounts_to_means(regions, array)
        for array in wt_d_combinations2arrays[combination]
    ]
    for combination, regions in d_combinations2regions.items()
}

mut_d_combinations2means = {
    combination: [
        fi.readcounts_to_means(regions, array)
        for array in mut_d_combinations2arrays[combination]
    ]
    for combination, regions in d_combinations2regions.items()
}

In [10]:
# Write one "<strain>_<combination>_interactions.csv" table per strain and
# combination via iops.format_means_to_table (all wt files first, then mut).
for prefix, d_combinations2means in (
    ("wt", wt_d_combinations2means),
    ("mut", mut_d_combinations2means),
):
    for combination, means in d_combinations2means.items():
        iops.format_means_to_table(
            means,
            output_path=f"{RESULT}/{prefix}_{combination}_interactions.csv",
        )

In [11]:
# Read the per-strain interaction tables back from RESULT:
# slosh_dataset[combination][strain] is the DataFrame of that CSV file
# (first column as index, no header row).
slosh_dataset = {}
for combination in d_combinations2arrays_combined:
    strain2df = {}
    for strain in strains:
        strain2df[strain] = pd.read_csv(
            f"{RESULT}/{strain}_{combination}_interactions.csv",
            index_col=0,
            header=None,
        )
    slosh_dataset[f"{combination}"] = strain2df

In [12]:
# Change column name of dataframes: every column of a strain's table is
# labelled with the strain name.
for strain2df in slosh_dataset.values():
    for strain, df in strain2df.items():
        df.columns = [strain] * len(df.columns)

# Consolidate dataframes (strains side by side) and rename interaction ids
# to "<combination>-<id>" so they stay unique once all combinations are
# stacked.
slosh_dataset_consolidated = {}
for combination, strain2df in slosh_dataset.items():
    consolidated = pd.concat(list(strain2df.values()), axis=1)
    # A single function-based rename relabels every id in one pass and keeps
    # the index name. The previous per-id loop copied the whole frame once
    # per row (quadratic) and shadowed the builtin `id`.
    slosh_dataset_consolidated[combination] = consolidated.rename(
        index=lambda region_id: f"{combination}-{region_id}"
    )

# Stack all combinations into one table.
slosh_df = pd.concat(slosh_dataset_consolidated.values(), axis=0)

In [13]:
# Generate dataframe with fake data (ugly hack): the "mut" data is appended
# two more times so that the table has as many "mut" columns as "wt" columns
# in the run shown below. The extra columns are exact copies, not real
# replicates.
# The table is rebuilt from slosh_dataset_consolidated rather than from
# slosh_df itself; the self-referential concat added further "mut" columns
# every time this cell was re-run.
slosh_base = pd.concat(slosh_dataset_consolidated.values(), axis=0)
slosh_df = pd.concat([slosh_base, slosh_base["mut"], slosh_base["mut"]], axis=1)
slosh_df

Unnamed: 0_level_0,wt,wt,wt,mut,mut,mut
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HA_PA-1,86,25,151,0,0,0
HA_PA-2,86,26,155,0,0,0
HA_PA-3,87,26,159,2,2,2
HA_PA-4,90,28,159,2,2,2
HA_PA-5,92,28,162,2,2,2
...,...,...,...,...,...,...
NP_NP-1189,75,37,100,79,79,79
NP_NP-1190,100,75,76,281,281,281
NP_NP-1191,86,28,69,112,112,112
NP_NP-1192,66,83,293,133,133,133


In [14]:
# Persist the combined wt/mut table in RESULT.
wt_mut_csv = f"{RESULT}/wt_mut_interactions.csv"
slosh_df.to_csv(wt_mut_csv)

In [15]:
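# Manual step (ugly hack): run the R script (presumably DESeq2, judging by the output file name) on
# wt_mut_interactions.csv before executing the next cell; the next cell expects it to have written
# deseq2_output_wt_mut_interactions.csv into the results directory.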

In [16]:
# Load the table written by the manually run R script. The path is built
# from RESULT like every other result file (it resolves to the same location
# as the previously duplicated absolute path).
deseq2_output = pd.read_csv(
    f"{RESULT}/deseq2_output_wt_mut_interactions.csv",
    index_col=0,
    header=0,
)