In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from protzilla.constants.paths import PROJECT_PATH
from protzilla.utilities.transform_dfs import long_to_wide


### Create datasets with simulated missing values (MVs)

In [3]:
# Load the filtered ba39 dataset (no missing values) and reshape it to
# one row per protein and one column per sample.
# The path is assembled with pathlib so it resolves on every OS, not only
# on Windows where "\\" is the separator.
df_long = pd.read_csv(
    Path(PROJECT_PATH) / "user_data" / "runs" / "ba_39_small_gt" / "dataframes" / "df_3.csv"
)
df = long_to_wide(df_long).transpose(copy=False)
# Prefix every column label with "Intensity "; assigning a plain list
# also resets the name of the column index.
df.columns = [f"Intensity {sample}" for sample in df.columns]

In [6]:
# Sanity check: count missing cells per column, then total them.
# The ground-truth frame is expected to contain none.
missing_per_column = df.isna().sum()
missing_per_column.sum()

0

In [8]:
# Print a summary of the wide frame: index range, number of columns,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 481 entries, A0A075B759;A0A0B4J2A2;F5H284;P0DN26;P0DN37;P62937;Q9Y536 to iRT-Kit_WR_fusion
Columns: 130 entries, Intensity AD01_C1_INSOLUBLE_01 to Intensity CTR44_C1_INSOLUBLE_01
dtypes: float64(130)
memory usage: 508.4+ KB


In [7]:
# Per-cell probabilities with which a value is replaced by NaN.
# The first entry stays the integer 0 because each value is interpolated
# into an output file name.
probabilities = [
    0,
    0.05,
    0.1,
    0.15,
    0.2,
    0.25,
    0.3,
    0.4,
    0.5,
]

In [10]:
# Create one dataset per probability: every cell of `df` is independently
# replaced by NaN with probability `p`, and the result is written as a
# tab-separated file.
for p in probabilities:
    # Re-seed on every iteration so each dataset is drawn from the same
    # random stream, independent of loop order or earlier iterations.
    np.random.seed(42)
    na_mask = np.random.choice([True, False], size=df.shape, p=[p, 1 - p])
    print(na_mask)
    na_df = df.mask(na_mask, other=np.nan)
    # add columns to make datasets compatible with MaxQuant import;
    # scalars broadcast over all rows, so no row count is hard-coded
    na_df["Intensity"] = np.nan
    na_df["iBAQ"] = np.nan
    na_df["iBAQ peptides"] = np.nan
    na_df["Gene names"] = "Gene1"
    na_df.index.name = "Protein IDs"
    # pathlib keeps the output path valid on every OS, not only Windows
    na_df.to_csv(
        Path(PROJECT_PATH) / "user_data" / "data" / f"ba_39_small_{p}.csv",
        sep="\t",
    )


[[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]
[[False False False ... False  True False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ...  True False False]
 [False False False ... False False False]
 [False False False ... False False False]]
[[False False False ... False  True False]
 [False False False ... False False  True]
 [False False False ... False False False]
 ...
 [False False False ...  True False False]
 [False False False ... False False False]
 [False False False ... False False False]]
[[False False False ... False  True False]
 [False False  True ... False False  True]
 [False False  True ... False False False]
 ...
 [ True False False ...  True False False]
 [False False False ... False F

In [11]:
# Print the actual missing-value rate (in percent) of each created dataset.
for p in probabilities:
    df_sim_nas = pd.read_csv(
        Path(PROJECT_PATH) / "user_data" / "data" / f"ba_39_small_{p}.csv",
        sep="\t",
    )
    # Only the per-sample columns carry the "Intensity " prefix (with the
    # trailing space); the all-NaN filler columns "Intensity", "iBAQ" and
    # "iBAQ peptides" do not match and are therefore excluded, without
    # hard-coding the number of rows or columns.
    sample_columns = [
        column for column in df_sim_nas.columns if column.startswith("Intensity ")
    ]
    n_missing = df_sim_nas[sample_columns].isna().sum().sum()
    n_cells = len(df_sim_nas) * len(sample_columns)
    print(n_missing / n_cells * 100)
    

0.0
4.968814968814969
10.03518311210619
14.972013433551895
19.940828402366865
24.951223412761873
29.97920997920998
40.02558771789541
50.007996161842314
