# Format data for datasette upload

In [None]:
import pandas as pd

In [None]:
from pyprojroot import here

In [None]:
# Read data/df_bioc.csv (its first CSV column is the stored index) and
# re-index the table by the 'miRNA' column.
df_bioc = (
    pd.read_csv(here() / "data/df_bioc.csv", index_col=0)
    .set_index('miRNA')
)

# Notebook display: list the available columns.
df_bioc.columns

In [None]:
# Read data/df_vari.csv (its first CSV column is the stored index) and
# re-index the table by the 'miRNA' column.
df_vari = (
    pd.read_csv(here() / "data/df_vari.csv", index_col=0)
    .set_index('miRNA')
)

In [None]:
from scipy.special import logit
import janitor

# Stack both tables, keep only rows with 0 < frac_avg < 1 (logit is not
# finite at 0 or 1), and store logit(frac_avg) in a new "logit" column.
combined_df = (
    pd.concat([df_bioc, df_vari])
    .query("frac_avg < 1")
    .query("frac_avg > 0")
    .transform_column("frac_avg", logit, "logit")
)

In [None]:
# Group the df_bioc column names by the marker substring they contain; one
# list per marker, in the same order as the names on the left-hand side.
seq_columns, entropy_columns, bp_columns, cut_columns = (
    [name for name in df_bioc.columns if marker in name]
    for marker in ("seq_", "shannon_", "bp_", "cut_")
)

## Combined dataset

In [None]:
import janitor

def remove_long_cols(df):
    """Return ``df`` without the seq/shannon/bp/cut column families.

    A column is dropped when its name contains any of the substrings
    ``"seq_"``, ``"shannon_"``, ``"bp_"`` or ``"cut_"``.  The column names
    are inspected on ``df`` itself rather than on the global ``df_bioc``, so
    marker columns that exist only in ``df`` are removed as well and the
    function does not depend on notebook state.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reduce.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns in their original order.
    """
    markers = ("seq_", "shannon_", "bp_", "cut_")
    # One boolean per column: True when the name contains none of the markers.
    # ``str(name)`` keeps the substring test valid for non-string labels.
    keep = [
        not any(marker in str(name) for marker in markers)
        for name in df.columns
    ]
    return df.loc[:, keep]

# Reduce the combined table with remove_long_cols and save the result as
# data/combined.csv.
smaller_df = remove_long_cols(combined_df)
smaller_df.to_csv(here() / "data/combined.csv")

## Position-based data

In [None]:
# Export the `entropy_columns` subset of the combined table to
# data/entropy.csv.
entropy = combined_df.select_columns(entropy_columns)
entropy.to_csv(here() / "data/entropy.csv")

In [None]:
# Export the `seq_columns` subset of the combined table to data/onehot.csv.
onehot = combined_df.select_columns(seq_columns)
onehot.to_csv(here() / "data/onehot.csv")

In [None]:
# Export the `bp_columns` subset of the combined table to data/bp.csv.
bp = combined_df.select_columns(bp_columns)
bp.to_csv(here() / "data/bp.csv")

In [None]:
# Export the `cut_columns` subset of the combined table to data/cut.csv.
cut = combined_df.select_columns(cut_columns)
cut.to_csv(here() / "data/cut.csv")

In [None]:
import seaborn as sns
# Pairwise scatter-plot grid of selected summary columns, restricted to rows
# whose `replicate` value is 'biochem'.
sns.pairplot(
    smaller_df.query("replicate == 'biochem'").select_columns(
        [
            "logit",
            "length",
            "GC",
            "AU",
            "compact",
            "hairpin_score",
            "minimum_free_energy",
            "ensemble_free_energy",
        ]
    )
)

In [None]:
# Notebook display: list the columns that remain in the reduced table.
smaller_df.columns