# New Big Dataframe

In [None]:
import pandas as pd
import os

In [None]:

# Load the simulation results: tab-separated text, column names taken from the first row.
dfs = pd.read_table("../speos/Simulations_GEARS_UC.tsv", header=0)

In [None]:
# Preview the first five rows of the loaded table.
dfs.head(n=5)

In [None]:
# Number of rows in the loaded table.
dfs.shape[0]

In [None]:
import ast
import json  # not used in this cell any more; left imported in case other cells rely on it

# Collect the distinct entries that occur in any "Combination" value.
# Each value is expected to be the text of a Python list literal such as "['A', 'B']"
# (TODO confirm against the TSV). ast.literal_eval parses that text directly; rewriting
# single quotes to double quotes and handing the result to json.loads would break as
# soon as an entry itself contains a quote character.
individual_genes = set()
for combination in dfs["Combination"]:
    individual_genes.update(ast.literal_eval(combination))

len(individual_genes)
    

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Box plot of "mag" per "Group"; groups run from the highest to the lowest median "mag".
fig, ax = plt.subplots(figsize=(8, 5))

myorder = dfs.groupby(by=["Group"])["mag"].median().sort_values().iloc[::-1].index

ax = sns.boxplot(data=dfs, x="Group", y="mag", order=myorder, fliersize=3, ax=ax)

# Spread the marker-drawn points (the outliers) horizontally so that overlapping points
# stay visible. Lines without a marker are left untouched. The generator is seeded so
# the figure comes out identical on every run.
jitter_rng = np.random.default_rng(0)
for line in ax.lines:
    if line.get_marker() != '':
        xs = np.asarray(line.get_xdata(), dtype=float)
        # Assign a fresh array instead of modifying the line's own data in place.
        line.set_xdata(xs + jitter_rng.uniform(-0.2, 0.2, len(xs)))

# Write the number of rows of each group next to its box (computed once, looked up per label).
group_sizes = dfs.value_counts(subset="Group")
for i, label in enumerate(ax.get_xticklabels()):
    ax.text(i, 0.25, "n={}".format(group_sizes[label.get_text()]), ha="center")

# Dotted guide lines at 1.0 and 1.15, drawn behind the boxes.
ax.hlines((1.0, 1.15), -0.5, 7, ls=":", color="lightgray", zorder=-1)

#ax.text(6.95, y=1.08, s="Epistasis", ha="right", va="center")
ax.text(6.95, y=1.2, s="Synergy", ha="right", va="bottom")
ax.text(6.95, y=0.95, s="Suppression", ha="right", va="top")
ax.set_ylim(0.2, 2.4)
ax.set_xlim(-0.5, 7)




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Density estimate of "mag" over all rows, with the threshold at 1 marked.
fig, ax = plt.subplots(figsize=(5, 3))
ax = sns.kdeplot(data=dfs, x="mag", ax=ax)

ax.set_xlim(left=0.35)
ax.set_xlabel("Magnitude")

# Dashed vertical marker at 1; its caption sits slightly to the left of the line.
ax.vlines(x=1, ymin=0, ymax=1.5, linestyles="--", color="gray")
ax.text(x=0.97, y=1.45, s="Suppression\nThreshold", ha="right", va="top")

plt.tight_layout()
#plt.savefig("allmag.pdf")

In [None]:

# Lower tail of the "mag" density, marking the cut-off values that belong to the
# top-100 and top-500 selections and the threshold at 1. Saved as "topmag.pdf".
fig, ax = plt.subplots(figsize=(5, 3))
ax = sns.kdeplot(data=dfs, x="mag", ax=ax)

# Sort the column once and reuse it for both cut-off lookups.
mag_ascending = dfs["mag"].sort_values().to_numpy()
# Positions 100 and 500 hold the 101st and 501st smallest values, i.e. the first
# value just outside each selection.
mag_cut_100 = mag_ascending[100]
mag_cut_500 = mag_ascending[500]

ax.vlines(x=mag_cut_100, ymin=0, ymax=0.15, color="orange")
ax.text(x=mag_cut_100 * 0.99, ha="right", y=0.15, va="top", s="Top 100\nSuppressions")
ax.vlines(x=mag_cut_500, ymin=0, ymax=0.25, color="green")
ax.text(x=mag_cut_500 * 0.99, ha="right", y=0.25, va="top", s="Top 500\nSuppressions")

# Dashed marker at 1 with its caption just left of the line.
ax.vlines(x=1, ymin=0, ymax=0.4, linestyles="--", color="gray")
ax.text(x=1 * 0.99, ha="right", y=0.4, va="top", s="Suppression Threshold")

ax.set_xlim((0.35, 1.2))
ax.set_ylim((0, 0.6))
ax.set_xlabel("Magnitude")
plt.tight_layout()
plt.savefig("topmag.pdf")

In [None]:
# Lower tail of the "corr_fit" density, marking the cut-off values that belong to the
# top-100 and top-500 selections and the fixed value 0.88. Saved as "topfit.pdf".
fig, ax = plt.subplots(figsize=(5, 3))
ax = sns.kdeplot(data=dfs, x="corr_fit", ax=ax)

# Sort the column once and reuse it for both cut-off lookups.
fit_ascending = dfs["corr_fit"].sort_values().to_numpy()
# Positions 100 and 500 hold the 101st and 501st smallest values, i.e. the first
# value just outside each selection.
fit_cut_100 = fit_ascending[100]
fit_cut_500 = fit_ascending[500]

ax.vlines(x=fit_cut_100, ymin=0, ymax=0.15, color="orange")
ax.text(x=fit_cut_100 * 0.99, ha="right", y=0.15, va="top", s="Top 100\nNeomorphisms")
ax.vlines(x=fit_cut_500, ymin=0, ymax=0.25, color="green")
ax.text(x=fit_cut_500 * 0.99, ha="right", y=0.25, va="top", s="Top 500\nNeomorphisms")

# Dashed marker at 0.88 (drawn without a caption in this figure).
ax.vlines(x=0.88, ymin=0, ymax=0.6, linestyles="--", color="gray")

ax.set_xlim((0.2, 0.9))
ax.set_ylim((0, 0.6))
ax.set_xlabel("Model Fit")
plt.tight_layout()
plt.savefig("topfit.pdf")

In [None]:
# Density estimate of "corr_fit" over all rows with the value 0.88 marked; saved as "allfit.pdf".
fig, ax = plt.subplots(figsize=(5, 3))
ax = sns.kdeplot(data=dfs, x="corr_fit", ax=ax)
ax.set_xlabel("Model Fit")

# Dashed vertical marker with its caption placed just left of the line.
ax.vlines(x=0.88, ymin=0, ymax=20, linestyles="--", color="gray")
ax.text(x=0.88 * 0.99, y=20, s="Neomorphism Threshold", ha="right", va="top")

ax.set_xlim(left=0.2)
plt.tight_layout()
plt.savefig("allfit.pdf")

In [None]:
# Group composition of the 100 rows with the smallest "mag".
dfs.sort_values(by="mag").head(100).value_counts("Group")

In [None]:
from scipy.stats import chisquare

# Chi-square goodness-of-fit test: are the groups among the 100 rows with the smallest
# "mag" represented in proportion to their share of the whole table?
# "HSPxHSP" is left out of both the observed and the expected side.

# Share of each remaining group over all rows.
expected = dfs.value_counts("Group").drop("HSPxHSP", errors="ignore")
expected = expected / expected.sum()

# Observed group counts among the selected rows. Reindexing on the expected groups gives
# a group that is absent from the selection an explicit count of 0, so observed and
# expected always line up entry by entry.
counts = (
    dfs.sort_values(by="mag")
    .iloc[:100, :]
    .value_counts("Group")
    .drop("HSPxHSP", errors="ignore")
    .reindex(expected.sort_index().index, fill_value=0)
)

# Expected counts are the overall shares scaled to the observed total.
chisquare(counts, expected.sort_index() * counts.sum())

In [None]:
# Long-format table with one row per (group, Observed/Expected) pair, drawn as grouped bars.
observed_values = counts.tolist()
expected_values = (expected.sort_index() * counts.sum()).tolist()

data = pd.DataFrame(
    {
        "Counts": observed_values + expected_values,
        "Category": counts.index.tolist() + expected.sort_index().index.tolist(),
        "Group": ["Observed"] * len(counts) + ["Expected"] * len(expected),
    }
)

# Bars run from the largest to the smallest observed count.
observed_rows = data[data["Group"] == "Observed"]
myorder = observed_rows.groupby(by=["Category"])["Counts"].max().sort_values().iloc[::-1].index

g = sns.catplot(data=data, x="Category", y="Counts", hue="Group", kind="bar", order=myorder)

In [None]:
import matplotlib.pyplot as plt

# Observed-minus-expected count per group, drawn as one bar per group.
# `counts` and `expected` are used as left behind by the chi-square cell run before this one.
data = pd.DataFrame({"Counts": counts - (expected.sort_index() * counts.sum())},
                    index=counts.index.tolist())
data = data.transpose()  # a single "Counts" row with one column per group

# Fixed left-to-right order of the bars.
myorder = ["CorexCore", "HSPxCore", "CorexPeri", "HSPxPeri", "PerixPeri"]

data = data[myorder]
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(x=range(len(data.columns)), height=data.loc["Counts", :], color="orange")
# The backslash is doubled so that "\D" is not parsed as an (invalid) string escape;
# the resulting label text is unchanged.
ax.set_ylabel("$\\Delta Counts$\n(Observed - Expected)")
ax.set_xlabel("Co-Perturbation Group")
ax.set_xticks(range(len(data.columns)))
ax.set_xticklabels(data.columns)
ax.hlines(xmin=-0.5, xmax=4.5, y=0, color="black")  # zero baseline across the plot
ax.set_xlim((-0.5, 4.5))
plt.tight_layout()
#plt.savefig("100mag.pdf")

In [None]:
from scipy.stats import chisquare

# Chi-square goodness-of-fit test: are the groups among the 500 rows with the smallest
# "mag" represented in proportion to their share of the whole table?
# "HSPxHSP" is left out of both the observed and the expected side.

# Share of each remaining group over all rows.
expected = dfs.value_counts("Group").drop("HSPxHSP", errors="ignore")
expected = expected / expected.sum()

# Observed group counts among the selected rows. Reindexing on the expected groups gives
# a group that is absent from the selection an explicit count of 0, so observed and
# expected always line up entry by entry.
counts = (
    dfs.sort_values(by="mag")
    .iloc[:500, :]
    .value_counts("Group")
    .drop("HSPxHSP", errors="ignore")
    .reindex(expected.sort_index().index, fill_value=0)
)

# Expected counts are the overall shares scaled to the observed total.
chisquare(counts, expected.sort_index() * counts.sum())

In [None]:
# Display the observed per-group counts currently held in `counts`.
counts

In [None]:
# Expected count per group: each group's share scaled to the total of the observed counts.
expected.sort_index().mul(counts.sum())

In [None]:
import matplotlib.pyplot as plt

# Observed-minus-expected count per group, drawn as one bar per group and saved as "500mag.pdf".
# `counts` and `expected` are used as left behind by the chi-square cell run before this one.
data = pd.DataFrame({"Counts": counts - (expected.sort_index() * counts.sum())},
                    index=counts.index.tolist())
data = data.transpose()  # a single "Counts" row with one column per group

# Fixed left-to-right order of the bars.
myorder = ["CorexCore", "HSPxCore", "CorexPeri", "HSPxPeri", "PerixPeri"]

data = data[myorder]
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(x=range(len(data.columns)), height=data.loc["Counts", :], color="green")
# The backslash is doubled so that "\D" is not parsed as an (invalid) string escape;
# the resulting label text is unchanged.
ax.set_ylabel("$\\Delta Counts$\n(Observed - Expected)")
ax.set_xlabel("Co-Perturbation Group")
ax.set_xticks(range(len(data.columns)))
ax.set_xticklabels(data.columns)
ax.hlines(xmin=-0.5, xmax=4.5, y=0, color="black")  # zero baseline across the plot
ax.set_xlim((-0.5, 4.5))
plt.tight_layout()
plt.savefig("500mag.pdf")

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Box plot of "corr_fit" per "Group".
fig, ax = plt.subplots(figsize=(8, 5))

# Groups are ordered by their median "mag" (highest first), not by the plotted column.
# NOTE(review): presumably this keeps the group order identical to the "mag" box plot — confirm.
myorder = dfs.groupby(by=["Group"])["mag"].median().sort_values().iloc[::-1].index

ax = sns.boxplot(data=dfs, x="Group", y="corr_fit", order=myorder, fliersize=3, ax=ax)

# Spread the marker-drawn points (the outliers) horizontally so that overlapping points
# stay visible. Lines without a marker are left untouched. The generator is seeded so
# the figure comes out identical on every run.
jitter_rng = np.random.default_rng(0)
for line in ax.lines:
    if line.get_marker() != '':
        xs = np.asarray(line.get_xdata(), dtype=float)
        # Assign a fresh array instead of modifying the line's own data in place.
        line.set_xdata(xs + jitter_rng.uniform(-0.2, 0.2, len(xs)))

# Write the number of rows of each group next to its box (computed once, looked up per label).
group_sizes = dfs.value_counts(subset="Group")
for i, label in enumerate(ax.get_xticklabels()):
    ax.text(i, 0.13, "n={}".format(group_sizes[label.get_text()]), ha="center")

# Dotted guide line at 0.88, drawn behind the boxes.
ax.hlines(0.88, -0.5, 7, ls=":", color="lightgray", zorder=-1)

ax.text(6.95, y=0.85, s="Neomorphism", ha="right", va="center")
#ax.text(6.95, y=1.2, s="Synergy",  ha="right", va="bottom")
#ax.text(6.95, y=0.95, s="Suppression", ha="right", va="top")
ax.set_ylim(0.1, 1)
ax.set_xlim(-0.5, 7)


In [None]:
# Per group: rows with "corr_fit" below 0.88 divided by all rows of that group, ascending.
(
    dfs.loc[dfs["corr_fit"] < 0.88]
    .value_counts("Group")
    .div(dfs.value_counts("Group"))
    .sort_values()
)

In [None]:
# Group composition of the 100 rows with the smallest "corr_fit".
dfs.sort_values(by="corr_fit").head(100).value_counts("Group")

In [None]:
from scipy.stats import chisquare

# Chi-square goodness-of-fit test: are the groups among the 100 rows with the smallest
# "corr_fit" represented in proportion to their share of the whole table?
# "HSPxHSP" is left out of both the observed and the expected side.
#
# The selection size is 100 here: the bar chart built from these counts is saved as
# "100fit.pdf", and the 500-row variant of this test lives in its own cell.

# Share of each remaining group over all rows.
expected = dfs.value_counts("Group").drop("HSPxHSP", errors="ignore")
expected = expected / expected.sum()

# Observed group counts among the selected rows. Reindexing on the expected groups gives
# a group that is absent from the selection an explicit count of 0, so observed and
# expected always line up entry by entry.
counts = (
    dfs.sort_values(by="corr_fit")
    .iloc[:100, :]
    .value_counts("Group")
    .drop("HSPxHSP", errors="ignore")
    .reindex(expected.sort_index().index, fill_value=0)
)

# Expected counts are the overall shares scaled to the observed total.
chisquare(counts, expected.sort_index() * counts.sum())

In [None]:
# Display the observed per-group counts currently held in `counts`.
counts

In [None]:
# Expected count per group: each group's share scaled to the total of the observed counts.
expected.sort_index().mul(counts.sum())

In [None]:
# Long-format table with one row per (group, Observed/Expected) pair, drawn as grouped bars.
observed_values = counts.tolist()
expected_values = (expected.sort_index() * counts.sum()).tolist()

data = pd.DataFrame(
    {
        "Counts": observed_values + expected_values,
        "Category": counts.index.tolist() + expected.sort_index().index.tolist(),
        "Group": ["Observed"] * len(counts) + ["Expected"] * len(expected),
    }
)

# Bars run from the largest to the smallest observed count.
observed_rows = data[data["Group"] == "Observed"]
myorder = observed_rows.groupby(by=["Category"])["Counts"].max().sort_values().iloc[::-1].index

g = sns.catplot(data=data, x="Category", y="Counts", hue="Group", kind="bar", order=myorder)

In [None]:
import matplotlib.pyplot as plt

# Observed-minus-expected count per group, drawn as one bar per group and saved as "100fit.pdf".
# `counts` and `expected` are used as left behind by the chi-square cell run before this one.
data = pd.DataFrame({"Counts": counts - (expected.sort_index() * counts.sum())},
                    index=counts.index.tolist())
data = data.transpose()  # a single "Counts" row with one column per group

# Fixed left-to-right order of the bars.
myorder = ["CorexCore", "HSPxCore", "CorexPeri", "HSPxPeri", "PerixPeri"]

data = data[myorder]
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(x=range(len(data.columns)), height=data.loc["Counts", :], color="orange")
# The backslash is doubled so that "\D" is not parsed as an (invalid) string escape;
# the resulting label text is unchanged.
ax.set_ylabel("$\\Delta Counts$ \n (Observed - Expected)")
ax.set_xlabel("Co-Perturbation Group")
ax.set_xticks(range(len(data.columns)))
ax.set_xticklabels(data.columns)
ax.hlines(xmin=-0.5, xmax=4.5, y=0, color="black")  # zero baseline across the plot
ax.set_xlim((-0.5, 4.5))
plt.tight_layout()
plt.savefig("100fit.pdf")

In [None]:
# The 100 rows with the smallest "corr_fit", in ascending order.
dfs.sort_values(by="corr_fit").head(100)

In [None]:
from scipy.stats import chisquare

# Chi-square goodness-of-fit test: are the groups among the 500 rows with the smallest
# "corr_fit" represented in proportion to their share of the whole table?
# "HSPxHSP" is left out of both the observed and the expected side.

# Share of each remaining group over all rows.
expected = dfs.value_counts("Group").drop("HSPxHSP", errors="ignore")
expected = expected / expected.sum()

# Observed group counts among the selected rows. Reindexing on the expected groups gives
# a group that is absent from the selection an explicit count of 0, so observed and
# expected always line up entry by entry.
counts = (
    dfs.sort_values(by="corr_fit")
    .iloc[:500, :]
    .value_counts("Group")
    .drop("HSPxHSP", errors="ignore")
    .reindex(expected.sort_index().index, fill_value=0)
)

# Expected counts are the overall shares scaled to the observed total.
chisquare(counts, expected.sort_index() * counts.sum())

In [None]:
# Long-format table with one row per (group, Observed/Expected) pair, drawn as grouped bars.
observed_values = counts.tolist()
expected_values = (expected.sort_index() * counts.sum()).tolist()

data = pd.DataFrame(
    {
        "Counts": observed_values + expected_values,
        "Category": counts.index.tolist() + expected.sort_index().index.tolist(),
        "Group": ["Observed"] * len(counts) + ["Expected"] * len(expected),
    }
)

# Bars run from the largest to the smallest observed count.
observed_rows = data[data["Group"] == "Observed"]
myorder = observed_rows.groupby(by=["Category"])["Counts"].max().sort_values().iloc[::-1].index

g = sns.catplot(data=data, x="Category", y="Counts", hue="Group", kind="bar", order=myorder)

In [None]:
import matplotlib.pyplot as plt

# Observed-minus-expected count per group, drawn as one bar per group and saved as "500fit.pdf".
# `counts` and `expected` are used as left behind by the chi-square cell run before this one.
data = pd.DataFrame({"Counts": counts - (expected.sort_index() * counts.sum())},
                    index=counts.index.tolist())
data = data.transpose()  # a single "Counts" row with one column per group

# Fixed left-to-right order of the bars.
myorder = ["CorexCore", "HSPxCore", "CorexPeri", "HSPxPeri", "PerixPeri"]

data = data[myorder]
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(x=range(len(data.columns)), height=data.loc["Counts", :], color="green")
# The backslash is doubled so that "\D" is not parsed as an (invalid) string escape;
# the resulting label text is unchanged.
ax.set_ylabel("$\\Delta Counts$ \n(Observed - Expected)")
ax.set_xlabel("Co-Perturbation Group")
ax.set_xticks(range(len(data.columns)))
ax.set_xticklabels(data.columns)
ax.hlines(xmin=-0.5, xmax=4.5, y=0, color="black")  # zero baseline across the plot
ax.set_xlim((-0.5, 4.5))
plt.tight_layout()
plt.savefig("500fit.pdf")

In [None]:
# Of all rows with "corr_fit" below 0.88: percentage that falls into each group, by group name.
low_fit = dfs.loc[dfs["corr_fit"] < 0.88]
low_fit.value_counts("Group").div(len(low_fit)).sort_index().mul(100)

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Box plot of "eq_contr" per "Group".
fig, ax = plt.subplots(figsize=(8, 5))

# Groups are ordered by their median "mag" (highest first), not by the plotted column.
# NOTE(review): presumably this keeps the group order identical to the "mag" box plot — confirm.
myorder = dfs.groupby(by=["Group"])["mag"].median().sort_values().iloc[::-1].index

ax = sns.boxplot(data=dfs, x="Group", y="eq_contr", order=myorder, fliersize=3, ax=ax)

# Spread the marker-drawn points (the outliers) horizontally so that overlapping points
# stay visible. Lines without a marker are left untouched. The generator is seeded so
# the figure comes out identical on every run.
jitter_rng = np.random.default_rng(0)
for line in ax.lines:
    if line.get_marker() != '':
        xs = np.asarray(line.get_xdata(), dtype=float)
        # Assign a fresh array instead of modifying the line's own data in place.
        line.set_xdata(xs + jitter_rng.uniform(-0.2, 0.2, len(xs)))

# Write the number of rows of each group next to its box (computed once, looked up per label).
group_sizes = dfs.value_counts(subset="Group")
for i, label in enumerate(ax.get_xticklabels()):
    ax.text(i, 0.13, "n={}".format(group_sizes[label.get_text()]), ha="center")
#ax.hlines(0.88 , -0.5, 7, ls=":", color="lightgray", zorder = -1)

#ax.text(6.95, y=0.85, s="Neomorphism", ha="right", va="center")
#ax.text(6.95, y=1.2, s="Synergy",  ha="right", va="bottom")
#ax.text(6.95, y=0.95, s="Suppression", ha="right", va="top")
ax.set_ylim(0.1, 1.1)
ax.set_xlim(-0.5, 7)