In [1]:
import pandas as pd
import numpy as np

In [2]:
# Name of the dataset directory; every input/output path below is derived from it.
dataset = "Alveolar_cells_Type_II_Merged_Batches"

# Per-gene results table read from <dataset>/mathematica/mathematica_results.csv.
results_csv = "{}/mathematica/mathematica_results.csv".format(dataset)
df = pd.read_csv(filepath_or_buffer=results_csv)

In [3]:
# Replace the non-numeric placeholders in `thresholds` with NaN so the column
# can be cast to float afterwards. Two placeholders are handled: values that
# start with two double quotes, and the literal string "NotFound".
#
# A single `.loc` assignment is used instead of chained indexing
# (`df.thresholds[mask] = ...`), which triggers SettingWithCopyWarning and does
# not write through to `df` under pandas Copy-on-Write. `na=False` keeps the
# mask strictly boolean when the column already contains missing values.
is_empty_quoted = df["thresholds"].str.startswith('""', na=False)
is_not_found = df["thresholds"] == "NotFound"
df.loc[is_empty_quoted | is_not_found, "thresholds"] = np.nan

In [4]:
# With the placeholders replaced by NaN, convert the column to a float dtype.
df["thresholds"] = df["thresholds"].astype("float")

In [5]:
# Display the results table after the threshold cleanup (pandas truncates the middle rows).
df

Unnamed: 0.1,Unnamed: 0,distribution,antimode,thresholds
0,0610005C13Rik,PoissonDistribution[0.0012091898428053204],NotMixture,
1,0610007N19Rik,WaringYuleDistribution[9.023527549978395],NotMixture,
2,0610007P14Rik,WaringYuleDistribution[7.155166729336329],NotMixture,
3,0610008F07Rik,WaringYuleDistribution[414.50480192076844],NotMixture,
4,0610009B14Rik,GeometricDistribution[0.9892344497607655],NotMixture,
...,...,...,...,...
22439,n-R5s33,PoissonDistribution[0.0012091898428053204],NotMixture,
22440,n-R5s62,WaringYuleDistribution[414.50480192076844],NotMixture,
22441,n-R5s76,PoissonDistribution[0.015719467956469165],NotMixture,
22442,n-R5s89,PoissonDistribution[0.0012091898428053204],NotMixture,


In [6]:
# Rows whose `antimode` text does not contain "Mixture". In the displayed data
# non-mixture fits are labelled "NotMixture", so this keeps the mixture fits.
lacks_mixture_label = ~df["antimode"].str.contains("Mixture")
df.loc[lacks_mixture_label]

Unnamed: 0.1,Unnamed: 0,distribution,antimode,thresholds
1421,AC167036.1,"MixtureDistribution[{0.5401414786859126, 0.459...",ContainsLogSeries,
1685,Actb,"MixtureDistribution[{0.7862016944879495, 0.213...","{{{x -> 4.228966969927115}, ""Mode""}}",
2327,Areg,"MixtureDistribution[{0.9079494816088617, 0.092...",{},
2572,Atf3,"MixtureDistribution[{0.9306350111748624, 0.069...","{{{x -> 2.377795257297313}, ""Antimode""}, {{x -...",2.377795
2624,Atp1b1,"MixtureDistribution[{0.6880458791436712, 0.311...","{{{x -> 3.67322947511148}, ""Mode""}}",
...,...,...,...,...
22386,mt-Co1,"MixtureDistribution[{0.866332348989702, 0.1336...","{{{x -> 7.371478372337116}, ""Mode""}}",
22387,mt-Co2,"MixtureDistribution[{0.7071574673807035, 0.292...","{{{x -> 1.4545536850874003}, ""Mode""}}",
22388,mt-Co3,"MixtureDistribution[{0.8886651607909138, 0.111...","{{{x -> 3.000909731311978}, ""Mode""}}",
22397,mt-Rnr1,"MixtureDistribution[{0.817240494667062, 0.1827...","{{{x -> 0.3399707826380639}, ""Mode""}}",


In [7]:
# Summary of how many genes fall into each outcome category.
# Each (label, pattern) pair counts rows whose `antimode` text contains the pattern.
# Note that "Mixture" also matches the "NotMixture" label seen in the data.
antimode_patterns = [
    ("Not Mixtures:", "Mixture"),
    ("Contains Laplace:", "Lapla"),
    ("Contains Zipf:", "Zipf"),
    ("Contains Uniform:", "Uniform"),
    ("Contains Benford:", "Benford"),
    ("Contains LogSeries:", "LogSeries"),
    ("Hand Removed:", "HandRemoved"),
]

print("Total genes considered:", len(df))
# Rows whose distribution field is the literal string "-1".
print("SystemException:", len(df[df["distribution"] == "-1"]))
for label, pattern in antimode_patterns:
    print(label, len(df[df["antimode"].str.contains(pattern)]))
# Non-missing thresholds, and how many of them lie below 0.5.
print("Antimode Found:", df["thresholds"].count())
print("Antimode Found < 0.5:", (df["thresholds"] < 0.5).sum())

Total genes considered: 22444
SystemException: 0
Not Mixtures: 22299
Contains Laplace: 0
Contains Zipf: 26
Contains Uniform: 1
Contains Benford: 0
Contains LogSeries: 15
Hand Removed: 0
Antimode Found: 3
Antimode Found < 0.5: 0


In [8]:
# Count genes per fitted distribution family: the family name is the text
# before the first "[" in the `distribution` string.
# The vectorised `.str[0]` accessor replaces `.apply(lambda x: x[0])`, which
# raises TypeError when a `distribution` value is missing. `n=1` stops
# splitting after the first "[" since only the prefix is needed.
df["distribution"].str.split("[", n=1).str[0].value_counts()

PoissonDistribution             8650
WaringYuleDistribution          7680
GeometricDistribution           5274
DataDistribution                 556
MixtureDistribution              145
NegativeBinomialDistribution     137
ExtremeValueDistribution           1
LogNormalDistribution              1
Name: distribution, dtype: int64

In [13]:
# Give the unnamed first CSV column a meaningful name.
df = df.rename(columns={"Unnamed: 0": "gene_id"})

In [14]:
# Look up every gene whose id starts with "Pcdh".
is_pcdh = df["gene_id"].str.startswith("Pcdh")
df.loc[is_pcdh]

Unnamed: 0,gene_id,distribution,antimode,thresholds


In [15]:
# Build the per-gene threshold table:
#   1. drop rows that have a missing value in any column,
#   2. drop the `distribution` and `antimode` columns,
#   3. keep the fitted value as `uncorrected_threshold`,
#   4. add `threshold`, the same value raised to at least 0.5.
# Resulting column order: gene_id, uncorrected_threshold, threshold.
_df = (
    df.dropna()
    .drop(columns=["distribution", "antimode"])
    .rename(columns={"thresholds": "uncorrected_threshold"})
    .assign(threshold=lambda frame: frame["uncorrected_threshold"].clip(lower=0.5))
)

In [16]:
# Save the threshold table next to the input results, without the row index.
_df.to_csv(
    path_or_buf="{}/mathematica/fitted_distributions_thresholds.csv".format(dataset),
    index=False,
)

In [17]:
# Read <dataset>.csv indexed by gene_id, then keep only the rows for genes that
# have a threshold, in the order they appear in `_df`.
expression_csv = "{}.csv".format(dataset)
all_genes_df = pd.read_csv(expression_csv, index_col="gene_id").loc[_df["gene_id"]]

In [18]:
# Dichotomise each selected gene's row in place: values below the gene's
# threshold become 0, values at or above it become 1. Missing values stay
# missing because both comparisons are False for them.
#
# Both masks are computed from the row before anything is written. Evaluating
# the ">=" mask after the zeros were written would turn every value into 1 for
# any threshold <= 0. Iterating by column name also removes the dependence on
# `_df` having exactly three columns in a fixed order, and avoids rebinding `_`.
for gene_id, threshold in zip(_df["gene_id"], _df["threshold"]):
    expression = all_genes_df.loc[gene_id]
    below = expression < threshold
    at_or_above = expression >= threshold
    all_genes_df.loc[gene_id, below] = 0
    all_genes_df.loc[gene_id, at_or_above] = 1

In [19]:
# Save the dichotomised matrix; the gene_id index is written as the first column.
all_genes_df.to_csv(
    path_or_buf="{}/mathematica/fitted_distributions_dichotomised.csv".format(dataset)
)

In [20]:
# Summary statistics of the thresholds before the 0.5 floor was applied.
_df["uncorrected_threshold"].describe()

count    10.000000
mean      0.402285
std       0.009734
min       0.387475
25%       0.397812
50%       0.402509
75%       0.406376
max       0.417335
Name: uncorrected_threshold, dtype: float64

In [21]:
# Display the per-gene threshold table (gene_id, uncorrected_threshold, threshold).
_df

Unnamed: 0,gene_id,uncorrected_threshold,threshold
1,Trbv13-2,0.403877,0.5
2,Trbv13-3,0.407209,0.5
4,Trbv15,0.416052,0.5
5,Trbv16,0.398348,0.5
7,Trbv19,0.397633,0.5
8,Trbv2,0.389903,0.5
9,Trbv20,0.402567,0.5
14,Trbv3,0.417335,0.5
16,Trbv31,0.387475,0.5
17,Trbv5,0.402451,0.5
