In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import spearmanr

In [None]:
# Seaborn plotting context; a later cell switches it again via sns.set().
sns.set_context("paper")
# Directory prefix that every input path in this notebook is built from.
dataset = "Dopaminergic_TPM_clean"

In [None]:
# Set-up matplotlib: sans-serif family with Helvetica as the preferred face
from matplotlib import rcParams
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Helvetica'],
})

# All genes (no fitted distributions)
## Thresholds

In [None]:
# Optimal thresholds produced by the three methods.
# Each table is indexed by gene_id so the combined frame pairs values
# gene-by-gene instead of by row position in the CSV files (the antimode
# section below reads the same files the same way).
thresholds_path = "{}/intermediate/{}/optimal_thresholds.csv"
vrs = pd.read_csv(thresholds_path.format(dataset, "vrs"), index_col="gene_id")
three_max = pd.read_csv(thresholds_path.format(dataset, "3max"), index_col="gene_id")
geomean = pd.read_csv(thresholds_path.format(dataset, "geomean"), index_col="gene_id")
# One row per method, transposed so that each method becomes a column.
thresholds = pd.DataFrame(
    [vrs.threshold, three_max.threshold, geomean.threshold],
    index=["VRS", "three_max", "geo_mean"],
).T

In [None]:
# Rows of three_max whose uncorrected threshold is negative.
negative_uncorrected = three_max["uncorrected_threshold"] < 0
three_max.loc[negative_uncorrected, "uncorrected_threshold"]

In [None]:
# Pairwise correlation of the threshold columns under each coefficient.
for label, method in (("Pearson", "pearson"), ("Kendall", "kendall"), ("Spearman", "spearman")):
    display(label)
    display(thresholds.corr(method=method))

In [None]:
# Scatter matrix of the thresholds after a log2 transform.
log2_thresholds = np.log2(thresholds)
sns.pairplot(data=log2_thresholds, height=3, aspect=1.0)
plt.show()

## Expression frequency

In [None]:
# Expression frequency per gene: the row-wise mean of each method's
# dichotomised_genes.csv table, indexed by gene_id.
dichotomised_path = "{}/intermediate/{}/dichotomised_genes.csv"
vrs = pd.read_csv(dichotomised_path.format(dataset, "vrs"), index_col="gene_id").mean(axis=1)
three_max = pd.read_csv(dichotomised_path.format(dataset, "3max"), index_col="gene_id").mean(axis=1)
geo_mean = pd.read_csv(dichotomised_path.format(dataset, "geomean"), index_col="gene_id").mean(axis=1)
# One row per method, transposed so that each method becomes a column.
frequencies = pd.DataFrame(
    [vrs, three_max, geo_mean],
    index=["VRS", "three_max", "geo_mean"],
).transpose()

In [None]:
# Pairwise correlation of the expression-frequency columns under each coefficient.
for label, method in (("Pearson", "pearson"), ("Kendall", "kendall"), ("Spearman", "spearman")):
    display(label)
    display(frequencies.corr(method=method))

In [None]:
# Scatter matrix of the expression frequencies (untransformed).
sns.pairplot(data=frequencies, height=3, aspect=1.0)
plt.show()

# Genes with identified antimode
## Thresholds

In [None]:
# Thresholds from the fitted distributions, plus the three methods' thresholds
# restricted to the gene_ids present in the fitted-distribution table.
dist = pd.read_csv("{}/mathematica/fitted_distributions_thresholds.csv".format(dataset), index_col="gene_id")
fitted_genes = dist.index
thresholds_path = "{}/intermediate/{}/optimal_thresholds.csv"
vrs = pd.read_csv(thresholds_path.format(dataset, "vrs"), index_col="gene_id").loc[fitted_genes]
three_max = pd.read_csv(thresholds_path.format(dataset, "3max"), index_col="gene_id").loc[fitted_genes]
geomean = pd.read_csv(thresholds_path.format(dataset, "geomean"), index_col="gene_id").loc[fitted_genes]
# One row per method, transposed so that each method becomes a column.
thresholds = pd.DataFrame(
    [vrs.threshold, three_max.threshold, geomean.threshold, dist.threshold],
    index=["VRS", "three_max", "geo_mean", "fitted_distribution"],
).transpose()

In [None]:
# Preview only the first rows instead of dumping the whole table.
three_max.head()

In [None]:
# Pairwise correlation of the four threshold columns under each coefficient.
for label, method in (("Pearson", "pearson"), ("Kendall", "kendall"), ("Spearman", "spearman")):
    display(label)
    display(thresholds.corr(method=method))

In [None]:
# Scatter matrix of the four threshold columns (no log transform in this cell).
sns.pairplot(data=thresholds, height=3, aspect=1.0)
plt.show()

## Expression frequency

In [None]:
# Expression frequency per gene (row-wise mean of each dichotomised table),
# restricted to the gene_ids present in the fitted-distribution table.
dist = pd.read_csv("{}/mathematica/fitted_distributions_dichotomised.csv".format(dataset), index_col="gene_id").mean(axis=1)
fitted_genes = dist.index
dichotomised_path = "{}/intermediate/{}/dichotomised_genes.csv"
vrs = pd.read_csv(dichotomised_path.format(dataset, "vrs"), index_col="gene_id").mean(axis=1).loc[fitted_genes]
three_max = pd.read_csv(dichotomised_path.format(dataset, "3max"), index_col="gene_id").mean(axis=1).loc[fitted_genes]
geomean = pd.read_csv(dichotomised_path.format(dataset, "geomean"), index_col="gene_id").mean(axis=1).loc[fitted_genes]
# Columns carry the display labels: three_max -> "FM", geomean -> "GTME",
# dist -> "Fit. Distr.".
frequencies = pd.DataFrame(
    [vrs, three_max, geomean, dist],
    index=["VRS", "FM", "GTME", "Fit. Distr."],
).transpose()

In [None]:
# (rows, columns) of the combined frame: one row per gene_id kept above,
# one column per method.
frequencies.shape

In [None]:
# Annotated heatmap of the Spearman correlation between the methods'
# expression frequencies.
# Note: sns.set() changes the global seaborn theme and font for later cells too.
sns.set(context="notebook", style="ticks", font="Arial")
spearman_corr = frequencies.corr(method="spearman")
sns.heatmap(spearman_corr, annot=True, cbar=False, cmap="Blues", square=True)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Scatter matrix of the expression frequencies with enlarged fonts and
# a marker size of 12.
sns.set(font_scale=2.2, style="ticks", font="Arial")
sns.pairplot(data=frequencies, plot_kws=dict(s=12))
plt.show()

In [None]:
# Check which font matplotlib is configured with and which file it resolves to.
import matplotlib
from matplotlib.font_manager import findfont, FontProperties

# Display the configured family explicitly: a bare expression in the middle of
# a cell is evaluated but its value is never shown.
display(matplotlib.rcParams['font.family'])
font = findfont(FontProperties(family=['sans-serif']))
font

In [None]:
# Font that matplotlib resolves when the "Arial" family is requested.
arial_properties = FontProperties(family="Arial")
findfont(arial_properties)

# Manual correlation calculation

In [None]:
# Preview only the first rows instead of dumping the whole table.
frequencies.head()

In [None]:
from scipy.stats import rankdata

# `frequencies` was built with the display labels "GTME" (from `geomean`) and
# "Fit. Distr." (from `dist`); it has no `geo_mean` or `fitted_distribution`
# columns, so attribute access on those names raises AttributeError.
geo_mean_ranks = rankdata(frequencies["GTME"])
fitted_distributions_ranks = rankdata(frequencies["Fit. Distr."])

In [None]:
# Pearson correlation of the two rank vectors computed in the previous cell.
from scipy.stats import pearsonr
rank_correlation = pearsonr(geo_mean_ranks, fitted_distributions_ranks)
rank_correlation

In [None]:
# Rank-difference formula: 1 - 6 * sum(d^2) / (n * (n^2 - 1)).
# NOTE(review): this closed form is exact only when no ranks are tied — confirm
# for this data before comparing it with the Pearson-on-ranks value.
n = len(geo_mean_ranks)
squared_rank_differences = (geo_mean_ranks - fitted_distributions_ranks) ** 2
1 - (squared_rank_differences.sum() * 6) / (n * (n ** 2 - 1))