In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Default size (width, height in inches) for every figure created below.
plt.rcParams.update({"figure.figsize": (15, 5)})

In [None]:
def read_data(file_name):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_name
        Passed unchanged to ``pandas.read_csv`` (a path or an open
        file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``read_csv``'s default options.
    """
    frame = pd.read_csv(file_name)
    return frame

In [None]:
# Load the metadata table from the CSV next to this notebook and preview it.
metadata_csv = "metadata.csv"
data = read_data(metadata_csv)

data.head()

In [None]:
# Show the last rows as well, to check the end of the file parsed cleanly.
data.tail(n=5)

In [None]:
# Keep only the rows whose data_type is 'public'.
is_public = data["data_type"] == 'public'
public_hpa_df = data.loc[is_public]

# Number of public rows with a non-null filename.
print(public_hpa_df["filename"].count())
public_hpa_df.head()

In [None]:
# Reduce the public rows to one row per patient: select the demographic
# columns, then keep the first row seen for each patient_id.
patient_columns = ['patient_id', 'sex', 'age', 'tissue_name']
df_all_per_patient_stats = (
    public_hpa_df[patient_columns]
    .drop_duplicates(subset='patient_id', keep='first')
)
df_all_per_patient_stats.head()

In [None]:
# Non-null value count for each column of the per-patient table.
df_all_per_patient_stats.notna().sum()

In [None]:
# Patients per tissue, plus the grand total across all tissues.
tissue_column = df_all_per_patient_stats['tissue_name']
df_all_count = tissue_column.value_counts()
total_patients = df_all_count.sum()
print('Total', total_patients)
df_all_count

In [None]:
# Order rows by tissue name.
df_all_per_patient_stats = df_all_per_patient_stats.sort_values(by="tissue_name")

In [None]:
# Human-readable labels for the raw tissue identifiers in `tissue_name`.
TISSUE_DISPLAY_NAMES = {
    "kidney": "Kidney",
    "largeintestine": "Large Intestine",
    "spleen": "Spleen",
    "lung": "Lung",
    "prostate": "Prostate",
}

# Apply the literal (non-regex) substring replacements with the vectorised
# string accessor. Unlike a row-wise `apply(lambda x: x.replace(...))`, this
# leaves missing values as NaN instead of raising AttributeError on them.
tissue_labels = df_all_per_patient_stats['tissue_name']
for raw_name, display_name in TISSUE_DISPLAY_NAMES.items():
    tissue_labels = tissue_labels.str.replace(raw_name, display_name, regex=False)

# `assign` builds a new frame rather than writing into the existing one.
df_all_per_patient_stats = df_all_per_patient_stats.assign(tissue_name=tissue_labels)

In [None]:
# sns.set(style="darkgrid")
# g = sns.scatterplot(x="tissue_name", y="age", data=df_all_per_patient_stats, style='sex', hue='tissue_name', s=80, palette="colorblind")
# g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)


In [None]:
# Donor age distribution per organ, coloured by sex.
#
# Theme and font sizes are global matplotlib settings, so configure them
# BEFORE the figure and axes are created; otherwise they only show up when the
# cell is run a second time.
sns.set(style="whitegrid")
sns.set_context("paper", rc={"font.size":25,"axes.titlesize":25,"axes.labelsize":25})

# Create the figure up front with tight_layout enabled. Calling plt.figure()
# after plotting would open a second, empty figure and leave the real plot
# without tight layout or despining.
fig, ax = plt.subplots(tight_layout=True)

# `size` is swarmplot's documented marker-size parameter.
# NOTE(review): palette colours are assigned in hue-level order; pass
# `hue_order` if a fixed sex -> colour mapping is required.
g = sns.swarmplot(
    x='age',
    y='tissue_name',
    data=df_all_per_patient_stats,
    hue='sex',
    size=7,
    palette=['red', 'blue'],
    alpha=0.8,
    ax=ax,
)

# Place the legend outside the axes, to the right of the plot.
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)
g.set(xlabel='Age', ylabel='Organ', xlim=(0, 90))

# Remove the top/right spines of this plot's axes explicitly.
sns.despine(ax=g)


In [None]:
# Save the donor distribution plot as PNG and SVG.
# Create the output directory first so savefig does not fail with
# FileNotFoundError on a fresh checkout.
plots_dir = Path('plots')
plots_dir.mkdir(parents=True, exist_ok=True)

fig = g.get_figure()
for extension in ('png', 'svg'):
    # bbox_inches="tight" keeps the legend, which sits outside the axes.
    fig.savefig(plots_dir / f'donor_distribution_plot.{extension}', bbox_inches="tight")

In [None]:
# jittered_x = df_all_per_patient_stats['age'] + 2 * np.random.rand(len(df_all_per_patient_stats['age'])) -0.05
# sns.set(style="darkgrid")
# g = sns.scatterplot(x="tissue_name", y=jittered_x, data=df_all_per_patient_stats, style='sex', hue='tissue_name', s=80, palette="colorblind")
# g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)