In [None]:
import pickle
import pandas as pd
import numpy as  np
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display

In [None]:
# Build a small cache holding only the FBXO11, CIITA and HLA-DRA rows of the
# GTEx v8 gene TPM table, so later cells do not have to parse the full file.
#
# The rows are selected by physical (0-based) line number in the .gct file:
#   2                   -> the column header line (the two lines before it are
#                          skipped, as in a plain read with skiprows=2)
#   5823, 18009, 41217  -> the three gene rows. These come from an earlier
#                          exploratory run in which
#                          np.where(df.index.isin(['FBXO11', 'CIITA', 'HLA-DRA']))
#                          returned the data-row positions 5820, 18006, 41214;
#                          adding the 3 preceding lines gives the file lines.
#
# The table is read once only. Loading the whole file first (as an earlier
# version of this cell did) is expensive and its result was never used.
gct_path = './data/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct'
keep_lines = {2, 5823, 18009, 41217}
expected_genes = {'FBXO11', 'CIITA', 'HLA-DRA'}

# A callable skiprows skips every line that is not explicitly kept, so the
# total number of lines in the file does not need to be hard-coded.
df = pd.read_csv(gct_path, sep='\t',
                 skiprows=lambda line_no: line_no not in keep_lines,
                 index_col=1)

# The line numbers are specific to this exact file; refuse to write a cache
# that contains anything other than the three expected genes.
if set(df.index) != expected_genes:
    raise ValueError(
        'Unexpected rows read from %s: %r (expected %r)'
        % (gct_path, sorted(map(str, df.index)), sorted(expected_genes))
    )

df.to_pickle('data/gtex/v8cache.pickle')

In [None]:
# Load the three-gene cache written with DataFrame.to_pickle.
# NOTE: pickle files can execute arbitrary code when loaded; only load a cache
# that was generated locally by this notebook.
df = pd.read_pickle('data/gtex/v8cache.pickle')

# Drop the 'Name' column and transpose, so that each row is one of the
# original sample columns and each column is one gene.
df = df.drop(columns=['Name']).T

# 'SMTS' column of the sample attribute table, indexed by the table's first column.
ll = pd.read_csv('./data/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', sep='\t',  index_col=0)['SMTS']

# Attach the SMTS label to every sample (left join: samples without an
# annotation are kept and get NaN).
mm = pd.merge(df, ll, how='left', left_index=True, right_index=True)

# Reshape to long format: one row per (sample, gene) with columns
# 'SMTS', 'value' and 'gene'. The sample id stays in the index.
target_cols = ['FBXO11', 'CIITA', 'HLA-DRA']
tmps = [
    mm[['SMTS', col]].rename(columns={col: 'value'}).assign(gene=col)
    for col in target_cols
]
vv = pd.concat(tmps)

# log2 transform. A TPM of 0 would become -inf (with a divide-by-zero
# RuntimeWarning) and corrupt density estimates and correlations downstream,
# so non-positive values are masked to NaN before taking the log.
vv['value'] = np.log2(vv['value'].where(vv['value'] > 0))

In [None]:

# Plot the three-gene set per tissue (SMTS): one violin per gene and tissue.
# The tissues are split into `split_count` groups, one stacked panel per
# group, so the x axis of each panel stays readable.
fig = plt.figure()
split_count = 4
for i, cols in enumerate(np.array_split(vv['SMTS'].unique(), split_count)):
    ax = fig.add_subplot(split_count, 1, i + 1)
    v = vv[vv['SMTS'].isin(cols)]
    # `jitter` is a stripplot option and not a violinplot parameter, so it is
    # not passed here (newer seaborn forwards unknown keywords to matplotlib,
    # where it raises).
    sns.violinplot(x='SMTS', y='value', data=v, hue='gene', dodge=True,
                   color='black', palette='Set3', ax=ax)
    ax.set_ylabel('log2(TPM)')
    ax.tick_params(axis='x', labelrotation=45)
    # Put the legend outside the axes, to the right of the panel.
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.7))

plt.subplots_adjust(hspace=0.6)
plt.show()
plt.close()

In [None]:

# Plot per tissue (SMTS), separately for each gene: one stacked panel per
# gene, one violin per tissue.
fig = plt.figure()
for i, col in enumerate(target_cols):
    ax = fig.add_subplot(len(target_cols), 1, i + 1)
    v = vv[vv['gene'] == col]
    ax.set_title(col)
    # `jitter` is a stripplot option and not a violinplot parameter, so it is
    # not passed here (newer seaborn forwards unknown keywords to matplotlib,
    # where it raises).
    sns.violinplot(x='SMTS', y='value', data=v, dodge=True,
                   color='black', palette='Set3', ax=ax)
    ax.set_ylabel('log2(TPM)')
    ax.tick_params(axis='x', labelrotation=45)

plt.subplots_adjust(hspace=0.6)
plt.show()
plt.close()

In [None]:
from itertools import combinations

# For each tissue (SMTS), print the Pearson correlation between every pair of
# the three genes, computed on the values stored in `vv`.
# NOTE(review): only the correlation coefficient is computed here; a p-value
# is not. Add e.g. scipy.stats.pearsonr if a p-value is required.

ii = vv.groupby('SMTS')

for (i, v) in ii:
    # `vv` is in long format (columns 'SMTS', 'value', 'gene'), so there are
    # no per-gene columns to index directly. Rebuild a samples x genes table
    # for this tissue; the Series are aligned on the sample index.
    wide = pd.DataFrame({
        gene: v.loc[v['gene'] == gene, 'value'] for gene in target_cols
    })
    print(i)
    print(wide.shape)
    for (a, b) in combinations(target_cols, 2):
        # Keep only samples where both values are finite: NaN or +/-inf
        # (e.g. from log2 of 0) would turn the coefficient into NaN.
        pair = wide[[a, b]].replace([np.inf, -np.inf], np.nan).dropna()
        if len(pair) < 2:
            # A correlation is undefined for fewer than two observations.
            print(a, b, 'n/a')
            continue
        r = np.corrcoef(pair[a], pair[b])[0][1]
        print(a, b, r)