### FULL DATASET

- Encoding of categorical features in binary format
- Basic plotting of data features, focusing on disease occurrence

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
## Set the correct working directory
%cd ~/hackathon/data

In [None]:
df = pd.read_csv('GTEx_pancreas_liver_images_liverfat_pancreasfat.csv')

In [None]:
df = df.drop(['Pathology.Notes_liver','Pathology.Notes_pancreas','Tissue.Sample.ID_liver','Tissue.Sample.ID_pancreas'],axis=1)

In [None]:
## Label encoding
from sklearn import preprocessing
for val in ['Sex','Age.Bracket']:
    le = preprocessing.LabelEncoder()
    le.fit(df[val])
    df[val]=le.transform(df[val])

In [None]:
## One hot encoding
# Get dummies of the given column
one_hot = pd.get_dummies(df['Hardy.Scale'])
# Drop the hold column conteining categorical values
df = df.drop('Hardy.Scale',axis = 1)
# Join the new one-hot encoded columns
df = df.join(one_hot)  

In [None]:
def encode(column, basename=''):
    """Multi-hot encode a column of comma-separated term strings.

    Every string entry is split on commas and each piece is stripped of
    surrounding whitespace. Each distinct term found this way becomes one
    output column; columns are in sorted term order and are named
    ``basename + term``. A row holds 1 for every term present in the
    corresponding entry and 0 elsewhere. Entries that are not strings
    (for instance NaN for a missing value) give an all-zero row.

    The input is only read: unlike a naive in-place split, the caller's
    sequence is left untouched.

    Parameters:
        column: iterable of entries, normally strings such as ``"a, b"``.
        basename: prefix prepended to every term to build the column names.

    Returns:
        pandas.DataFrame of 0/1 values with one row per entry of ``column``
        (default integer index) and one column per distinct term.
    """
    # Split once into a private structure; None marks a non-string entry
    parsed = [
        [term.strip() for term in entry.split(',')] if isinstance(entry, str) else None
        for entry in column
    ]

    # Mapping between terms and their position in the output vector
    term_set = sorted({term for terms in parsed if terms is not None for term in terms})
    mapper = {term: position for position, term in enumerate(term_set)}

    # Encode every entry as a presence/absence vector
    out_column = []
    for terms in parsed:
        encoded = [0] * len(term_set)
        if terms is not None:
            for term in terms:
                encoded[mapper[term]] = 1
        out_column.append(encoded)

    basenames = [basename + term for term in term_set]
    return pd.DataFrame(out_column, columns=basenames)

In [None]:
gtex_noseq = df
#    print(list(gtex_noseq.columns))
#    print(list(gtex_noseq['Pathology.Categories_liver']))
liver_disease = encode(list(gtex_noseq['Pathology.Categories_liver']), basename='liver_')
pancreas_disease = encode(list(gtex_noseq['Pathology.Categories_pancreas']), basename='pancreas_')

#df = pd.concat([df,pancreas_disease,liver_disease,],axis=1)
df = pd.concat([df,pancreas_disease,liver_disease,],axis=1)

In [None]:
targets = df[['Fat.Percentage_liver','Fat.Percentage_pancreas']]
df = df.drop(['Fat.Percentage_liver','Fat.Percentage_pancreas','Pathology.Categories_liver','Pathology.Categories_pancreas'],axis=1)
df = df.join(targets)

In [None]:
sex = df['Sex'].value_counts(dropna=False).rename(index={0:'female',1:'male'})
sex.plot(kind='pie',autopct='%.2f')
plt.show()

In [None]:
def disease_sex_analysis(sex,dfr):
    """Percentage of the subjects of one sex that show each disease.

    The disease indicator columns are taken positionally as
    ``dfr.columns[8:-2]`` (the first eight and the last two columns are
    skipped), so the column layout of ``dfr`` matters.

    Parameters:
        sex: value of the 'Sex' column that selects the group.
        dfr: DataFrame with a 'Sex' column and 0/1 indicator columns.

    Returns:
        One-row DataFrame (index 0) holding, for every indicator column
        with at least one case in the selected group, the number of cases
        divided by the group size and multiplied by 100. Columns without
        any case in the group are left out.

    Raises:
        KeyError: if ``sex`` does not occur in the 'Sex' column.
    """
    tot = dfr['Sex'].value_counts(dropna=False)[sex]
    # Row mask computed once instead of re-testing every cell with .loc
    in_group = dfr['Sex'] == sex

    disease_sex = dict()
    for name in dfr.columns[8:-2]:
        cases = int(((dfr[name] == 1) & in_group).sum())
        # Only diseases that actually occur in the group get a column
        if cases:
            disease_sex[name] = cases

    return pd.DataFrame(disease_sex,index=[0])/tot * 100

In [None]:
female_hill = disease_sex_analysis(0,df)
male_hill = disease_sex_analysis(1,df)

In [None]:
female_hill.T.plot(kind='bar',figsize=(16,9),title='Female',edgecolor='white', width=0.80, legend=False)
male_hill.T.plot(kind='bar',figsize=(16,9),title='Male',edgecolor='white', width=0.80, legend=False)
plt.show()

In [None]:
def disease_fat_analysis(cols,dfr,threshold=33):
    """Relate each disease indicator to a high value of a fat-percentage column.

    A subject counts as "fat" when ``dfr[cols] >= threshold``. The disease
    indicator columns are taken positionally as ``dfr.columns[8:-2]``.

    Only ``dfr`` is used: the function does not depend on any global
    DataFrame and works with any index (it need not be 0..n-1).

    Parameters:
        cols: name of the fat-percentage column to threshold.
        dfr: DataFrame holding ``cols`` and the 0/1 indicator columns.
        threshold: minimum value of ``cols`` for a subject to count as fat.

    Returns:
        DataFrame with one column per indicator and three rows:
        'hill'       - sum of the indicator column (number of cases),
        'fat_hill'   - number of cases that are also fat,
        'percentage' - fat_hill / hill as a fraction (not multiplied by
                       100); NaN when the indicator has no case.
    """
    # Row mask computed once; a NaN fat value compares False and is never "fat"
    is_fat = dfr[cols] >= threshold

    disease_fat = dict()
    for name in dfr.columns[8:-2]:
        hill = dfr[name].sum()
        fat = float(((dfr[name] == 1) & is_fat).sum())
        # Guard against indicators with no case at all
        res = fat/hill if hill else float('nan')
        disease_fat[name] = [hill,fat,res]
    disease_fat_df = pd.DataFrame(disease_fat)
    disease_fat_df = disease_fat_df.rename(index={0:'hill',1:'fat_hill',2:'percentage'})
    return disease_fat_df

In [None]:
liver_disease = disease_fat_analysis('Fat.Percentage_liver',df)
pancreas_disease = disease_fat_analysis('Fat.Percentage_pancreas',df)

In [None]:
## Plot the disease occurrency and the percentage of patient with fat organs
liver_disease.T.plot(kind='bar',figsize=(16,9),title='Liver Fat',edgecolor='white', width=0.80)
pancreas_disease.T.plot(kind='bar',figsize=(16,9),title='Pancreas Fat',edgecolor='white', width=0.80)
plt.show()

In [None]:
liver_disease

In [None]:
## Plot the occurrency of the disease in the dataset
disease_occurency = liver_disease.loc['hill',:]/577
disease_occurency.plot(kind='bar',figsize=(16,9),edgecolor='white', width=0.80)
plt.show()

In [None]:
type(disease_occurency)

In [None]:
## Remove the disease occurring less tha 1% of the time
non_significant_disease = []
for val in range(len(disease_occurency)):
    if disease_occurency[val] <= 0.01:
        non_significant_disease.append(str(disease_occurency.index[val]))

In [None]:
df = df.drop(non_significant_disease,axis=1)

In [None]:
df = df.set_index('Subject.ID')

In [None]:
## Save the dataframe
df.to_pickle('full_dataset.pkl') 

### RNA-SEQ DATASET

In [None]:
ids = open('subjID.txt').readlines()[1:]

In [None]:
for val in range(len(ids)):
    ids[val] = ids[val].rstrip()

In [None]:
df_rna_seq = df_rna_seq.reset_index(drop=True)

In [None]:
df_rna_seq = df.loc[df['Subject.ID'].isin(ids)]

In [None]:
sex = df_rna_seq['Sex'].value_counts(dropna=False).rename(index={0:'female',1:'male'})
sex.plot(kind='pie',autopct='%.2f')
plt.show()

In [None]:
## Sex hillness
female_hill_rna = disease_sex_analysis(0,df_rna_seq)
male_hill_rna = disease_sex_analysis(1,df_rna_seq)

In [None]:
female_hill.T.plot(kind='bar',figsize=(16,9),title='Female',edgecolor='white', width=0.80, legend=False)
male_hill.T.plot(kind='bar',figsize=(16,9),title='Male',edgecolor='white', width=0.80, legend=False)
plt.show()

In [None]:
liver_disease_seq = disease_fat_analysis('Fat.Percentage_liver',df)
pancreas_disease_seq = disease_fat_analysis('Fat.Percentage_pancreas',df)

In [None]:
## Plot the disease occurrency and the percentage of patient with fat organs
liver_disease_seq.T.plot(kind='bar',figsize=(16,9),title='Liver Fat',edgecolor='white', width=0.80)
pancreas_disease_seq.T.plot(kind='bar',figsize=(16,9),title='Pancreas Fat',edgecolor='white', width=0.80)
plt.show()

In [None]:
df_rna_seq = df_rna_seq.set_index('Subject.ID')

In [None]:
## Save the dataframe
df_rna_seq.to_pickle('rna_seq_dataset.pkl')