In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Settings for reaching the data on the s3 storage: anonymous access through
# a custom endpoint URL.
storage_options = {
    'anon': True,
    'client_kwargs': {
        'endpoint_url': 'https://os.unil.cloud.switch.ch',
    },
}
# Common prefix under which the data files of this notebook are located.
s3_path = 's3://lts2-graphnex/BXDmice/'

## Genotype
The genotype file lists the differences between the genomes of the different mice. These differences are at the scale of a single nucleotide. In the data table, each row is an `SNP` ([single-nucleotide polymorphism](https://en.wikipedia.org/wiki/Single-nucleotide_polymorphism)). Each SNP can be inherited from one or the other of the two initial ancestors, which is encoded as a binary value, -1 or 1. The initial ancestors have a zero value.

In [None]:
# Genotype: read the tab-separated, compressed table from the remote storage
# into a DataFrame.
genotype_path = os.path.join(s3_path, 'genotype_BXD.txt.gz')
genotype = pd.read_csv(
    genotype_path,
    sep='\t',
    storage_options=storage_options,
)
print('File {} Opened.'.format(genotype_path))

In [None]:
# Preview the first five rows of the genotype table.
genotype.head(n=5)

In [None]:
# Gene position in the genome: read the map table that accompanies the
# genotype file from the same remote location.
geno_map_path = os.path.join(s3_path, 'map_BXD.txt.gz')
geno_map = pd.read_csv(
    geno_map_path,
    sep='\t',
    storage_options=storage_options,
)
print('File {} Opened.'.format(geno_map_path))

In [None]:
# Preview the first five rows of the genome map.
geno_map.head(n=5)

## Tissues
During or after experiments, the expression of proteins in different tissues of the mice has been measured.
The measurements have been recorded in a file per tissue. The data are in a large table with proteins as rows and mice as columns. The expression is a float number.

For each mouse, only a subset of the tissues has been measured. Therefore, not all mice are present in each tissue file, and different groups of mice are found in the different tissue files.

In [None]:
# Tissue: load the expression table of one tissue.
# `tissue_name` selects the file; uncomment one of the alternatives below to
# load another tissue (NOTE(review): availability of these files on the
# storage has not been verified here).
tissue_name = 'Muscle_CD'
#tissue_name = 'Lung'
#tissue_name = 'Hippocampus'
#tissue_name = 'Gastrointestinal'
# Build the URL with explicit '/' separators. os.path.join would insert the
# platform separator ('\\' on Windows) after 'expression data', which is not a
# valid S3 key. rstrip avoids a doubled slash when s3_path ends with '/'.
tissue_path = '/'.join([s3_path.rstrip('/'), 'expression data', tissue_name + '.txt.gz'])
tissue = pd.read_csv(tissue_path, sep='\t', storage_options=storage_options)
print('File {} Opened.'.format(tissue_path))

In [None]:
# Preview the first five rows of the selected tissue table.
tissue.head(n=5)

## Phenotype
The phenotype data corresponds to the results of different experiments. It is made of 2 files: one contains the results and the other contains the description of each experiment (experiment type, authors, ...).
In the result table, rows correspond to phenotypes and columns to mouse strains. The entries are float numbers. The table contains a large number of missing values as not all the mouse strains have been involved in all the experiments.

In [None]:
# Phenotype: table of experiment results.
phenotype_path = os.path.join(s3_path, 'Phenotype.txt.gz')
phenotype = pd.read_csv(
    phenotype_path,
    sep='\t',
    storage_options=storage_options,
)
print('File {} Opened.'.format(phenotype_path))

# Phenotype description: companion table describing the experiments.
phenotypeinfo_path = os.path.join(s3_path, 'phenotypes_id_aligner.txt.gz')
phenotypeinfo = pd.read_csv(
    phenotypeinfo_path,
    sep='\t',
    storage_options=storage_options,
)
print('File {} Opened.'.format(phenotypeinfo_path))

In [None]:
# Preview the first five rows of the phenotype results.
phenotype.head(n=5)

In [None]:
# Preview the first five rows of the phenotype descriptions.
phenotypeinfo.head(n=5)

In [None]:
# Show the description row(s) whose RecordID equals 12894.
phenotypeinfo.loc[phenotypeinfo['RecordID'] == 12894]

## Data cleaning

### Drop duplicate genes in the dataset
Some rows in the genotype DataFrame are identical; we drop them to reduce the number of features and the computational cost.

In [None]:
# Drop duplicate genes in the dataset.
# First join the genome map with the genotype values on the shared 'SNP' column.
geno_merge = geno_map.merge(genotype, on='SNP')
print('Size of the data before dropping duplicates', geno_merge.shape)
# Two SNPs are considered duplicates when
#   1) all their entries corresponding to BXD mice are identical, and
#   2) they are located on the same chromosome.
# NOTE(review): assumes the BXD mice columns start at position 5 of
# `genotype` -- confirm against the file layout.
col_to_search_duplicates = ['Chr', *genotype.columns[5:]]
geno_reduced = geno_merge.drop_duplicates(subset=col_to_search_duplicates)
print('Size of the data after dropping duplicates', geno_reduced.shape)

In [None]:
# Optional step: write the de-duplicated table to a compressed csv file in the
# current working directory so that other notebooks can load it.
geno_reduced.to_csv(path_or_buf='geno_reduced.csv.gz')