# Import Packages and Data

In [170]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [216]:
data = pd.read_excel("Mouse_DS_Proteins.xls")

# Data Pre-processing
## 1 Impute missing protein expression data with mean for that class

In [217]:
for i in data.columns[1:78]:
    data[i] = data.groupby("class").transform(lambda x: x.fillna(x.mean()))[i]
sum(data.isnull().any()) # check

0

## 2 Normalize data onto 0-1 scale

In [218]:
for i in data.columns[1:78]:
    data[i] = (data[i] - min(data[i])) / (max(data[i]) - min(data[i]))

## 3 Rearrange dataframe columns

In [219]:
if "MouseID" in data.columns:
    data[["Mouse", "Measurement"]] = data.loc[:,'MouseID'].str.split('_', expand = True)
    data.drop(["MouseID"], axis = 1, inplace = True)
    
if data.columns[0] != "Mouse":
    rearranged = data.columns[-3:].append(data.columns[:-3])  
    data = data.loc[:,rearranged]
    
data.rename(columns = {'class':'Class'}, inplace = True)

## 4 Find and Remove Outlier Mouse
According to the paper, a mouse from the t-CS-m class was removed from the dataset becasue it had missing values for the majority of the proteins among the 7 with missing values. Also, the protein expression values of this mouse were different form the other mice in its class. I tried to identify this mouse visually in order to remove it from the dataset. The mouse is number 3417.

In [222]:
data.loc[data.loc[:,"Class"] == "t-CS-m",].iloc[:,:79].groupby(by = "Mouse").sum()

Unnamed: 0_level_0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,pGSK3B_Tyr216_N,SHH_N,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N
Mouse,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3414,3.587627,4.185675,8.189412,6.04379,4.300769,4.674723,6.423922,1.974692,7.919193,3.575694,...,7.286319,3.255513,2.569978,4.258962,5.713672,3.711905,3.296366,1.50733,1.056494,2.249216
3416,2.38489,2.738108,7.21835,5.675715,3.685554,4.340778,6.085658,2.03718,6.612115,2.459166,...,7.794286,5.673827,4.821649,4.027217,6.690434,3.860099,4.40386,3.486512,3.340448,3.803083
3417,1.797175,2.003946,5.383435,1.935321,1.778066,4.445029,5.891176,2.22123,5.611694,2.208919,...,4.685497,6.124858,4.821649,8.439472,9.068719,5.233347,5.601754,2.618821,3.340448,3.803083
3429,3.419082,3.560649,5.625238,2.880311,2.654682,3.961694,5.909969,1.454198,5.143653,3.160818,...,5.180993,5.544304,4.844653,3.951813,6.548443,4.60892,2.253691,1.904739,4.805353,3.80776
3504,3.793105,4.06406,6.31321,4.500603,2.989978,3.444463,4.755165,1.823168,4.737206,3.336529,...,9.003131,4.687223,4.198056,2.970413,6.457471,3.938486,2.008409,2.892986,3.582731,3.633061
3505,3.863271,4.809588,9.93142,7.163805,6.022094,5.270115,7.049442,2.507314,8.702258,3.703803,...,8.451146,1.852691,3.461265,2.511576,6.210112,1.74564,5.196446,0.86059,1.912242,2.430948
3522,2.809536,3.563428,8.754629,6.450425,5.374671,5.209248,7.537799,11.615739,7.829805,3.180705,...,6.193994,3.486505,5.75322,4.393377,6.936037,3.567597,6.290245,4.030293,3.701285,5.424214
361,1.945684,2.483008,9.791468,7.359207,6.053957,5.493372,7.476841,5.842998,8.177956,2.295621,...,7.959342,5.200688,5.64408,3.618538,6.758837,3.27062,7.041167,2.018859,3.571908,3.780198
363,3.385922,4.183768,8.611855,6.149043,3.735661,5.828528,7.51401,9.739252,8.484807,3.031484,...,9.003363,4.900105,7.280287,4.159293,6.026332,3.470528,5.669976,4.249257,4.753125,5.296181


In [223]:
data = data.loc[data.loc[:,"Mouse"] != "3417",]

In [224]:
data.to_csv(path_or_buf = "data.csv", index = False)

In [82]:
# data = pd.melt(data, 
#                id_vars = ['Mouse', 'Measurement','Genotype', 'Treatment', 'Behavior', 'class'],
#                var_name = "Protein",
#                value_name = "Expression"
#               )