In [1]:
import pandas as pd
import numpy as np 

from sklearn.impute import SimpleImputer

from matplotlib import pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
import scipy

from sklearn.preprocessing import LabelEncoder

import os
import zipfile


# Credentials for the Kaggle CLI commands run in later cells.
# Both values are placeholders: replace them with your own before running.
os.environ.update({
    'KAGGLE_USERNAME': "YOUR KAGGLE USERNAME HERE",
    'KAGGLE_KEY': "YOUR KAGGLE KEY HERE",
})

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle

# Получение данных

Мы будем использовать данные об уровнях экспрессии 77 белков в коре головного мозга мышей (контрольная группа и мыши с синдромом Дауна), которых напугали (при этом часть мышей пугали просто так, а часть — во время обучения навыку).

In [3]:
# Search Kaggle datasets for the query string; the `ref` column of the
# listing is the identifier passed to `kaggle datasets download`.
!kaggle datasets list -s "mice-protein-expression"

ref                                          title                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
-------------------------------------------  ---------------------------------  -----  -------------------  -------------  ---------  ---------------  
ruslankl/mice-protein-expression             Mice Protein Expression            423KB  2018-05-06 15:09:39           2763         93  0.7058824        
muhammetvarl/mice-protein                    Mice Protein                       425KB  2020-12-16 12:17:50             15          3  1.0              
rarunk4495/mice-protein-expression-data-set  Mice Protein Expression Data Set   869KB  2019-07-31 11:53:17             22          0  0.47058824       


In [4]:
!kaggle datasets download -d ruslankl/mice-protein-expression 

for filename in os.listdir():
    if filename.endswith(".zip"):
        zip_ref = zipfile.ZipFile(filename)
        zip_ref.extractall()

mice-protein-expression.zip: Skipping, found more recently modified local copy (use --force to force download)


# Ознакомление с набором данных

Мы работаем с классом мыши — одной из 8 категорий в столбце "class", которая зависит от группы, поведения и вводимого вещества (для достижения страха):

c-CS-s: мыши из контрольной группы, стимулированные к обучению, инъекция физраствора (9 мышей)

c-CS-m: мыши из контрольной группы, стимулированные к обучению, инъекция мемантина (10 мышей)

c-SC-s: мыши из контрольной группы, не стимулированные к обучению, инъекция физраствора (9 мышей)

c-SC-m: мыши из контрольной группы, не стимулированные к обучению, инъекция мемантина (10 мышей)

t-CS-s: мыши с трисомией, стимулированные к обучению, инъекция физраствора (7 мышей)

t-CS-m: мыши с трисомией, стимулированные к обучению, инъекция мемантина (9 мышей)

t-SC-s: мыши с трисомией, не стимулированные к обучению, инъекция физраствора (9 мышей)

t-SC-m: мыши с трисомией, не стимулированные к обучению, инъекция мемантина (9 мышей)

In [5]:
# Load the dataset CSV (expected in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Data_Cortex_Nuclear.csv')

In [6]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,pERK_N,pJNK_N,PKCA_N,pMEK_N,pNR1_N,pNR2A_N,pNR2B_N,pPKCAB_N,pRSK_N,AKT_N,BRAF_N,CAMKII_N,CREB_N,ELK_N,ERK_N,GSK3B_N,JNK_N,MEK_N,TRKA_N,RSK_N,APP_N,Bcatenin_N,SOD1_N,MTOR_N,P38_N,pMTOR_N,DSCR1_N,AMPKA_N,NR2B_N,...,TIAM1_N,pP70S6_N,NUMB_N,P70S6_N,pGSK3B_N,pPKCG_N,CDK5_N,S6_N,ADARB1_N,AcetylH3K9_N,RRP1_N,BAX_N,ARC_N,ERBB4_N,nNOS_N,Tau_N,GFAP_N,GluR3_N,GluR4_N,IL1B_N,P3525_N,pCASP9_N,PSD95_N,SNCA_N,Ubiquitin_N,pGSK3B_Tyr216_N,SHH_N,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,1.750936,0.687906,0.306382,0.402698,0.296927,1.02206,0.605673,1.877684,2.308745,0.441599,0.859366,0.416289,0.369608,0.178944,1.866358,3.685247,1.537227,0.264526,0.319677,0.813866,0.165846,0.45391,3.037621,0.36951,0.458539,0.335336,0.825192,0.576916,0.448099,0.586271,...,0.482864,0.29417,0.18215,0.842725,0.192608,1.443091,0.2947,0.354605,1.33907,0.170119,0.159102,0.188852,0.106305,0.144989,0.176668,0.12519,0.115291,0.228043,0.142756,0.430957,0.247538,1.60331,2.014875,0.108234,1.044979,0.831557,0.188852,0.122652,,0.106305,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,Control,Memantine,C/S,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,1.596377,0.695006,0.299051,0.385987,0.281319,0.956676,0.587559,1.725774,2.043037,0.445222,0.834659,0.400364,0.356178,0.17368,1.761047,3.485287,1.509249,0.255727,0.304419,0.780504,0.157194,0.43094,2.921882,0.342279,0.42356,0.324835,0.761718,0.545097,0.420876,0.545097,...,0.454519,0.276431,0.182086,0.847615,0.194815,1.43946,0.29406,0.354548,1.306323,0.171427,0.158129,0.18457,0.106592,0.150471,0.178309,0.134275,0.118235,0.238073,0.142037,0.457156,0.257632,1.671738,2.004605,0.109749,1.009883,0.84927,0.200404,0.116682,,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,Control,Memantine,C/S,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,1.561316,0.677348,0.291276,0.381002,0.28171,1.003635,0.602449,1.731873,2.017984,0.467668,0.814329,0.399847,0.368089,0.173905,1.765544,3.571456,1.501244,0.259614,0.311747,0.785154,0.160895,0.423187,2.944136,0.343696,0.425005,0.324852,0.757031,0.54362,0.40463,0.552994,...,0.447197,0.256648,0.184388,0.856166,0.200737,1.524364,0.301881,0.386087,1.2796,0.185456,0.148696,0.190532,0.108303,0.14533,0.176213,0.13256,0.11776,0.244817,0.142445,0.510472,0.255343,1.66355,2.016831,0.108196,0.996848,0.846709,0.193685,0.118508,,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,Control,Memantine,C/S,c-CS-m
3,309_4,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,1.595086,0.583277,0.296729,0.377087,0.313832,0.87539,0.520293,1.566852,2.132754,0.477671,0.727705,0.385639,0.36297,0.179449,1.286277,2.970137,1.41971,0.259536,0.279218,0.734492,0.16221,0.410615,2.500204,0.344509,0.429211,0.330121,0.74698,0.546763,0.38686,0.547849,...,0.44265,0.398534,0.161768,0.760234,0.184169,1.612382,0.296382,0.29068,1.198765,0.159799,0.166112,0.185323,0.103184,0.140656,0.163804,0.12321,0.117439,0.234947,0.145068,0.430996,0.251103,1.484624,1.957233,0.119883,0.990225,0.833277,0.192112,0.132781,,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,Control,Memantine,C/S,c-CS-m
4,309_5,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,1.50423,0.55096,0.286961,0.363502,0.277964,0.864912,0.50799,1.480059,2.013697,0.483416,0.687794,0.367531,0.355311,0.174836,1.324695,2.896334,1.359876,0.250705,0.273667,0.702699,0.154827,0.39855,2.45656,0.329126,0.408755,0.313415,0.691956,0.53686,0.360816,0.512824,...,0.419095,0.393447,0.1602,0.768113,0.185718,1.645807,0.296829,0.309345,1.206995,0.16465,0.160687,0.188221,0.104784,0.141983,0.16771,0.136838,0.116048,0.255528,0.140871,0.481227,0.251773,1.534835,2.009109,0.119524,0.997775,0.878668,0.205604,0.129954,,0.104784,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973,Control,Memantine,C/S,c-CS-m


# Обработка данных

Для начала несколько обработаем данные.

1. Избавимся от информации о классах в выборке (['MouseID', 'Genotype', 'Treatment', 'Behavior', 'class'])

2. Уберем из рассмотрения колонки, где встречается большое число (10 и более) пропусков

3. Заполним пропуски в данных усредненными значениями по показателю

In [7]:
# Keep only the columns that have fewer than 10 missing values.
na_counts = df.isnull().sum()
kept_columns = na_counts.index[na_counts < 10]
df_mod = df[kept_columns]

In [8]:
# Feature matrix: everything except the mouse identifier and the label columns.
label_columns = ['MouseID', 'Genotype', 'Treatment', 'Behavior', 'class']
X = df_mod.drop(columns=label_columns)


In [9]:
# Fill the remaining gaps with the per-column mean.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0
# and raises AttributeError there.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Treat empty strings as missing values. Assigning the result (instead of
# `inplace=True`) avoids mutating the frame in place.
X = X.replace('', np.nan)
imputer.fit(X)

# `transform` returns a plain array, so rebuild the DataFrame with the
# original column names.
X = pd.DataFrame(columns=X.columns, data=imputer.transform(X))

# Частный случай предсказания

(чтобы оценить, чего вообще ожидать)

Для начала посмотрим, что мы можем сказать, например, о генетических особенностях рассматриваемых мышей. 

In [10]:
# Target for the first experiment: the genotype column, encoded as integers.
le = LabelEncoder()
y = le.fit_transform(df_mod['Genotype'])

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: SVM with the default kernel and gamma='scale'.
svm = SVC(gamma='scale').fit(X_train, y_train)
train_accuracy = svm.score(X_train, y_train)
test_accuracy = svm.score(X_test, y_test)
print("Train score ", train_accuracy)
print("Test score ", test_accuracy)

Train score  0.8020833333333334
Test score  0.8888888888888888


In [12]:
# Randomized hyperparameter search for the default-kernel SVM: C and gamma
# are sampled from exponential distributions.
# A fixed random_state makes the sampled candidates (and the scores printed
# below) reproducible across runs.
s = RandomizedSearchCV(
    SVC(),
    param_distributions={
        'C': scipy.stats.expon(scale=10),
        'gamma': scipy.stats.expon(scale=.1),
    },
    random_state=42,
)
s.fit(X_train, y_train)
print("Train score ", s.score(X_train, y_train))
print("Test score ", s.score(X_test, y_test))

Train score  1.0
Test score  1.0


In [13]:
# Same baseline as before, but with a linear kernel.
svm = SVC(kernel='linear').fit(X_train, y_train)
train_accuracy = svm.score(X_train, y_train)
test_accuracy = svm.score(X_test, y_test)
print("Train score ", train_accuracy)
print("Test score ", test_accuracy)

Train score  0.9560185185185185
Test score  0.9722222222222222


In [14]:
# Randomized hyperparameter search for the linear-kernel SVM.
# Only C is sampled: gamma is a coefficient of the rbf/poly/sigmoid kernels
# and has no effect on a linear kernel, so searching over it only added
# meaningless variation between candidates.
# A fixed random_state makes the sampled candidates reproducible.
s = RandomizedSearchCV(
    SVC(kernel='linear'),
    param_distributions={'C': scipy.stats.expon(scale=10)},
    random_state=42,
)
s.fit(X_train, y_train)
print("Train score ", s.score(X_train, y_train))
print("Test score ", s.score(X_test, y_test))

Train score  0.9803240740740741
Test score  0.9629629629629629


# Предсказание типа мыши
Теперь будем предсказывать уже комбинацию 3 бинарных признаков - изначальную классификацию 

In [15]:
# Target for the second experiment: the 'class' column, encoded as integers.
le = LabelEncoder()
y = le.fit_transform(df_mod['class'])

In [16]:
# Re-split with the new target; same 80/20 proportion and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## RBF

In [17]:
# Randomized hyperparameter search for the default-kernel SVM on the
# 'class' target: C and gamma are sampled from exponential distributions.
# A fixed random_state makes the sampled candidates, the printed scores and
# the resulting best_params_ reproducible across runs.
s = RandomizedSearchCV(
    SVC(),
    param_distributions={
        'C': scipy.stats.expon(scale=10),
        'gamma': scipy.stats.expon(scale=.1),
    },
    random_state=42,
)
s.fit(X_train, y_train)
print("Train score ", s.score(X_train, y_train))
print("Test score ", s.score(X_test, y_test))

Train score  1.0
Test score  1.0


In [18]:
# Display the hyperparameter combination chosen by the fitted search object `s`.
s.best_params_

{'C': 12.689366326944771, 'gamma': 0.12450352859931224}

## Linear

In [19]:
# Randomized hyperparameter search for the linear-kernel SVM on the
# 'class' target.
# Only C is sampled: gamma is a coefficient of the rbf/poly/sigmoid kernels
# and has no effect on a linear kernel, so it is dropped from the search space.
# A fixed random_state makes the sampled candidates reproducible.
s = RandomizedSearchCV(
    SVC(kernel='linear'),
    param_distributions={'C': scipy.stats.expon(scale=10)},
    random_state=42,
)
s.fit(X_train, y_train)
print("Train score ", s.score(X_train, y_train))
print("Test score ", s.score(X_test, y_test))

Train score  0.9988425925925926
Test score  0.9907407407407407


Таким образом, более высокий результат мы получаем в первом эксперименте (SVM с RBF-ядром): точность на тестовой выборке 1.0 против 0.99 у линейного ядра.