In [1]:
# Настраиваем импорты.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
%matplotlib inline

# Configuration: input file locations, hold-out fraction, and sampling seed.
trainPath = 'data/exam_module_4/1_variant_dna_sequence_mutation_prediction/input/train.csv'
testPath = 'data/exam_module_4/1_variant_dna_sequence_mutation_prediction/input/test.csv'
# Share of the training rows held out for validation; the training sample
# is drawn with frac = 1 - cvFraction.
cvFraction = 0.15
# Passed as `random_state` when sampling the training rows.
# NOTE(review): the name is presumably a typo for "randomSeed"; it is kept
# because other cells reference it by this name.
randomCeed = 777

In [2]:
# 1. Determine the task type.
# NOTE(review): the message names candidate models rather than a task type;
# the `mutation` target looks binary (0/1), i.e. binary classification — confirm.
print('Task type: logistic regression or SVM')

Task type: logistic regression or SVM


In [3]:
# 2. Build the data frames and hold out part of the training set for validation.
#
# Each CSV is read exactly once. The training rows are sampled a single time and
# both the feature frame and the target frame are derived from that one sample,
# so they are aligned by construction instead of relying on two independent
# `.sample()` calls happening to use the same seed.
rawTrainDf = pd.read_csv(trainPath)
testDf = pd.read_csv(testPath)

# Rows used for fitting; everything not sampled becomes the hold-out (CV) set.
trainSample = rawTrainDf.sample(frac=(1 - cvFraction), random_state=randomCeed)
cvSample = rawTrainDf.drop(trainSample.index)

# Features exclude the target column; target frames keep the ID next to the label.
trainDf = trainSample.drop('mutation', axis=1)
trainDfTarget = trainSample[['ID', 'mutation']]
cvDf = cvSample.drop('mutation', axis=1)
cvDfTarget = cvSample[['ID', 'mutation']]

# Sanity check: the two splits partition the original training data.
assert len(trainDf) + len(cvDf) == len(rawTrainDf)

print('Original train data: ' + str(rawTrainDf.shape))
print('Original test data: ' + str(testDf.shape))
print('Train data: ' + str(trainDf.shape))
print('Train target data: ' + str(trainDfTarget.shape))
print('CV data: ' + str(cvDf.shape))
print('CV target data: ' + str(cvDfTarget.shape))
print('Test data: ' + str(testDf.shape))

Original train data: (180000, 32)
Original test data: (120000, 31)
Train data: (153000, 31)
Train target data: (153000, 2)
CV data: (27000, 31)
CV target data: (27000, 2)
Test data: (120000, 31)


In [4]:
# 3. Определяем тип переменных в датасете.
print('Train data types: \n' + str(trainDf.dtypes))
print('Train target data types: \n' + str(trainDfTarget.dtypes))

Train data types: 
ID             int64
sequence1     object
sequence2     object
sequence3     object
sequence4     object
sequence5     object
sequence6     object
sequence7     object
sequence8     object
sequence9     object
A              int64
B              int64
C              int64
D              int64
E              int64
F              int64
G              int64
H              int64
I              int64
J              int64
K            float64
L            float64
M            float64
N            float64
O            float64
P            float64
Q            float64
R            float64
S            float64
T            float64
U            float64
dtype: object
Train target data types: 
ID          int64
mutation    int64
dtype: object


In [5]:
# 4. Preprocess the data if needed. Should dimensionality reduction be applied?
# Should anomalies be removed?
# NOTE(review): "SMV" in the message below is presumably a typo for "SVM"; the
# string is left untouched so the recorded cell output still matches.
print('Так как n << m, лучше использовать логистическую регрессию, либо SMV without kernel.')

# Count the distinct values of every object-typed column. For columns without
# missing values each distinct value becomes one one-hot column, so the total is
# the number of dummy columns to expect.
dummieCounter = 0
for col in trainDf.columns:
    if trainDf[col].dtypes == object:
        uniqueCount = len(trainDf[col].unique())  # computed once, used twice
        dummieCounter += uniqueCount
        print('Unique in ' + str(col) + ': ' + str(uniqueCount))
print('Dummie columns: ' + str(dummieCounter))

# One-hot encode the object columns (dummy columns are appended at the end of
# the frame). The training frame defines the feature schema; CV and test are
# re-indexed to exactly those columns so all three frames always line up:
#   - a category absent from CV/test gets an all-zero column,
#   - a category never seen in training is dropped (a model fitted on the
#     training columns could not use it anyway).
trainDf = pd.get_dummies(trainDf)
cvDf = pd.get_dummies(cvDf).reindex(columns=trainDf.columns, fill_value=0)
testDf = pd.get_dummies(testDf).reindex(columns=trainDf.columns, fill_value=0)

print("TrainDF: ")
print(trainDf.head())
print('CvDF: ')
print(cvDf.head())
print("TestDF: ")
print(testDf.head())

Так как n << m, лучше использовать логистическую регрессию, либо SMV without kernel.
Unique in sequence1: 2
Unique in sequence2: 2
Unique in sequence3: 2
Unique in sequence4: 2
Unique in sequence5: 2
Unique in sequence6: 4
Unique in sequence7: 4
Unique in sequence8: 4
Unique in sequence9: 4
Dummie columns: 26
TrainDF: 
            ID  A   B  C  D   E  F   G   H  I  ...  sequence7_GTCATGCACCCT  \
105497  176114  5   3  0  7  33  0  22  45  0  ...                       0   
151648  252874  8   2  0  3   2  0  46  28  0  ...                       0   
173456  289103  8   0  1  4  28  0   8  51  0  ...                       0   
2006      3295  7  15  0  5  33  2   8  38  0  ...                       0   
653       1053  8   0  0  6  33  6  50  52  0  ...                       0   

        sequence7_TAACATACAGAG  sequence8_AAATCAGCTAAA  \
105497                       0                       0   
151648                       0                       0   
173456                       0      

In [11]:
# 5. Run EDA: draw conclusions and inspect feature distributions, correlations,
# and outliers.
# This cell prints the column/dtype summary of the training features and
# previews the first rows.
print("trainDf:")
trainDf.info()
trainDf.head()

trainDf:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 153000 entries, 105497 to 170866
Data columns (total 48 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      153000 non-null  int64  
 1   A                       153000 non-null  int64  
 2   B                       153000 non-null  int64  
 3   C                       153000 non-null  int64  
 4   D                       153000 non-null  int64  
 5   E                       153000 non-null  int64  
 6   F                       153000 non-null  int64  
 7   G                       153000 non-null  int64  
 8   H                       153000 non-null  int64  
 9   I                       153000 non-null  int64  
 10  J                       153000 non-null  int64  
 11  K                       153000 non-null  float64
 12  L                       153000 non-null  float64
 13  M                       153000 non-null  float64
 14  N     

Unnamed: 0,ID,A,B,C,D,E,F,G,H,I,...,sequence7_GTCATGCACCCT,sequence7_TAACATACAGAG,sequence8_AAATCAGCTAAA,sequence8_ATTCCATATTTT,sequence8_GTCATGCACCCT,sequence8_TAACATACAGAG,sequence9_AAATCAGCTAAA,sequence9_ATTCCATATTTT,sequence9_GTCATGCACCCT,sequence9_TAACATACAGAG
105497,176114,5,3,0,7,33,0,22,45,0,...,0,0,0,1,0,0,0,0,0,1
151648,252874,8,2,0,3,2,0,46,28,0,...,0,0,0,1,0,0,1,0,0,0
173456,289103,8,0,1,4,28,0,8,51,0,...,0,0,0,1,0,0,1,0,0,0
2006,3295,7,15,0,5,33,2,8,38,0,...,0,0,1,0,0,0,1,0,0,0
653,1053,8,0,0,6,33,6,50,52,0,...,0,0,0,0,0,1,1,0,0,0


In [12]:
# Column/dtype summary and first rows of the training target frame.
print("trainDfTarget:")
trainDfTarget.info()
trainDfTarget.head()

trainDfTarget:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 153000 entries, 105497 to 170866
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   ID        153000 non-null  int64
 1   mutation  153000 non-null  int64
dtypes: int64(2)
memory usage: 3.5 MB


Unnamed: 0,ID,mutation
105497,176114,0
151648,252874,0
173456,289103,0
2006,3295,1
653,1053,0


In [13]:
# Column/dtype summary and first rows of the hold-out (CV) feature frame.
print("cvDf:")
cvDf.info()
cvDf.head()

cvDf:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27000 entries, 0 to 179996
Data columns (total 48 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      27000 non-null  int64  
 1   A                       27000 non-null  int64  
 2   B                       27000 non-null  int64  
 3   C                       27000 non-null  int64  
 4   D                       27000 non-null  int64  
 5   E                       27000 non-null  int64  
 6   F                       27000 non-null  int64  
 7   G                       27000 non-null  int64  
 8   H                       27000 non-null  int64  
 9   I                       27000 non-null  int64  
 10  J                       27000 non-null  int64  
 11  K                       27000 non-null  float64
 12  L                       27000 non-null  float64
 13  M                       27000 non-null  float64
 14  N                       27000 n

Unnamed: 0,ID,A,B,C,D,E,F,G,H,I,...,sequence7_GTCATGCACCCT,sequence7_TAACATACAGAG,sequence8_AAATCAGCTAAA,sequence8_ATTCCATATTTT,sequence8_GTCATGCACCCT,sequence8_TAACATACAGAG,sequence9_AAATCAGCTAAA,sequence9_ATTCCATATTTT,sequence9_GTCATGCACCCT,sequence9_TAACATACAGAG
0,0,8,0,1,1,33,0,44,54,0,...,0,0,0,1,0,0,1,0,0,0
27,48,5,3,0,10,33,2,19,19,0,...,0,0,0,1,0,0,1,0,0,0
34,57,8,0,1,5,45,0,14,55,0,...,0,0,0,1,0,0,1,0,0,0
36,59,8,2,0,5,33,2,33,45,0,...,0,0,0,1,0,0,1,0,0,0
54,96,5,0,2,6,33,0,30,54,0,...,0,0,0,1,0,0,1,0,0,0


In [14]:
# Column/dtype summary and first rows of the hold-out (CV) target frame.
print("cvDfTarget:")
cvDfTarget.info()
cvDfTarget.head()

cvDfTarget:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27000 entries, 0 to 179996
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   ID        27000 non-null  int64
 1   mutation  27000 non-null  int64
dtypes: int64(2)
memory usage: 632.8 KB


Unnamed: 0,ID,mutation
0,0,0
27,48,1
34,57,0
36,59,0
54,96,0


In [15]:
# Column/dtype summary and first rows of the test feature frame.
print("testDf:")
testDf.info()
testDf.head()

testDf:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 48 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      120000 non-null  int64  
 1   A                       120000 non-null  int64  
 2   B                       120000 non-null  int64  
 3   C                       120000 non-null  int64  
 4   D                       120000 non-null  int64  
 5   E                       120000 non-null  int64  
 6   F                       120000 non-null  int64  
 7   G                       120000 non-null  int64  
 8   H                       120000 non-null  int64  
 9   I                       120000 non-null  int64  
 10  J                       120000 non-null  int64  
 11  K                       120000 non-null  float64
 12  L                       120000 non-null  float64
 13  M                       120000 non-null  float64
 14  N           

Unnamed: 0,ID,A,B,C,D,E,F,G,H,I,...,sequence7_GTCATGCACCCT,sequence7_TAACATACAGAG,sequence8_AAATCAGCTAAA,sequence8_ATTCCATATTTT,sequence8_GTCATGCACCCT,sequence8_TAACATACAGAG,sequence9_AAATCAGCTAAA,sequence9_ATTCCATATTTT,sequence9_GTCATGCACCCT,sequence9_TAACATACAGAG
0,1,8,0,0,4,33,8,48,3,5,...,0,0,0,1,0,0,1,0,0,0
1,3,10,0,2,4,33,0,50,3,5,...,0,0,0,1,0,0,1,0,0,0
2,4,8,6,1,4,33,2,32,54,0,...,0,0,0,1,0,0,1,0,0,0
3,6,0,6,0,7,33,0,44,3,4,...,0,0,0,1,0,0,1,0,0,0
4,8,8,0,0,4,33,2,11,4,4,...,0,0,0,1,0,0,1,0,0,0


In [None]:
# 6. Подумать над вариантом модели, для того чтобы решить задачу (либо ансамблем моделей)

In [None]:
# 7. Подумать нужно ли применять Unsupervised learning подход для решения задачи?
# Необходима ли дополнительная информация?

In [None]:
# 8. Обучить модель и вывести валидационный скор по метрике качества.

In [None]:
# 9. Построить отчет на 10 предложений.

In [None]:
# 10. Выйти и объяснить подход к решению задачи.