### 1 使用特征文本信息处理

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

#### 1.1 数据加载

In [2]:
train_data = pd.read_csv("./cancer_detection_homework/train/feats.csv")
train_data

Unnamed: 0,id,age,HER2,P53,molecular_subtype
0,8a1af6f74cafc720511888998f2f361767c77965a07464...,48,3,True,3
1,b18adad8c39bb458f208581fe40e9be7b04f2b49be04b0...,54,3,False,2
2,8b9af020d0e621168fd0f3913f2cd74e8bb5ea85f61ee0...,55,2,True,3
3,1a0e54f07794fbb95a0fd7f0b4ef6f79ef1a9c83dbaa05...,66,0,False,4
4,2805706a656b013743d84357d5ffa44536e19d18f0509f...,31,0,False,1
...,...,...,...,...,...
195,eb11bf800ee8630c03ed4011fd3378ee4698e122f8e994...,47,1,False,2
196,d473360c0e902f50d70cfb9c1790960018b6f25a2a522f...,47,3,False,2
197,2bf2ccaf14d32628af62a20b9bd8c89a098c1756915968...,57,1,True,2
198,1b77b71ab08bf9312265083930af9ddaf354447dffdde0...,43,3,True,3


In [3]:
test_data = pd.read_csv("./cancer_detection_homework/test/feats.csv")
test_data

Unnamed: 0,id,age,HER2,P53
0,d6b47f0c2ccbbd7923e37dc434ab25445df6bc060d5338...,57,0,False
1,2e9d400ee8450c30d19c43a20e9da770774d3169da4574...,36,0,False
2,72f92da6ca6d521221baee367b5c76fcbfe61edc00c0bc...,81,0,False
3,ca7e160144dd6f459e2b18c48a33d4a53d09de5d7c2557...,82,0,False
4,d65889918d2b9073e7601c2ba67c3a9d26f60612f14e93...,59,2,False
...,...,...,...,...
82,0de3f1bf4ccc440271f091d79e9d3be7348dd5a39d7657...,66,3,False
83,46aaa33f2f3c07741ca4aeea4c9f3e54e23d86ee8e4fd6...,50,0,False
84,2eaae4e8b29e28877112cfdc0257b93218d97cdbfb6670...,53,3,False
85,7a1725339e56e7a9cff48bb5af4ea84f731996a88a4422...,46,0,False


#### 1.2 对P53列进行标签编码

In [4]:
label_encoder = LabelEncoder()

In [5]:
train_data.P53 = label_encoder.fit_transform(train_data.P53)

In [6]:
train_data

Unnamed: 0,id,age,HER2,P53,molecular_subtype
0,8a1af6f74cafc720511888998f2f361767c77965a07464...,48,3,1,3
1,b18adad8c39bb458f208581fe40e9be7b04f2b49be04b0...,54,3,0,2
2,8b9af020d0e621168fd0f3913f2cd74e8bb5ea85f61ee0...,55,2,1,3
3,1a0e54f07794fbb95a0fd7f0b4ef6f79ef1a9c83dbaa05...,66,0,0,4
4,2805706a656b013743d84357d5ffa44536e19d18f0509f...,31,0,0,1
...,...,...,...,...,...
195,eb11bf800ee8630c03ed4011fd3378ee4698e122f8e994...,47,1,0,2
196,d473360c0e902f50d70cfb9c1790960018b6f25a2a522f...,47,3,0,2
197,2bf2ccaf14d32628af62a20b9bd8c89a098c1756915968...,57,1,1,2
198,1b77b71ab08bf9312265083930af9ddaf354447dffdde0...,43,3,1,3


In [7]:
test_data.P53 = label_encoder.transform(test_data.P53)

In [8]:
test_data

Unnamed: 0,id,age,HER2,P53
0,d6b47f0c2ccbbd7923e37dc434ab25445df6bc060d5338...,57,0,0
1,2e9d400ee8450c30d19c43a20e9da770774d3169da4574...,36,0,0
2,72f92da6ca6d521221baee367b5c76fcbfe61edc00c0bc...,81,0,0
3,ca7e160144dd6f459e2b18c48a33d4a53d09de5d7c2557...,82,0,0
4,d65889918d2b9073e7601c2ba67c3a9d26f60612f14e93...,59,2,0
...,...,...,...,...
82,0de3f1bf4ccc440271f091d79e9d3be7348dd5a39d7657...,66,3,0
83,46aaa33f2f3c07741ca4aeea4c9f3e54e23d86ee8e4fd6...,50,0,0
84,2eaae4e8b29e28877112cfdc0257b93218d97cdbfb6670...,53,3,0
85,7a1725339e56e7a9cff48bb5af4ea84f731996a88a4422...,46,0,0


#### 1.3构造X_train, y_train, X_test

In [9]:
X_train = train_data.iloc[:, 1:-1]
X_train

Unnamed: 0,age,HER2,P53
0,48,3,1
1,54,3,0
2,55,2,1
3,66,0,0
4,31,0,0
...,...,...,...
195,47,1,0
196,47,3,0
197,57,1,1
198,43,3,1


In [10]:
y_train = train_data.iloc[:, -1]
y_train

0      3
1      2
2      3
3      4
4      1
      ..
195    2
196    2
197    2
198    3
199    3
Name: molecular_subtype, Length: 200, dtype: int64

In [11]:
X_test = test_data.iloc[:, 1:]
X_test

Unnamed: 0,age,HER2,P53
0,57,0,0
1,36,0,0
2,81,0,0
3,82,0,0
4,59,2,0
...,...,...,...
82,66,3,0
83,50,0,0
84,53,3,0
85,46,0,0


#### 1.4 对数据进行归一化

In [12]:
X_train_and_test = pd.concat([X_train, X_test])
X_train_and_test

Unnamed: 0,age,HER2,P53
0,48,3,1
1,54,3,0
2,55,2,1
3,66,0,0
4,31,0,0
...,...,...,...
82,66,3,0
83,50,0,0
84,53,3,0
85,46,0,0


In [13]:
mm = MinMaxScaler()
mm_X_train_and_test = mm.fit_transform(X_train_and_test)
mm_X_train_and_test

array([[0.453125  , 1.        , 1.        ],
       [0.546875  , 1.        , 0.        ],
       [0.5625    , 0.66666667, 1.        ],
       [0.734375  , 0.        , 0.        ],
       [0.1875    , 0.        , 0.        ],
       [0.609375  , 0.33333333, 0.        ],
       [0.453125  , 0.        , 0.        ],
       [0.59375   , 0.66666667, 0.        ],
       [0.84375   , 0.66666667, 0.        ],
       [0.578125  , 0.33333333, 0.        ],
       [0.671875  , 0.        , 0.        ],
       [0.65625   , 0.        , 0.        ],
       [0.546875  , 0.33333333, 0.        ],
       [1.        , 0.66666667, 0.        ],
       [0.515625  , 1.        , 0.        ],
       [0.578125  , 0.        , 0.        ],
       [0.484375  , 0.        , 0.        ],
       [0.453125  , 1.        , 1.        ],
       [0.703125  , 0.66666667, 0.        ],
       [0.65625   , 0.        , 0.        ],
       [0.4375    , 0.        , 1.        ],
       [0.484375  , 1.        , 0.        ],
       [0.

In [14]:
mm_X_train = mm_X_train_and_test[:len(X_train)]
mm_X_train.shape

(200, 3)

In [15]:
mm_X_test = mm_X_train_and_test[len(X_train):]
mm_X_test.shape

(87, 3)

#### 1.5 用各种常见的机器学习方法进行训练对比

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
import time
import warnings

In [17]:
warnings.filterwarnings("ignore")

In [18]:
def classifiers(X_train, y_train):
    scores = []
    models = [
        LogisticRegression(),
        KNeighborsClassifier(),
        SVC(),
        MLPClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier()]
    models_res = []
    
    for model in models:
        t1 = time.time()
        score = cross_val_score(model, X=X_train, y=y_train, cv=10)
        mean_score = np.mean(score)
        scores.append(mean_score)
        models_res.append(model)
        print(model,"finished!!!")
        print(mean_score)
        print("use time: ", time.time()-t1)
        print("*"*50)
    
    return scores, models_res

In [19]:
%%time
scores, models = classifiers(mm_X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) finished!!!
0.7150000000000001
use time:  2.402352809906006
**************************************************
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform') finished!!!
0.665
use time:  0.03633451461791992
**************************************************
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False) finished!!!
0.684999999999

#### 1.6 采用逻辑回归的方法进行预测

In [20]:
model = LogisticRegression()

In [21]:
model.fit(mm_X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
y_train

0      3
1      2
2      3
3      4
4      1
      ..
195    2
196    2
197    2
198    3
199    3
Name: molecular_subtype, Length: 200, dtype: int64

In [23]:
y_predict = model.predict(mm_X_test)
y_predict

array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2,
       2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2])

#### 1.7 保存输出结果

In [24]:
test_data["predict"] = y_predict
test_data

Unnamed: 0,id,age,HER2,P53,predict
0,d6b47f0c2ccbbd7923e37dc434ab25445df6bc060d5338...,57,0,0,1
1,2e9d400ee8450c30d19c43a20e9da770774d3169da4574...,36,0,0,1
2,72f92da6ca6d521221baee367b5c76fcbfe61edc00c0bc...,81,0,0,1
3,ca7e160144dd6f459e2b18c48a33d4a53d09de5d7c2557...,82,0,0,1
4,d65889918d2b9073e7601c2ba67c3a9d26f60612f14e93...,59,2,0,2
...,...,...,...,...,...
82,0de3f1bf4ccc440271f091d79e9d3be7348dd5a39d7657...,66,3,0,2
83,46aaa33f2f3c07741ca4aeea4c9f3e54e23d86ee8e4fd6...,50,0,0,1
84,2eaae4e8b29e28877112cfdc0257b93218d97cdbfb6670...,53,3,0,2
85,7a1725339e56e7a9cff48bb5af4ea84f731996a88a4422...,46,0,0,1


In [25]:
result = test_data[["id", "predict"]]
result

Unnamed: 0,id,predict
0,d6b47f0c2ccbbd7923e37dc434ab25445df6bc060d5338...,1
1,2e9d400ee8450c30d19c43a20e9da770774d3169da4574...,1
2,72f92da6ca6d521221baee367b5c76fcbfe61edc00c0bc...,1
3,ca7e160144dd6f459e2b18c48a33d4a53d09de5d7c2557...,1
4,d65889918d2b9073e7601c2ba67c3a9d26f60612f14e93...,2
...,...,...
82,0de3f1bf4ccc440271f091d79e9d3be7348dd5a39d7657...,2
83,46aaa33f2f3c07741ca4aeea4c9f3e54e23d86ee8e4fd6...,1
84,2eaae4e8b29e28877112cfdc0257b93218d97cdbfb6670...,2
85,7a1725339e56e7a9cff48bb5af4ea84f731996a88a4422...,1


In [26]:
result.to_csv("./submission.csv", index=False, header=False)

### 2 使用图像特征去处理

In [27]:
import os
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt

#### 2.1 构造id2label

通过病人id就可以知道其label是多少，供下面构造样本数据使用

In [28]:
id2label = {}
for i in range(len(train_data)):
    patient_id = train_data.iloc[i]["id"]
    patient_label = train_data.iloc[i]["molecular_subtype"]
    id2label[patient_id] = patient_label
id2label

{'8a1af6f74cafc720511888998f2f361767c77965a07464ce015504b363e98fac': 3,
 'b18adad8c39bb458f208581fe40e9be7b04f2b49be04b0d6026eadcf7bdd56ab': 2,
 '8b9af020d0e621168fd0f3913f2cd74e8bb5ea85f61ee01f551509d3afa193e0': 3,
 '1a0e54f07794fbb95a0fd7f0b4ef6f79ef1a9c83dbaa05cd940d4fe7cf03838d': 4,
 '2805706a656b013743d84357d5ffa44536e19d18f0509ffe0773dccabe9ef7b3': 1,
 'c97db71c13153820e3215a7f83fe220de808fa5138fe9174b9f75cafaf52e510': 2,
 '40eb5fd9e50f063b259bf2ea356c1f116fdda296e2f0162c8b62ae3d1eb7e094': 1,
 '481b26162cd94b8e0117b03cb8296c023f14355aa2a9045ecd944b070df51582': 2,
 '4d42fa68897b87dae8fca34cc8c523da7b40df409917298983dc650db16665c2': 2,
 '7d83911c950cddd5fbadfaf20ab2c230b4f5ecb059024e34b2a006cbce5a803e': 3,
 '311f5bcceb3767e5ce9df0f60ac08bc4fc6cad7125720b90a56cd6dee21a8d0f': 1,
 'a61271bf00789905061bd22825a89415a1306e196bcd8752ccc2b9a7264791ca': 1,
 '42489d3fe736c2ceb9d33d43693ee8ef756c52a875c08b0190c7138aa84a043f': 2,
 '25cdd0028d691dfb5e83d856cd65a515a697847c68452e9b421bad12d7e4ab

#### 2.2 构造X_trian, X_test, y_train

In [29]:
train_base_path = "./cancer_detection_homework/train/images/"

In [30]:
all_patient_id = os.listdir(train_base_path)
all_patient_id

['0fd6381eb57fdc43ad600f3a0fcd9e7e1946da62e3d730554dc2d610c09a2518',
 '791b150a1ddc9867afc5d1bdf82a89b42703ec002bf863a21c6fc14bebf7e3de',
 '79466b7692b7c9f2281490b8a37a955e67d108fd5a6b8867758dabf00ca24cb2',
 '044be4072454995a83d90c2856af619a367c05208caf6cea793e444232b6fb25',
 'abe1699ae59c14c5bb5b96382775a2b56bf553ba30caf5129de22d2b76c3607b',
 '2a83e7c8345b3e893bc4bcc3e761ae731e1eec62f5dec43accab1207af6ca0f7',
 '3f6ea1c2520b77bfcb9feeeae296c877dd99c4cddd3a1ec47a6cfed1f00faecb',
 '22d15d6d0500694a75cad61caa52e2cf6583eeac4776eee06b06be01dc464a97',
 '4b46c239a8a99f40942cd465afd1642254d3889c764643092ab09c057a706d46',
 '6b8b8e36d30317f75496e232216d944b37de2b3acbb3735b7b2bdc25bafbcc2b',
 '2f6c93cbd7ed05052631e53caf4312dd93b53e5cc82aa3c4658d8501a183d906',
 'd3433b4fb9260044ed0ef85f148a5e07e787a11d63b67b55062deb22c218858d',
 'cb3d4b0e57f47cf5f254c05ca07b783042332ae69e4a4afd7e64590771c58378',
 'ec4ba427153fa2832dd93c51cc49db02e19a47ff9107a60e07c2021fbc1b0690',
 '657f2e1bbb6711291e991312f65f0e3e

In [31]:
%%time
X_train = []
y_train = []
min_height, min_width = float("inf"), float("inf")
for patient_id in all_patient_id:
    filenames = os.listdir("{}/{}".format(train_base_path, patient_id))
    for filename in filenames:
        file_path = "{}/{}/{}".format(train_base_path, patient_id, filename)
        image = cv2.imread(file_path,cv2.IMREAD_GRAYSCALE)
        height, width = image.shape
        if height < min_height:
            min_height = height
        if width < min_width:
            min_width = width
        start_height = int((height-400)/2)
        start_width = int((width-600)/2)
        img_gray_sub = image[start_height:start_height+400, start_width:start_width+600]
        X_train.append(img_gray_sub/255)
        y_train.append(id2label[patient_id])

CPU times: user 3.82 s, sys: 2.05 s, total: 5.86 s
Wall time: 9.69 s


In [32]:
test_base_path = "./cancer_detection_homework/test/images/"

In [33]:
all_patient_id = os.listdir(test_base_path)
all_patient_id[:2]

['a34348ec5753aff281a01bf8ea14c4be6e35abf07e8e59d5211876207dbba820',
 '2e9d400ee8450c30d19c43a20e9da770774d3169da457490fd99d9dfb837419c']

In [34]:
len(all_patient_id)

87

In [35]:
%%time
id2X_test = {}
for patient_id in all_patient_id:
    filenames = os.listdir("{}/{}".format(test_base_path, patient_id))
    X_test = []
    for filename in filenames:
        file_path = "{}/{}/{}".format(test_base_path, patient_id, filename)
        image = cv2.imread(file_path,cv2.IMREAD_GRAYSCALE)
        height, width = image.shape
        start_height = int((height-400)/2)
        start_width = int((width-600)/2)
        img_gray_sub = image[start_height:start_height+400, start_width:start_width+600]
        X_test.append(img_gray_sub/255)
    id2X_test[patient_id] = X_test

CPU times: user 1.75 s, sys: 908 ms, total: 2.66 s
Wall time: 4.18 s


In [36]:
len(id2X_test["0172a059fb35811fca55608ba60652429d2200a84eeddf948a26002528e39b05"])

3

In [37]:
id2X_test["0172a059fb35811fca55608ba60652429d2200a84eeddf948a26002528e39b05"][0].shape

(400, 600)

#### 2.3 用逻辑回归模型训练图像

In [38]:
X_train_LR = np.array(X_train)
X_train_LR.shape

(925, 400, 600)

In [39]:
X_train_LR = X_train_LR.reshape(925, -1)
X_train_LR.shape

(925, 240000)

In [40]:
y_train_LR = np.array(y_train)
y_train_LR.shape

(925,)

In [41]:
model_LR = LogisticRegression()

In [42]:
%%time
model_LR.fit(X_train_LR, y_train_LR)

CPU times: user 9min 14s, sys: 1min 9s, total: 10min 24s
Wall time: 22.2 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
model_LR.score(X_train_LR, y_train_LR)

1.0

#### 2.4 用逻辑回归模型预测图像

In [44]:
test_patient_id = test_data.id.to_list()
len(test_patient_id)

87

In [45]:
y_predict_LR = []
for patient_id in test_patient_id:
    X_tests = id2X_test[patient_id]
    y_predict_one_patient = []
    for X_test in X_tests:
        X_test = X_test.reshape(1, -1)
        y_predict = model_LR.predict(X_test)
        y_predict_one_patient.append(y_predict[0])
    counts = np.bincount(y_predict_one_patient)
    y_predict_LR.append(np.argmax(counts))

In [46]:
result = test_data[["id"]]
result["predict"] = y_predict_LR

In [47]:
result.to_csv("./submission_lr.csv", index=False, header=False)

#### 2.5 使用卷积神经网络训练模型

In [48]:
X_train_CNN = X_train_LR.reshape(-1, 400, 600, 1)
X_train_CNN.shape

(925, 400, 600, 1)

In [49]:
y_train_CNN = np.array(y_train)
y_train_CNN = y_train_CNN.reshape(-1, 1)
y_train_CNN.shape

(925, 1)

In [57]:
model = tf.keras.Sequential()  
model.add(tf.keras.layers.Conv2D(filters=6, kernel_size=(5, 5), strides=(1, 1),
                                 activation='relu', input_shape=(400, 600, 1)))
model.add(tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=2))
model.add(tf.keras.layers.Conv2D(filters=16, kernel_size=(
    5, 5), strides=(1, 1), activation='relu'))
model.add(tf.keras.layers.AveragePooling2D(pool_size=(2, 2), strides=2))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=120, activation='relu'))
model.add(tf.keras.layers.Dense(units=80, activation='relu'))
model.add(tf.keras.layers.Dense(units=4, activation='softmax'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 396, 596, 6)       156       
_________________________________________________________________
average_pooling2d_4 (Average (None, 198, 298, 6)       0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 194, 294, 16)      2416      
_________________________________________________________________
average_pooling2d_5 (Average (None, 97, 147, 16)       0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 228144)            0         
_________________________________________________________________
dense_6 (Dense)              (None, 120)               27377400  
_________________________________________________________________
dense_7 (Dense)              (None, 80)               

In [58]:
model.compile(optimizer='adam', loss=tf.nn.softmax_cross_entropy_with_logits, metrics=['accuracy'])

In [60]:
model.fit(X_train_CNN, y_train_CNN, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fead04f9ad0>

In [61]:
test_patient_id = test_data.id.to_list()
len(test_patient_id)

87

In [67]:
y_predict_CNN = []
for patient_id in test_patient_id:
    X_tests = id2X_test[patient_id]
    y_predict_one_patient = []
    for X_test in X_tests:
        X_test = X_test.reshape(1, 400, 600, 1)
        y_predict = model.predict(X_test)
        y_predict_one_patient.append(np.argmax(y_predict))
    counts = np.bincount(y_predict_one_patient)
    y_predict_CNN.append(np.argmax(counts))

In [68]:
y_predict_CNN

[0,
 1,
 0,
 0,
 2,
 2,
 0,
 2,
 1,
 2,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 2,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 2,
 1,
 2,
 1,
 2,
 2,
 0,
 2,
 0,
 3,
 0,
 1,
 2,
 1,
 3,
 2,
 2,
 2,
 0,
 2,
 0,
 0,
 1,
 3,
 3,
 2,
 2,
 0,
 1,
 0,
 2,
 0,
 0,
 0]

In [69]:
result = test_data[["id"]]
result["predict"] = y_predict_CNN

In [70]:
result.to_csv("./submission_cnn.csv", index=False, header=False)