In [224]:
import torch
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [225]:
spdata = pd.read_pickle('SP.pkl')
mpdata = pd.read_pickle('MP.pkl')

In [226]:
# 指定标签

spdata['label'] = 0
mpdata['label'] = 1

In [227]:
# 提取共有特征

intersection_columns = list(set(spdata.columns).intersection(set(mpdata.columns)))

In [228]:
# 合并单双原发癌表

mixdata = pd.DataFrame()
mixdata = mixdata.append(spdata[intersection_columns].sample(n=8288, random_state=1))
mixdata = mixdata.append(mpdata[mpdata['Record number recode']==1][intersection_columns])

In [229]:
mixdata.columns

Index([&#39;Race recode (W, B, AI, API)&#39;, &#39;Age at diagnosis&#39;,
       &#39;Radiation sequence with surgery&#39;, &#39;Derived AJCC N, 7th ed (2010-2015)&#39;,
       &#39;Derived SEER Combined M (2016+)&#39;, &#39;Sequence number&#39;,
       &#39;SEER Combined Mets at DX-liver (2010+)&#39;,
       &#39;SEER Combined Mets at DX-bone (2010+)&#39;,
       &#39;CS site-specific factor 7 (2004+ varying by schema)&#39;,
       &#39;SEER Combined Mets at DX-lung (2010+)&#39;, &#39;Histologic Type ICD-O-3&#39;,
       &#39;Derived AJCC Stage Group, 7th ed (2010-2015)&#39;,
       &#39;Derived SEER Cmb Stg Grp (2016+)&#39;, &#39;Survival months&#39;, &#39;Grade&#39;, &#39;label&#39;,
       &#39;ER Status Recode Breast Cancer (1990+)&#39;,
       &#39;Derived SEER Combined T (2016+)&#39;,
       &#39;SEER Combined Mets at DX-brain (2010+)&#39;, &#39;Radiation recode&#39;,
       &#39;PR Status Recode Breast Cancer (1990+)&#39;, &#39;Breast Subtype (2010+)&#39;,
       &#39;Patient I

In [239]:
mixdata['Grade']

163549    Moderately differentiated; Grade II
86190        Poorly differentiated; Grade III
76855            Well differentiated; Grade I
255979       Poorly differentiated; Grade III
155768    Moderately differentiated; Grade II
                         ...                 
16566        Poorly differentiated; Grade III
16568            Well differentiated; Grade I
16570            Well differentiated; Grade I
16572     Moderately differentiated; Grade II
16574                                 Unknown
Name: Grade, Length: 16576, dtype: object

In [247]:
# 指定标签及特征

label = mixdata['label']
features = mixdata[['Age at diagnosis', 'Breast Subtype (2010+)', 'Derived AJCC Stage Group, 7th ed (2010-2015)', 'Grade', 'ER Status Recode Breast Cancer (1990+)']]

In [248]:
# One-Hot 编码分类数据

features = pd.get_dummies(features)

In [249]:
# 数值标准化

features = preprocessing.StandardScaler().fit_transform(features)

In [250]:
# 指定 x, y

x = torch.from_numpy(features).float()
y = torch.from_numpy(np.array(label))

In [251]:
x.shape

torch.Size([16576, 27])

In [256]:
# 构建网络

net = torch.nn.Sequential(
    torch.nn.Linear(27, 50),
    torch.nn.ReLU(),
    torch.nn.Linear(50, 2)
)

optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
loss_func = torch.nn.CrossEntropyLoss()

In [257]:
for t in range(100):
    prediction = net(x)
    loss = loss_func(prediction, y)

    print(t, loss)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 tensor(0.7008, grad_fn=&lt;NllLossBackward&gt;)
1 tensor(0.6890, grad_fn=&lt;NllLossBackward&gt;)
2 tensor(0.6812, grad_fn=&lt;NllLossBackward&gt;)
3 tensor(0.6748, grad_fn=&lt;NllLossBackward&gt;)
4 tensor(0.6694, grad_fn=&lt;NllLossBackward&gt;)
5 tensor(0.6648, grad_fn=&lt;NllLossBackward&gt;)
6 tensor(0.6608, grad_fn=&lt;NllLossBackward&gt;)
7 tensor(0.6574, grad_fn=&lt;NllLossBackward&gt;)
8 tensor(0.6544, grad_fn=&lt;NllLossBackward&gt;)
9 tensor(0.6518, grad_fn=&lt;NllLossBackward&gt;)
10 tensor(0.6495, grad_fn=&lt;NllLossBackward&gt;)
11 tensor(0.6476, grad_fn=&lt;NllLossBackward&gt;)
12 tensor(0.6459, grad_fn=&lt;NllLossBackward&gt;)
13 tensor(0.6445, grad_fn=&lt;NllLossBackward&gt;)
14 tensor(0.6433, grad_fn=&lt;NllLossBackward&gt;)
15 tensor(0.6422, grad_fn=&lt;NllLossBackward&gt;)
16 tensor(0.6412, grad_fn=&lt;NllLossBackward&gt;)
17 tensor(0.6404, grad_fn=&lt;NllLossBackward&gt;)
18 tensor(0.6397, grad_fn=&lt;NllLossBackward&gt;)
19 tensor(0.6391, grad_fn=&lt;NllLossBack