In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import tqdm
 
# Prepare the dataset
class TitanicDataset(Dataset):
    """Map-style dataset built from a DataFrame whose last column is the label.

    Every column except the last is treated as a feature and passed through
    ``pd.get_dummies`` (one-hot encoding); the last column is the target.
    """

    def __init__(self, data):
        """Build the feature and label tensors.

        Args:
            data: ``pandas.DataFrame`` with the feature columns first and the
                label in the last column.
        """
        xy = data
        # Number of samples (rows of the frame).
        self.len = xy.shape[0]
        # One-hot encode the feature columns, then convert to a tensor.
        # The explicit float32 cast matters: pandas >= 2.0 emits bool dummy
        # columns, and a frame mixing bool and numeric columns converts to an
        # object array, which torch.from_numpy cannot handle.
        features = pd.get_dummies(xy.iloc[:, :-1])
        self.x_data = torch.from_numpy(
            features.to_numpy(dtype=np.float32, copy=True)
        )
        self.y_data = torch.from_numpy(np.array(xy.iloc[:, -1]))

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len
 
 
# Load the full training table and build the train / validation datasets.
data = pd.read_csv('train_full.csv')
from sklearn.model_selection import train_test_split
# One-hot encode the feature columns once, BEFORE splitting. Encoding each
# split separately can yield different column sets (and therefore different
# feature widths) when a category value is absent from one of the splits.
# The label stays in the last column, as TitanicDataset expects.
encoded = pd.concat(
    [pd.get_dummies(data.iloc[:, :-1]), data.iloc[:, -1]], axis=1
)
train, valid = train_test_split(encoded, test_size=0.2)
train_dataset = TitanicDataset(train)
valid_dataset = TitanicDataset(valid)
# Mini-batch loading. num_workers=0 loads batches in the main process;
# a positive value would use that many worker processes instead.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=0)
# Validation metrics are summed over the whole set, so shuffling is not needed.
valid_loader = DataLoader(dataset=valid_dataset, batch_size=32, shuffle=False, num_workers=0)
 
# Define the model
class Model(torch.nn.Module):
    """Two-layer fully connected binary classifier with sigmoid activations.

    Args:
        input_size: Number of input features, i.e. the width of the one-hot
            encoded feature matrix. Defaults to 11.
        hidden_size: Width of the hidden layer. Defaults to 13.
    """

    def __init__(self, input_size=11, hidden_size=13):
        super(Model, self).__init__()
        # The input width must equal the number of feature columns that
        # remain after one-hot encoding.
        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, 1)

        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Feed-forward pass.

        Args:
            x: Float tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor of sigmoid outputs with a last dimension of size 1.
        """
        x = self.sigmoid(self.linear1(x))
        x = self.sigmoid(self.linear2(x))

        return x

    def test(self, x):
        """Predict hard 0/1 labels for ``x`` without tracking gradients.

        Args:
            x: Float tensor whose last dimension equals ``input_size``.

        Returns:
            A list of Python ints: 1 where the sigmoid output is strictly
            greater than 0.5, otherwise 0.
        """
        with torch.no_grad():
            probs = self.forward(x)
        # Threshold the whole batch at once instead of element by element.
        return (probs > 0.5).long().reshape(-1).tolist()
 
# Instantiate the network with its default layer sizes.
model = Model()

# Loss function: binary cross-entropy; BCELoss averages over the batch by
# default (reduction='mean').
criterion = torch.nn.BCELoss()
# Optimizer: plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
 
# Run training only when executed as a script. The original note says this
# guard prevents errors on Windows.
if __name__ == '__main__':
    # Mini-batch training uses nested loops: the outer loop walks the epochs
    # (the whole dataset is visited 100 times), the inner loop walks batches.
    for epoch in range(100):
        # ---- training pass ----
        model.train()
        size = len(train_loader.dataset)
        # Each item from the loader is an (x, y) pair for one mini-batch;
        # enumerate supplies the batch index used for progress logging.
        for i, (x, y) in enumerate(train_loader):
            x = x.float()
            y = y.float()
            # Drop the trailing size-1 dimension so predictions line up
            # with the 1-D label tensor.
            y_pred = model(x).squeeze(-1)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                loss_value, current = loss.item(), i * len(x)
                print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

        # ---- validation pass ----
        model.eval()
        size = len(valid_loader.dataset)
        num_batches = len(valid_loader)
        valid_loss, correct = 0, 0
        with torch.no_grad():
            for x, y in valid_loader:
                x = x.float()
                y = y.float()
                pred = model(x).squeeze(-1)
                # Outputs strictly above 0.5 count as class 1; the whole
                # batch is thresholded in a single tensor comparison.
                y_res = (pred > 0.5).float()
                valid_loss += criterion(pred, y).item()
                correct += (y_res == y).type(torch.float).sum().item()
        # Average the per-batch losses and turn the hit count into a ratio.
        valid_loss /= num_batches
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {valid_loss:>8f} \n")
# Load the held-out test set and encode the selected feature columns.
test_data = pd.read_csv('test.csv')
feature = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
# Cast explicitly to float32: pandas >= 2.0 emits bool dummy columns, and a
# frame mixing bool and numeric columns converts to an object array that
# torch.from_numpy cannot handle.
test_features = pd.get_dummies(test_data[feature])
test = torch.from_numpy(test_features.to_numpy(dtype=np.float32, copy=True))
# NOTE(review): the encoded width must equal the model's input size. The five
# columns above are unlikely to expand to the 11 inputs the model is built
# with by default - confirm that training and test features line up.
expected_width = model.linear1.in_features
if test.shape[1] != expected_width:
    raise ValueError(
        f"test features have {test.shape[1]} columns after encoding, "
        f"but the model expects {expected_width}"
    )
y = model.test(test.float())

# Write the predictions out
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y})
output.to_csv('my_predict.csv', index=False)

loss: 0.707095  [    0/ 6954]
loss: 0.701680  [ 3200/ 6954]
loss: 0.685630  [ 6400/ 6954]
Test Error: 
 Accuracy: 55.5%, Avg loss: 0.689319 

loss: 0.695861  [    0/ 6954]
loss: 0.698382  [ 3200/ 6954]
loss: 0.689580  [ 6400/ 6954]
Test Error: 
 Accuracy: 59.5%, Avg loss: 0.683768 

loss: 0.695649  [    0/ 6954]
loss: 0.676761  [ 3200/ 6954]
loss: 0.666592  [ 6400/ 6954]
Test Error: 
 Accuracy: 61.6%, Avg loss: 0.677925 

loss: 0.683072  [    0/ 6954]
loss: 0.666905  [ 3200/ 6954]
loss: 0.696212  [ 6400/ 6954]
Test Error: 
 Accuracy: 67.3%, Avg loss: 0.671709 

loss: 0.666471  [    0/ 6954]
loss: 0.662152  [ 3200/ 6954]
loss: 0.657031  [ 6400/ 6954]
Test Error: 
 Accuracy: 68.3%, Avg loss: 0.664348 

loss: 0.672369  [    0/ 6954]
loss: 0.673547  [ 3200/ 6954]
loss: 0.660300  [ 6400/ 6954]
Test Error: 
 Accuracy: 69.1%, Avg loss: 0.657152 

loss: 0.659226  [    0/ 6954]
loss: 0.650813  [ 3200/ 6954]
loss: 0.620361  [ 6400/ 6954]
Test Error: 
 Accuracy: 69.4%, Avg loss: 0.647893 

loss: 