## PyTorch实现Titanic生存预测

### 1. 获取数据集

In [50]:
import pandas as pd
train_data = pd.read_csv("../../../datas/titanic/train.csv")
test_data = pd.read_csv("../../../datas/titanic/test.csv")

In [51]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [52]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### 2. 数据预处理

In [53]:
train_data.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [54]:
train_data["Embarked"].isnull().sum()

2

In [55]:
data = pd.crosstab(train_data["PassengerId"],train_data["Embarked"])
data

Embarked,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,1
2,1,0,0
3,0,0,1
4,0,0,1
5,0,0,1
...,...,...,...
887,0,0,1
888,0,0,1
889,0,0,1
890,1,0,0


In [56]:
(train_data["Embarked"]=="S").sum()

644

In [57]:
train_data["Age"].isnull().sum()

177

In [58]:
from torch.utils.data import Dataset
import torch
from sklearn.feature_extraction import DictVectorizer
data = train_data[["Pclass","Sex","Embarked","Parch","Age"]]
data = data.to_dict(orient="records")
transer = DictVectorizer(sparse=False)
data = transer.fit_transform(data)
transer.get_feature_names()



['Age',
 'Embarked',
 'Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Parch',
 'Pclass',
 'Sex=female',
 'Sex=male']

In [59]:
data

array([[22.,  0.,  0., ...,  3.,  0.,  1.],
       [38.,  0.,  1., ...,  1.,  1.,  0.],
       [26.,  0.,  0., ...,  3.,  1.,  0.],
       ...,
       [nan,  0.,  0., ...,  3.,  1.,  0.],
       [26.,  0.,  1., ...,  1.,  0.,  1.],
       [32.,  0.,  0., ...,  3.,  0.,  1.]])

In [60]:
train_data["Age"].fillna(train_data["Age"].mean(),inplace=True)
train_data["Embarked"].fillna("S",inplace=True)

In [61]:
from torch.utils.data import Dataset
import torch
from sklearn.feature_extraction import DictVectorizer

class MyData(Dataset):
    # 初始化方法
    def __init__(self,data):
        self.target = torch.tensor(data["Survived"].values,dtype=torch.float)
        data = data[["Pclass","Sex","Embarked","Parch","Age"]]
        data = data.to_dict(orient="records")
        transer = DictVectorizer(sparse=False)
        data = transer.fit_transform(data)
        data = torch.tensor(data,dtype=torch.float)
        self.x_data = data
    
    # 实列通过[]运算符取值时调用
    def __getitem__(self,idx):
        idx_data = self.x_data[idx]
        target = self.target[idx]
        return idx_data,target
    
    # 要获取有多少个元素，用 len() 函数
    def __len__(self):
        return len(self.x_data)

In [62]:
train_data.shape

(891, 12)

In [63]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(train_data,test_size=0.12,random_state=8)
train = MyData(train)
test = MyData(test)

### 3. 加载数据

In [64]:
from torch.utils.data import DataLoader
train_loader = DataLoader(
    dataset = train,
    batch_size = 32,   # 每次从中取32个数据集打包
    shuffle = True,   # 不按顺序取
    num_workers = 0,  # 设置进程数 0:核心进程
    drop_last = False,
    pin_memory = True
)
test_loader = DataLoader(
    dataset = test,
    batch_size = 32,   # 每次从中取四个数据集打包
    shuffle = True,   # 不按顺序取
    num_workers = 0,  # 设置进程数 0:核心进程
    drop_last = False,
    pin_memory = True
)

### 4. 构建模型

In [65]:
import torch.nn as nn

class Model(nn.Module):
    def __init__(self):
        super(Model,self).__init__()
        self.model = nn.Sequential(
            nn.Linear(8,4),
            nn.Linear(4,1),
            nn.Sigmoid()
        )
    def forward(self,x):
        x = self.model(x)
        return x
    
    def predict(self,x):
        x = self.model(x)
        y = []
        for i in x:
            if i>0.5:
                y.append(1)
            else:
                y.append(0)
        return torch.tensor(y)

### 5. 损失函数

In [66]:
device = ("cuda" if torch.cuda.is_available() else "cpu")
loss_fn = torch.nn.CrossEntropyLoss()
loss_fn.to(device)

CrossEntropyLoss()

### 6. 优化器

In [67]:
model = Model()
model.to(device)
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

### 7 训练

In [68]:
for futures,targets in test_loader:
    outputs = model(futures.to(device))
    outputs = outputs.view((-1))
    # print(futures.to(device).shape)
    loss = loss_fn(outputs,targets.to(device))
    # print((targets == torch.heaviside(outputs)).sum())
    print(model.predict(futures.to(device)))

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [69]:
epoch = 100
test_data_size = test.__len__()
for i in range(epoch):
    print("-------------第{}轮训练开始------------".format(i+1))
    # 训练步骤开始
    model.train()
    for futures,targets in train_loader:
        futurs = futures.to(device)
        targets = targets.to(device)
        outputs = model(futurs)
        outputs = outputs.view((-1))
        loss = loss_fn(outputs,targets)
        
        # 优化器优化模型
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("训练轮数：{},loss:{}".format(i,loss.item()))
    # 测试步骤
    model.eval()
    total_test_loss = 0
    total_accuracy = 0
    with torch.no_grad():
        for futures,targets in test_loader:
            futures = futures.to(device)
            targets = targets.to(device)
            outputs = model(futures)
            outputs = outputs.view((-1))
            loss = loss_fn(outputs,targets)
            total_test_loss += loss.item()
            accuracy = (targets == torch.tensor(model.predict(futures)).to(device)).sum()
            total_accuracy+=accuracy
    print("整体测试集上的loss:{}".format(total_test_loss))
    print("整体测试集上的正确率:{}".format(total_accuracy/test_data_size))
    
    if i == epoch-1:
        torch.save(model.state_dict(),"./models/titanic_model.pth")
        print("模型已保存")

-------------第1轮训练开始------------
训练轮数：0,loss:13.905879974365234
整体测试集上的loss:127.75838375091553
整体测试集上的正确率:0.6355140209197998
-------------第2轮训练开始------------
训练轮数：1,loss:16.507339477539062
整体测试集上的loss:127.87203121185303
整体测试集上的正确率:0.6728971600532532
-------------第3轮训练开始------------


  accuracy = (targets == torch.tensor(model.predict(futures)).to(device)).sum()


训练轮数：2,loss:7.427346706390381
整体测试集上的loss:125.73683834075928
整体测试集上的正确率:0.7196261286735535
-------------第4轮训练开始------------
训练轮数：3,loss:12.776372909545898
整体测试集上的loss:123.9265365600586
整体测试集上的正确率:0.6822429895401001
-------------第5轮训练开始------------
训练轮数：4,loss:16.08531951904297
整体测试集上的loss:126.48461294174194
整体测试集上的正确率:0.7102803587913513
-------------第6轮训练开始------------
训练轮数：5,loss:14.928171157836914
整体测试集上的loss:122.4650707244873
整体测试集上的正确率:0.7102803587913513
-------------第7轮训练开始------------
训练轮数：6,loss:21.070459365844727
整体测试集上的loss:126.9177474975586
整体测试集上的正确率:0.7383177280426025
-------------第8轮训练开始------------
训练轮数：7,loss:11.438263893127441
整体测试集上的loss:123.88616371154785
整体测试集上的正确率:0.7196261286735535
-------------第9轮训练开始------------
训练轮数：8,loss:11.816259384155273
整体测试集上的loss:127.6847128868103
整体测试集上的正确率:0.7196261286735535
-------------第10轮训练开始------------
训练轮数：9,loss:18.07278823852539
整体测试集上的loss:128.43574285507202
整体测试集上的正确率:0.7196261286735535
-------------第11轮训练开始------------
训练轮数：

### 8. 预测

In [70]:
model = Model()
model_status = torch.load("./models/titanic_model.pth")
model.load_state_dict(model_status)

<All keys matched successfully>

In [71]:
test[5:20][:]

(tensor([[32.0000,  0.0000,  0.0000,  1.0000,  0.0000,  2.0000,  0.0000,  1.0000],
         [27.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
         [22.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,  1.0000,  0.0000],
         [36.0000,  0.0000,  0.0000,  1.0000,  0.0000,  2.0000,  0.0000,  1.0000],
         [42.0000,  0.0000,  0.0000,  1.0000,  0.0000,  2.0000,  1.0000,  0.0000],
         [21.0000,  0.0000,  0.0000,  1.0000,  0.0000,  3.0000,  0.0000,  1.0000],
         [ 0.8300,  0.0000,  0.0000,  1.0000,  2.0000,  2.0000,  0.0000,  1.0000],
         [28.0000,  0.0000,  0.0000,  1.0000,  0.0000,  3.0000,  0.0000,  1.0000],
         [40.0000,  0.0000,  0.0000,  1.0000,  0.0000,  3.0000,  1.0000,  0.0000],
         [29.6991,  1.0000,  0.0000,  0.0000,  1.0000,  3.0000,  0.0000,  1.0000],
         [25.0000,  0.0000,  0.0000,  1.0000,  0.0000,  3.0000,  0.0000,  1.0000],
         [20.0000,  0.0000,  0.0000,  1.0000,  0.0000,  3.0000,  0.0000,  1.0000],
    

In [72]:
model.predict(test[5:20][0])

tensor([0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0])