In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt

# 读入文件

In [2]:
# Read the test and train splits from CSV files in the working directory.
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")
# Preview the first four rows of the training frame.
train_data.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [3]:
# Drop the free-text passenger name from both splits.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError.
train_data = train_data.drop(columns=['Name'], errors='ignore')
test_data = test_data.drop(columns=['Name'], errors='ignore')
train_data.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S


In [4]:
col_name = train_data.columns.tolist()
print(col_name)
# Move the label to the last column so the features can be sliced as iloc[:, 1:-1].
# Popping and re-assigning always appends at the end, so no hard-coded column
# position is needed.
train_data['Survived'] = train_data.pop('Survived')
train_data.head(4)

['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,female,35.0,1,0,113803,53.1,C123,S,1


In [5]:
# Feature columns only: skip PassengerId (first column) and Survived (last column).
train_data.iloc[:, 1:-1].head(4)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,female,35.0,1,0,113803,53.1,C123,S


In [6]:
# Everything after the first column (PassengerId) of the test split.
test_data.iloc[:, 1:].head(4)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,34.5,0,0,330911,7.8292,,Q
1,3,female,47.0,1,0,363272,7.0,,S
2,2,male,62.0,0,0,240276,9.6875,,Q
3,3,male,27.0,0,0,315154,8.6625,,S


In [7]:
# Stack train and test feature columns row-wise so both splits go through
# the same preprocessing and end up with identical columns.
all_features = pd.concat(
    [train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]],
    axis=0,
)

In [8]:
# Sanity check: (rows of train + test, number of raw feature columns).
all_features.shape

(1309, 9)

In [9]:
# Every column whose dtype is not 'object' is treated as numeric.
numeric_mask = (all_features.dtypes != 'object').to_numpy()
numeric_features = all_features.columns[numeric_mask]
numeric_block = all_features[numeric_features]
# Z-score each numeric column, then fill missing entries with 0, which is the
# column mean once the data has been centred.
all_features[numeric_features] = (
    (numeric_block - numeric_block.mean()) / numeric_block.std()
).fillna(0)

In [10]:
# One-hot encode the remaining object columns; dummy_na=True adds an indicator
# column for missing values.
# dtype=float keeps the frame purely numeric: pandas >= 2.0 emits bool dummies
# by default, and a frame mixing bool with float columns yields an object
# array from `.values`, which torch.tensor cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)

In [11]:
# The first n_train rows of all_features came from the training split.
n_train = len(train_data)
train_features = torch.tensor(all_features.iloc[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[n_train:].values, dtype=torch.float32)
# Labels are stored as a float column vector with one row per training sample.
train_labels = torch.tensor(train_data['Survived'].values, dtype=torch.float32).view(-1, 1)

In [12]:
# (number of training samples, number of features after one-hot encoding).
train_features.shape

torch.Size([891, 1129])

In [13]:
# Labels are a column vector: one row per training sample.
train_labels.shape

torch.Size([891, 1])

# 定义连接网络

In [23]:
def Linear(num_features,drop_prob):
    """Build a single-layer linear classifier that outputs two logits.

    Args:
        num_features: Number of input features per sample.
        drop_prob: Currently unused; accepted so callers can pass it through.

    Returns:
        An ``nn.Sequential`` wrapping one ``nn.Linear(num_features, 2)`` whose
        weight and bias are both re-initialised from a normal distribution
        with mean 0 and std 0.01.
    """
    layer = nn.Linear(num_features, 2)
    # Weight first, then bias: the same order in which parameters() yields them.
    nn.init.normal_(layer.weight, mean=0, std=0.01)
    nn.init.normal_(layer.bias, mean=0, std=0.01)
    return nn.Sequential(layer)

# 准确率

In [15]:
def evaluate_accuracy(data_iter, net,device=torch.device('cpu')):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    Args:
        data_iter: Iterable of ``(X, y)`` batches. ``y`` holds class indices and
            is flattened to one label per row of ``X``.
        net: Model mapping ``X`` to per-class scores of shape (batch, classes).
        device: Device each batch is moved to before the forward pass.

    Returns:
        Accuracy as a Python float in [0, 1]; 0.0 if ``data_iter`` yields no
        samples (instead of dividing by zero).

    Note:
        ``net`` is switched to eval mode and left in that mode on return.
    """
    correct, total = 0, 0
    net.eval()
    with torch.no_grad():
        for X, y in data_iter:
            X = X.to(device)
            # Flatten to 1-D so the comparison is element-wise even when the
            # batch holds a single sample.
            y = y.to(device).long().reshape(-1)
            predicted = torch.argmax(net(X), dim=1)
            correct += (predicted == y).sum().item()
            total += y.shape[0]
    if total == 0:
        return 0.0
    return correct / total

# 定义训练模型

In [20]:
def train(net, train_features, train_labels, test_features, test_labels, 
                      num_epochs, lr , weight_decay,batch_size,criterion):
    """Train ``net`` with mini-batch SGD and report epoch-averaged accuracy.

    Args:
        net: Model producing per-class scores of shape (batch, classes).
        train_features: Training inputs, one row per sample.
        train_labels: Training class labels, one per sample (cast to long here).
        test_features: Validation inputs, or ``None`` to skip validation.
        test_labels: Validation labels; only used when ``test_features`` is given.
        num_epochs: Number of passes over the training data.
        lr: SGD learning rate.
        weight_decay: SGD weight decay.
        batch_size: Mini-batch size for both loaders.
        criterion: Loss called as ``criterion(scores, long_targets)``.

    Returns:
        ``(train_acc, test_acc)``: two one-element float tensors holding the
        accuracy averaged over all epochs (not the final-epoch value).
        ``test_acc`` is 0 when no validation data is supplied.
    """
    optimizer = torch.optim.SGD(params=net.parameters(), lr=lr, weight_decay=weight_decay)
    train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    train_iter = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle=True)
    test_iter = None
    if test_features is not None:
        test_dataset = torch.utils.data.TensorDataset(test_features, test_labels)
        # Accuracy does not depend on sample order, so evaluation is not shuffled.
        test_iter = torch.utils.data.DataLoader(test_dataset, batch_size, shuffle=False)
    net = net.float()
    train_acc_sum = torch.tensor([0.0], dtype=torch.float32)
    test_acc_sum = torch.tensor([0.0], dtype=torch.float32)
    for _ in range(num_epochs):
        # evaluate_accuracy leaves the net in eval mode, so re-enable training
        # mode at the start of every epoch.
        net.train()
        train_acc = torch.tensor([0.0], dtype=torch.float32)
        n = 0
        for x, y in train_iter:
            # reshape(-1) keeps the target 1-D even for a final batch of size 1;
            # squeeze() would collapse it to a 0-D tensor, which the loss
            # rejects against (1, classes) scores.
            target = y.long().reshape(-1)
            y_hat = net(x)
            l = criterion(y_hat, target)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            with torch.no_grad():
                # Accuracy uses the scores computed before this step's update.
                train_acc += (torch.argmax(y_hat, dim=1) == target).sum().float()
                n += target.shape[0]
        if test_iter is not None:
            test_acc_sum += evaluate_accuracy(test_iter, net)
        train_acc_sum += train_acc / n
    return train_acc_sum / num_epochs, test_acc_sum / num_epochs

# K折

In [17]:
def get_k_fold(i,k,train_features,train_labels):
    """Split the data into ``k`` contiguous folds and hold out fold ``i``.

    Each fold has ``train_features.shape[0] // k`` rows; any remainder rows at
    the end of the data are not assigned to a fold.

    Args:
        i: Index of the fold used for validation, ``0 <= i < k``.
        k: Number of folds, must be greater than 1.
        train_features: 2-D tensor of inputs, one row per sample.
        train_labels: Tensor of labels whose first dimension matches the inputs.

    Returns:
        ``(features_train, labels_train, features_valid, labels_valid)`` where
        the training tensors concatenate the other ``k - 1`` folds in order.

    Raises:
        ValueError: If ``i`` is not a valid fold index.
    """
    assert k>1
    if not 0 <= i < k:
        raise ValueError('fold index i must satisfy 0 <= i < k, got i=%r, k=%r' % (i, k))
    fold_size = train_features.shape[0] // k
    feature_parts, label_parts = [], []
    features_valid, labels_valid = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        features_part, labels_part = train_features[idx, :], train_labels[idx]
        if j == i:
            features_valid, labels_valid = features_part, labels_part
        else:
            feature_parts.append(features_part)
            label_parts.append(labels_part)
    # Concatenate once at the end instead of growing the tensors inside the loop.
    features_train = torch.cat(feature_parts, dim=0)
    labels_train = torch.cat(label_parts, dim=0)
    return features_train,labels_train, features_valid,labels_valid
            

In [18]:
def k_fold(k, train_features, train_labels, num_epochs, lr , weight_decay,batch_size,criterion,drop_prob):
    """Run k-fold cross-validation, training a fresh model on every fold.

    For each fold a new ``Linear`` model is built, trained with ``train`` on
    the other folds, and its train/validation accuracy is printed.

    Args:
        k: Number of folds.
        train_features: Full training inputs, one row per sample.
        train_labels: Full training labels.
        num_epochs, lr, weight_decay, batch_size, criterion: Forwarded to ``train``.
        drop_prob: Forwarded to ``Linear``.

    Returns:
        None; results are only printed.
    """
    for i in range(k):
        data = get_k_fold(i, k, train_features, train_labels)
        net = Linear(train_features.shape[1], drop_prob)
        train_acc, test_acc = train(net, *data, num_epochs, lr, weight_decay, batch_size, criterion)
        # train() returns accuracies (one-element tensors), so label them as
        # accuracy rather than RMSE.
        print('fold %d, train acc %f, valid acc %f' % (i, train_acc[-1], test_acc[-1]))

In [26]:
# Hyperparameters shared by cross-validation and the final fit.
k = 10
num_epochs = 100
lr = 0.01
weight_decay = 0
batch_size = 64
drop_prob = 0.9  # forwarded to Linear(), which does not use it at present
# Seed PyTorch so weight initialisation and batch shuffling are reproducible.
torch.manual_seed(0)
criterion = nn.CrossEntropyLoss()
k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size, criterion, drop_prob)

fold 0, train rmse 0.792971, valid rmse 0.740562
fold 1, train rmse 0.788789, valid rmse 0.819325
fold 2, train rmse 0.794956, valid rmse 0.770674
fold 3, train rmse 0.790362, valid rmse 0.749551
fold 4, train rmse 0.790362, valid rmse 0.782584
fold 5, train rmse 0.792747, valid rmse 0.783820
fold 6, train rmse 0.793133, valid rmse 0.758315
fold 7, train rmse 0.794844, valid rmse 0.765505
fold 8, train rmse 0.790337, valid rmse 0.821123
fold 9, train rmse 0.787690, valid rmse 0.834944


In [33]:
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr , weight_decay,batch_size,criterion,drop_prob):
    """Fit on the full training set, predict the test set and write ``./submission.csv``.

    Args:
        train_features: Training inputs, one row per sample.
        test_features: Test inputs, one row per row of ``test_data``.
        train_labels: Training labels.
        test_data: Test DataFrame containing a ``PassengerId`` column.
        num_epochs, lr, weight_decay, batch_size, criterion: Forwarded to ``train``.
        drop_prob: Forwarded to ``Linear``.

    Side effects:
        Prints the training accuracy, adds (or overwrites) a ``Survived``
        column on ``test_data``, and writes ``PassengerId`` and ``Survived``
        to ``./submission.csv``.
    """
    net = Linear(train_features.shape[1], drop_prob)
    train_acc, _ = train(net, train_features, train_labels, None, None,
                         num_epochs, lr, weight_decay, batch_size, criterion)
    # train() returns accuracy, so label it as accuracy rather than RMSE.
    print(' train acc %f' % (train_acc[-1]))
    net.eval()
    with torch.no_grad():
        preds = net(test_features).argmax(dim=1).numpy()
    # Assign the raw array (positional) instead of a new Series, so predictions
    # line up with the rows even if test_data does not have a 0..n-1 index.
    test_data['Survived'] = preds
    submission = test_data[['PassengerId', 'Survived']]
    submission.to_csv('./submission.csv', index=False)

In [34]:
# Final fit on all training data; writes the submission file.
train_and_pred(
    train_features=train_features,
    test_features=test_features,
    train_labels=train_labels,
    test_data=test_data,
    num_epochs=num_epochs,
    lr=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
    criterion=criterion,
    drop_prob=drop_prob,
)

 train rmse 0.792727
