In [2]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
import torch

In [3]:
class MyDataset(Dataset):
    def __init__(self, filepath):
        features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']
        data = pd.read_csv(filepath)
        self.len = data.shape[0]
        
        #data[features]的类型是DataFrame,先进行one-hot编码，然后转成array,最后转成tensor用于矩阵运算
        self.x_data = torch.from_numpy(pd.get_dummies(data[features])).float()
        self.y_data = torch.from_numpy(np.array(data["Survived"]))
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len

In [4]:
train_dataset = MyDataset('./data/train.csv')

TypeError: expected np.ndarray (got DataFrame)

In [5]:
train_dataset = pd.read_csv('./data/train.csv')
test_dataset = pd.read_csv('./data/test.csv')
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
train_dataset['Ticket']

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

In [7]:
train_dataset['Ticket'][2]

'STON/O2. 3101282'

In [8]:
train_dataset['Ticket'][2].split()

['STON/O2.', '3101282']

In [9]:
def preprocess(dataset):
    dataset = dataset.copy()
    
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
    
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "None"
        return "_".join(items[0:-1])
    
    dataset['Name'] = dataset["Name"].apply(normalize_name)
    dataset['Ticket_number'] = dataset['Ticket'].apply(ticket_number)
    dataset['Ticket_item'] = dataset['Ticket'].apply(ticket_item)
    return  dataset

preprocessed_train_df = preprocess(train_dataset)
preprocessed_test_df = preprocess(test_dataset)
preprocessed_train_df.head()
    

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,


In [10]:
preprocessed_train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,
5,6,0,3,Moran Mr James,male,,0,0,330877,8.4583,,Q,330877,
6,7,0,1,McCarthy Mr Timothy J,male,54.0,0,0,17463,51.8625,E46,S,17463,
7,8,0,3,Palsson Master Gosta Leonard,male,2.0,3,1,349909,21.075,,S,349909,
8,9,1,3,Johnson Mrs Oscar W Elisabeth Vilhelmina Berg,female,27.0,0,2,347742,11.1333,,S,347742,
9,10,1,2,Nasser Mrs Nicholas Adele Achem,female,14.0,1,0,237736,30.0708,,C,237736,


In [11]:
input_features = list(preprocessed_train_df.columns)
input_features.remove('Ticket')
input_features.remove('PassengerId')
input_features.remove('Survived')

print(f'Input features: {input_features}')

Input features: ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_number', 'Ticket_item']


In [12]:
preprocessed_train_df.type

AttributeError: 'DataFrame' object has no attribute 'type'

In [13]:
torch.from_numpy(np.array(preprocessed_train_df['Survived']))

tensor([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
        0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
        0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
        1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
        1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,

In [14]:
label = preprocessed_train_df['Survived']
features = ['Pclass', 'SibSp', 'Parch', 'Fare']
inputs = preprocessed_train_df[features]


In [15]:
y = torch.from_numpy(np.array(label.values)).float()
x = torch.from_numpy(np.array(inputs.values)).float()

In [16]:
import torch.nn as nn
import torch.nn.functional as F

In [17]:
class Titanic_num(nn.Module):
    def __init__(self):
        super(Titanic_num, self).__init__()
        self.linear1 = nn.Linear(4, 2)
        self.linear2 = nn.Linear(2, 1)
        
    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.sigmoid(x)
        return x.squeeze(-1)
        

In [18]:
import matplotlib.pyplot as plt

In [19]:
# epoch_list = []
# l_list = []
# 
# model = Titanic_num()
# #loss = nn.BCELoss(reduction='sum')
# criterion = torch.nn.BCELoss(reduction='sum')
# optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
# model.train()
# for epoch in range(100):
#     optimizer.zero_grad()
#     pred_y = model(x)
#     loss = criterion(pred_y, y)
#     epoch_list.append(epoch)
#     l_list.append(loss.item())
#     loss.backward()
#     optimizer.step()
#     
# plt.plot(np.array(epoch_list), np.array(l_list))
    

In [None]:
epoch_list = []
l_list = []

device = torch.device('cuda')
x = x.to(device)
y = y.to(device)

model = Titanic_num().to(device)
#loss = nn.BCELoss(reduction='sum')
criterion = torch.nn.BCELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)


model.train()
for epoch in range(100):
    optimizer.zero_grad()
    pred_y = model(x)
    loss = criterion(pred_y, y)
    epoch_list.append(epoch)
    l_list.append(loss.item())
    loss.backward()
    optimizer.step()

plt.plot(np.array(epoch_list), np.array(l_list))

[<matplotlib.lines.Line2D at 0x1a4793a1a90>]

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
df_sub = pd.read_csv('./data/gender_submission.csv')

In [2]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


数据处理
去除无用特征(如何判断无用特征)
将有用的特征，其中的非数值型特征进行编码以便训练
处理数据中缺失值的部分
（用于机器学习时，可将数据归一化）
最后将特征，目标取出，数据集X，y提取出

In [3]:
df_train.drop(['Name', 'Ticket', "Cabin"], axis=1, inplace=True)
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [4]:
df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [5]:
sex = pd.get_dummies(df_train['Sex'], drop_first=True)

In [6]:
embark = pd.get_dummies(df_train['Embarked'], drop_first=True)

In [7]:
test_sex = pd.get_dummies(df_train['Sex'], drop_first=False)

In [8]:
df_train = pd.concat([df_train, sex, embark], axis=1)

In [9]:
sex = pd.get_dummies(df_test['Sex'], drop_first=True)
embark = pd.get_dummies(df_test['Embarked'], drop_first=True)
df_test = pd.concat([df_test, sex, embark], axis=1)

In [10]:
df_test.drop(['Sex', 'Embarked'], axis=1, inplace=True)

In [11]:
df_train.drop(['Sex', 'Embarked'], axis=1, inplace=True)

In [12]:
df_train.fillna(df_train.mean(), inplace=True)

In [13]:
df_test.fillna(df_test.mean(), inplace=True)

In [14]:
Scaler1 = StandardScaler()
Scaler2 = StandardScaler()

In [15]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'male', 'Q', 'S'],
      dtype='object')

In [16]:
train_columns = df_train.columns

In [17]:
test_columns = df_test.columns

In [18]:
df_train = pd.DataFrame(Scaler1.fit_transform(df_train))

In [19]:
df_test = pd.DataFrame(Scaler2.fit_transform(df_test))

In [20]:
df_train.columns = train_columns
df_test.columns = test_columns

In [22]:
features = df_train.iloc[:, 2:].columns.tolist()
# 所有行，第三到最后列

In [23]:
traget = df_train.loc[:, 'Survived'].name

In [24]:
X_train = df_train.iloc[:,2:].values
y_train = df_train.loc[:, 'Survived'].values

In [25]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable

In [26]:
device = torch.device('cuda')

In [29]:
X_train = X_train.to(device)
y_train = y_train.to(device)

AttributeError: 'numpy.ndarray' object has no attribute 'to'

In [27]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(8, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512,2)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

model = Net().to(device)
print(model)

Net(
  (fc1): Linear(in_features=8, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [28]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)

In [30]:
batch_size = 64
n_epochs = 500
batch_no = len(X_train) // batch_size

train_loss = 0
train_loss_min = np.Inf

In [36]:
model = Net().to(device)

In [42]:
for epoch in range(n_epochs):
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        x_var = Variable(torch.FloatTensor(X_train[start:end]))
        y_var = Variable(torch.LongTensor(y_train[start:end]))
        
        y_np = torch.from_numpy(y_train)
        y_np = y_np.to(device)
        
        x_var = x_var.to(device)
        y_var = y_var.to(device)
        
        optimizer.zero_grad()
        output = model(x_var)
        loss = criterion(output, y_var)
        loss.backward()
        optimizer.step()
        
        values, labels = torch.max(output, 1)
        num_right = np.sum(labels.data.cpu().numpy() == y_np[start:end])
        train_loss += loss.item()*batch_size
    
    train_loss = train_loss / len(X_train)
    if train_loss <= train_loss_min:
        print("Validation loss decreased ({:6f} ===> {:6f}). Saving the model...".format(train_loss_min,train_loss))
        torch.save(model.state_dict(), 'model.pt')
        train_loss_min = train_loss
        
    if epoch % 200 == 0:
        print('')
        print("Epoch: {} \tTrain Loss: {} \tTrain Accuracy: {}".format(epoch+1, train_loss,num_right / len(y_train[start:end]) ))
print('Training Ended!')


Epoch: 1 	Train Loss: 0.6578910815363972 	Train Accuracy: 0.0

Epoch: 201 	Train Loss: 0.6576706799673702 	Train Accuracy: 0.0

Epoch: 401 	Train Loss: 0.6579394745212966 	Train Accuracy: 0.0
Training Ended!


In [34]:
batch_size = 64
n_epochs = 500
batch_no = len(X_train) // batch_size

train_loss = 0
train_loss_min = np.Inf
for epoch in range(n_epochs):
    for i in range(batch_no):
        start = i * batch_size
        end   = start + batch_size
        x_var = Variable(torch.FloatTensor(X_train[start:end]))
        y_var = Variable(torch.LongTensor(y_train[start:end]))

        optimizer.zero_grad()
        output = model(x_var)
        loss   = criterion(output,y_var)
        loss.backward()
        optimizer.step()

        values, labels = torch.max(output, 1)
        num_right   = np.sum(labels.data.numpy() == y_train[start:end])
        train_loss += loss.item()*batch_size

    train_loss = train_loss / len(X_train)
    if train_loss <= train_loss_min:
        print("Validation loss decreased ({:6f} ===> {:6f}). Saving the model...".format(train_loss_min,train_loss))
        torch.save(model.state_dict(), "model.pt")
        train_loss_min = train_loss

    if epoch % 200 == 0:
        print('')
        print("Epoch: {} \tTrain Loss: {} \tTrain Accuracy: {}".format(epoch+1, train_loss,num_right / len(y_train[start:end]) ))
print('Training Ended! ')

Validation loss decreased (   inf ===> 0.646480). Saving the model...

Epoch: 1 	Train Loss: 0.6464795370562175 	Train Accuracy: 0.0
Validation loss decreased (0.646480 ===> 0.646177). Saving the model...
Validation loss decreased (0.646177 ===> 0.644746). Saving the model...

Epoch: 201 	Train Loss: 0.6512526249814969 	Train Accuracy: 0.0
Validation loss decreased (0.644746 ===> 0.644517). Saving the model...

Epoch: 401 	Train Loss: 0.6513040466886463 	Train Accuracy: 0.0
Training Ended! 
