In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv("dataset/HR.csv")

In [3]:
data.part.unique() #part这一栏的所有出现过的值

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [4]:
data.groupby(["salary", "part"]).size()

salary  part       
high    IT               83
        RandD            51
        accounting       74
        hr               45
        management      225
        marketing        80
        product_mng      68
        sales           269
        support         141
        technical       201
low     IT              609
        RandD           364
        accounting      358
        hr              335
        management      180
        marketing       402
        product_mng     451
        sales          2099
        support        1146
        technical      1372
medium  IT              535
        RandD           372
        accounting      335
        hr              359
        management      225
        marketing       376
        product_mng     383
        sales          1772
        support         942
        technical      1147
dtype: int64

In [5]:
data = data.join(pd.get_dummies(data.salary))   #salary这一列转化成one hot编码（将"low" "high"等属性转化成010这种数值）
data = data.join(pd.get_dummies(data.part))
del data["salary"]
del data["part"]

In [6]:
data

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,high,low,...,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0.38,0.53,2,157,3,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0.80,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14995,0.37,0.48,2,160,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14996,0.37,0.53,2,143,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14997,0.11,0.96,6,280,4,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [7]:
data.left.value_counts()

0    11428
1     3571
Name: left, dtype: int64

In [8]:
#数据预处理
y_data = data.left.values.reshape(-1,1)
Y = torch.from_numpy(y_data).type(torch.FloatTensor)
x_data = data[[c for c in data.columns if c !="left"]].values
X = torch.from_numpy(x_data).type(torch.FloatTensor)

In [9]:
#创建模型
from torch import nn

自定义模型 

nn.module 继承这个类 

__init__ 初始化所有层 

forward 定义模型的运算过程（前向传播的过程）

In [10]:
import torch.nn.functional as F
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.liner_1 = nn.Linear(20, 64) #输入20个属性，输出64个属性
        self.liner_2 = nn.Linear(64, 64)
        self.liner_3 = nn.Linear(64, 1) #最终输出一个属性
        #self.relu = nn.ReLU() #激活层
        #self.sigmoid = nn.Sigmoid()
    def forward(self, input):
        x = F.relu(self.liner_1(input))
        x = F.relu(self.liner_2(x))
        x = torch.sigmoid(self.liner_3(x))
        return x

In [11]:
model = Model()
lr = 0.0001

In [12]:
def get_model():
    model = Model()
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    return model, opt

In [13]:
model, optim =  get_model()

In [14]:
loss_fn = nn.BCELoss()
batch = 64
no_of_batches = len(data)//batch
epochs = 100
for epoch in range(epochs):
    for i in range(no_of_batches):
        start = i*batch
        end = start + batch
        x = X[start:end]
        y = Y[start:end]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
    with torch.no_grad():
        print('epoch:', epoch, 'loss:', loss_fn(model(X),Y).data.item())

epoch: 0 loss: 0.6722943782806396
epoch: 1 loss: 0.6496067047119141
epoch: 2 loss: 0.6405868530273438
epoch: 3 loss: 0.6309906840324402
epoch: 4 loss: 0.621131420135498
epoch: 5 loss: 0.6248529553413391
epoch: 6 loss: 0.62132728099823
epoch: 7 loss: 0.6049251556396484
epoch: 8 loss: 0.5975252985954285
epoch: 9 loss: 0.5888492465019226
epoch: 10 loss: 0.5762665271759033
epoch: 11 loss: 0.5702469348907471
epoch: 12 loss: 0.567098081111908
epoch: 13 loss: 0.5644480586051941
epoch: 14 loss: 0.5764145851135254
epoch: 15 loss: 0.5662938952445984
epoch: 16 loss: 0.5607945919036865
epoch: 17 loss: 0.5647764801979065
epoch: 18 loss: 0.5604814887046814
epoch: 19 loss: 0.5593259334564209
epoch: 20 loss: 0.5604063272476196
epoch: 21 loss: 0.5610149502754211
epoch: 22 loss: 0.5611802935600281
epoch: 23 loss: 0.5615288615226746
epoch: 24 loss: 0.5618180632591248
epoch: 25 loss: 0.5621487498283386
epoch: 26 loss: 0.5623683333396912
epoch: 27 loss: 0.5626288056373596
epoch: 28 loss: 0.5671791434288025

In [15]:
loss_fn(model(X),Y)

tensor(0.5323, grad_fn=<BinaryCrossEntropyBackward>)

### Dataset类改写代码

In [16]:
from torch.utils.data import TensorDataset
HRdataset = TensorDataset(X, Y)
model, optim = get_model()
for epoch in range(epochs):
    for i in range(no_of_batches):
        x, y = HRdataset[i*batch: i*batch+batch]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
    with torch.no_grad():
        print('epoch:', epoch, 'loss:', loss_fn(model(X),Y).data.item())

epoch: 0 loss: 0.6750981211662292
epoch: 1 loss: 0.6760808229446411
epoch: 2 loss: 0.6654475331306458
epoch: 3 loss: 0.6556015014648438
epoch: 4 loss: 0.6449224352836609
epoch: 5 loss: 0.6266797780990601
epoch: 6 loss: 0.6162722110748291
epoch: 7 loss: 0.6237182021141052
epoch: 8 loss: 0.5998333692550659
epoch: 9 loss: 0.5928251147270203
epoch: 10 loss: 0.5866075754165649
epoch: 11 loss: 0.5811472535133362
epoch: 12 loss: 0.5862952470779419
epoch: 13 loss: 0.5814121961593628
epoch: 14 loss: 0.5751276612281799
epoch: 15 loss: 0.5712578892707825
epoch: 16 loss: 0.5680497288703918
epoch: 17 loss: 0.5657187700271606
epoch: 18 loss: 0.5638326406478882
epoch: 19 loss: 0.5624296069145203
epoch: 20 loss: 0.5615627765655518
epoch: 21 loss: 0.5608751773834229
epoch: 22 loss: 0.560674250125885
epoch: 23 loss: 0.5603113174438477
epoch: 24 loss: 0.5605241060256958
epoch: 25 loss: 0.560603678226471
epoch: 26 loss: 0.560169517993927
epoch: 27 loss: 0.5602093935012817
epoch: 28 loss: 0.560584068298339

### dataloader类

In [19]:
from torch.utils.data import DataLoader
HR_ds = TensorDataset(X,Y)
HR_dl = DataLoader(HR_ds, batch_size=batch, shuffle=True) #HR dataloader, shuffle:是否乱序
model, optim = get_model()
from torch.utils.data import TensorDataset
HRdataset = TensorDataset(X, Y)
model, optim = get_model()
for epoch in range(epochs):
    for x,y in HR_dl:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
    with torch.no_grad():
        print('epoch:', epoch, 'loss:', loss_fn(model(X),Y).data.item())

epoch: 0 loss: 0.5595817565917969
epoch: 1 loss: 0.5561066269874573
epoch: 2 loss: 0.5518569350242615
epoch: 3 loss: 0.5420863628387451
epoch: 4 loss: 0.5354121327400208
epoch: 5 loss: 0.5283897519111633
epoch: 6 loss: 0.5198034048080444
epoch: 7 loss: 0.5063390135765076
epoch: 8 loss: 0.4922177493572235
epoch: 9 loss: 0.48073360323905945
epoch: 10 loss: 0.4670585095882416
epoch: 11 loss: 0.45394015312194824
epoch: 12 loss: 0.44192153215408325
epoch: 13 loss: 0.43051043152809143
epoch: 14 loss: 0.41850537061691284
epoch: 15 loss: 0.40823349356651306
epoch: 16 loss: 0.40107664465904236
epoch: 17 loss: 0.40724125504493713
epoch: 18 loss: 0.3798242211341858
epoch: 19 loss: 0.3748375177383423
epoch: 20 loss: 0.3651047646999359
epoch: 21 loss: 0.3628564178943634
epoch: 22 loss: 0.35356998443603516
epoch: 23 loss: 0.34804242849349976
epoch: 24 loss: 0.3507224917411804
epoch: 25 loss: 0.33642080426216125
epoch: 26 loss: 0.3355002701282501
epoch: 27 loss: 0.32720696926116943
epoch: 28 loss: 0.

### 添加验证

#### 过拟合：对于训练数据过度拟合，对未知数据预测很差
#### 欠拟合：对于训练数据拟合不够，对未知数据预测很差

In [22]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x_data, y_data)

train_x = torch.from_numpy(train_x).type(torch.float32)
train_y = torch.from_numpy(train_y).type(torch.float32)
test_x = torch.from_numpy(test_x).type(torch.float32)
test_y = torch.from_numpy(test_y).type(torch.float32)

In [23]:
train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True) #HR dataloader

In [24]:
test_ds = TensorDataset(test_x, test_y)
test_dl = DataLoader(test_ds, batch_size=batch) #HR dataloader

#### 计算正确率 

y_pred = (y_pred > 0.5).type(torch.int32)  #signoid函数值大于0.5预测为1

(y_pred == labels).float().mean() #计算预测正确的概率，作为正确率

In [25]:
def accuracy(y_pred, y_true):
    y_pred = (y_pred > 0.5).type(torch.int32)
    acc = (y_pred == y_true).float().mean()
    return acc

In [26]:
model, optim = get_model()

for epoch in range(epochs):
    for x,y in train_dl:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
    with torch.no_grad():
        epoch_accuracy = accuracy(model(X), Y) #accuracy(预测的y, 真实的y)
        print('epoch:', epoch, 'loss:', loss_fn(model(X),Y).data.item(), 'accuracy:', epoch_accuracy.item())

epoch: 0 loss: 0.5608251690864563 accuracy: 0.7619174718856812
epoch: 1 loss: 0.5604106783866882 accuracy: 0.7619174718856812
epoch: 2 loss: 0.5589512586593628 accuracy: 0.7619174718856812
epoch: 3 loss: 0.5549231767654419 accuracy: 0.7619174718856812
epoch: 4 loss: 0.5524945855140686 accuracy: 0.7619174718856812
epoch: 5 loss: 0.5485906004905701 accuracy: 0.7619174718856812
epoch: 6 loss: 0.5447193384170532 accuracy: 0.7619174718856812
epoch: 7 loss: 0.5397374033927917 accuracy: 0.7619174718856812
epoch: 8 loss: 0.5353987216949463 accuracy: 0.7619174718856812
epoch: 9 loss: 0.5331559777259827 accuracy: 0.7619174718856812
epoch: 10 loss: 0.5289856791496277 accuracy: 0.7619174718856812
epoch: 11 loss: 0.524376392364502 accuracy: 0.7619174718856812
epoch: 12 loss: 0.5220655202865601 accuracy: 0.7619174718856812
epoch: 13 loss: 0.5115835666656494 accuracy: 0.7619174718856812
epoch: 14 loss: 0.5050650238990784 accuracy: 0.7619174718856812
epoch: 15 loss: 0.5013545751571655 accuracy: 0.7619