In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the HR dataset from a CSV file (path relative to the notebook) into a DataFrame.
data = pd.read_csv("dataset/HR.csv")

In [3]:
# Every distinct value that occurs in the "part" column.
data["part"].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [4]:
# Number of rows for each (salary, part) combination.
data.groupby(by=["salary", "part"]).size()

salary  part       
high    IT               83
        RandD            51
        accounting       74
        hr               45
        management      225
        marketing        80
        product_mng      68
        sales           269
        support         141
        technical       201
low     IT              609
        RandD           364
        accounting      358
        hr              335
        management      180
        marketing       402
        product_mng     451
        sales          2099
        support        1146
        technical      1372
medium  IT              535
        RandD           372
        accounting      335
        hr              359
        management      225
        marketing       376
        product_mng     383
        sales          1772
        support         942
        technical      1147
dtype: int64

In [5]:
# One-hot encode the categorical "salary" and "part" columns (e.g. "low"/"high"
# become separate 0/1 indicator columns) and drop the original text columns.
# dtype is pinned to uint8 because pandas >= 2.0 makes get_dummies return bool
# columns by default; mixed float/int/bool frames turn into an object array at
# `.values`, which torch.from_numpy cannot convert.
data = (
    data
    .join(pd.get_dummies(data["salary"], dtype=np.uint8))
    .join(pd.get_dummies(data["part"], dtype=np.uint8))
    .drop(columns=["salary", "part"])
)

In [6]:
# Display the current contents of the DataFrame as the cell output.
data

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,high,low,...,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0.38,0.53,2,157,3,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0.80,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14995,0.37,0.48,2,160,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14996,0.37,0.53,2,143,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14997,0.11,0.96,6,280,4,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# Frequency of each value in the "left" column (the prediction target).
data["left"].value_counts()

0    11428
1     3571
Name: left, dtype: int64

In [8]:
# Data preprocessing: split the frame into a target column ("left") and
# feature columns (everything else), then wrap both as float32 tensors.
# The arrays are cast to float32 in NumPy first so that mixed or bool column
# dtypes (which would otherwise yield an object array) cannot break
# torch.from_numpy.
y_data = data["left"].to_numpy(dtype=np.float32).reshape(-1, 1)  # column vector, one label per row
Y = torch.from_numpy(y_data)
feature_columns = [c for c in data.columns if c != "left"]
x_data = data[feature_columns].to_numpy(dtype=np.float32)
X = torch.from_numpy(x_data)

In [9]:
#创建模型
from torch import nn

自定义模型 

继承 nn.Module 这个类 

__init__ 初始化所有层 

forward 定义模型的运算过程（前向传播的过程）

In [10]:
import torch.nn.functional as F
class Model(nn.Module):
    """Fully connected binary classifier with two hidden ReLU layers.

    The network maps ``in_features`` inputs through two hidden layers of
    ``hidden_features`` units each and ends in a single sigmoid-activated
    output unit.

    Args:
        in_features: Number of input attributes per sample (default 20).
        hidden_features: Width of both hidden layers (default 64).
    """

    def __init__(self, in_features=20, hidden_features=64):
        super().__init__()
        # Attribute names keep their original spelling ("liner_*") so that
        # state_dict keys and any external attribute access stay unchanged.
        self.liner_1 = nn.Linear(in_features, hidden_features)
        self.liner_2 = nn.Linear(hidden_features, hidden_features)
        self.liner_3 = nn.Linear(hidden_features, 1)  # single output unit

    def forward(self, input):
        """Run the forward pass.

        Args:
            input: Tensor whose last dimension equals ``in_features``.

        Returns:
            Tensor with a last dimension of size 1 holding the sigmoid of the
            final linear layer's output.
        """
        x = F.relu(self.liner_1(input))
        x = F.relu(self.liner_2(x))
        x = torch.sigmoid(self.liner_3(x))
        return x

In [11]:
# Learning rate; get_model() reads this module-level value for its optimizer.
lr = 1e-4
# Instantiate the network.
model = Model()

In [12]:
def get_model():
    """Create a fresh ``Model`` and an Adam optimizer bound to its parameters.

    The learning rate is taken from the module-level ``lr`` variable.

    Returns:
        A ``(model, optimizer)`` tuple.
    """
    net = Model()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    return net, optimizer

In [13]:
# Build the network and its optimizer; this rebinds the global `model`.
model, optim =  get_model()

In [16]:
# Mini-batch training: binary cross-entropy on the model's sigmoid output,
# iterating over X/Y in consecutive slices of `batch` rows.
loss_fn = nn.BCELoss()
batch = 64
# Ceiling division so the final, shorter batch is trained on as well; floor
# division would silently skip the last len(X) % batch samples every epoch.
no_of_batches = (len(X) + batch - 1) // batch
epochs = 100
for epoch in range(epochs):
    for i in range(no_of_batches):
        start = i * batch
        end = start + batch
        # Slicing past the end of a tensor simply yields the remaining rows.
        x = X[start:end]
        y = Y[start:end]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optim.zero_grad()  # clear gradients accumulated by the previous step
        loss.backward()
        optim.step()
    # Report the loss over the whole dataset once per epoch; no_grad avoids
    # building an autograd graph for this evaluation-only pass.
    with torch.no_grad():
        print('epoch:', epoch, 'loss:', loss_fn(model(X), Y).item())

epoch: 0 loss: 0.4783909022808075
epoch: 1 loss: 0.4772498607635498
epoch: 2 loss: 0.4758020341396332
epoch: 3 loss: 0.47388550639152527
epoch: 4 loss: 0.4728475511074066
epoch: 5 loss: 0.4718479514122009
epoch: 6 loss: 0.4706517159938812
epoch: 7 loss: 0.46653684973716736
epoch: 8 loss: 0.46533700823783875
epoch: 9 loss: 0.4690420627593994
epoch: 10 loss: 0.4630567133426666
epoch: 11 loss: 0.46167898178100586
epoch: 12 loss: 0.46045204997062683
epoch: 13 loss: 0.4587278664112091
epoch: 14 loss: 0.45436036586761475
epoch: 15 loss: 0.453624427318573
epoch: 16 loss: 0.45227116346359253
epoch: 17 loss: 0.4490647614002228
epoch: 18 loss: 0.44730839133262634
epoch: 19 loss: 0.44479188323020935
epoch: 20 loss: 0.44312620162963867
epoch: 21 loss: 0.4409157931804657
epoch: 22 loss: 0.44043755531311035
epoch: 23 loss: 0.4372461140155792
epoch: 24 loss: 0.43821483850479126
epoch: 25 loss: 0.4334794878959656
epoch: 26 loss: 0.4338570833206177
epoch: 27 loss: 0.43247103691101074
epoch: 28 loss: 0.

In [17]:
# Loss of the trained model over the full dataset, shown as the cell output.
# NOTE(review): evaluated outside torch.no_grad(), so the result tracks gradients.
loss_fn(model(X),Y)

tensor(0.3616, grad_fn=<BinaryCrossEntropyBackward>)

In [None]:
from torch.utils.data import TensorDataset