In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch
import sklearn

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
?torch.nn.Sequential

[1;31mInit signature:[0m [0mtorch[0m[1;33m.[0m[0mnn[0m[1;33m.[0m[0mSequential[0m[1;33m([0m[1;33m*[0m[0margs[0m[1;33m:[0m [0mAny[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
A sequential container.
Modules will be added to it in the order they are passed in the constructor.
Alternatively, an ordered dict of modules can also be passed in.

To make it easier to understand, here is a small example::

    # Example of using Sequential
    model = nn.Sequential(
              nn.Conv2d(1,20,5),
              nn.ReLU(),
              nn.Conv2d(20,64,5),
              nn.ReLU()
            )

    # Example of using Sequential with OrderedDict
    model = nn.Sequential(OrderedDict([
              ('conv1', nn.Conv2d(1,20,5)),
              ('relu1', nn.ReLU()),
              ('conv2', nn.Conv2d(20,64,5)),
              ('relu2', nn.ReLU())
            ]))
[1;31mInit docstring:[0m Initializes internal Module state, shared by both nn.Module and Scrip

In [5]:
# 1. DataFrame에서 Object 데이터 타입 -> Int or Float으로 변환 
# 2. 결측값 채우기와 데이터 타입 바꾸기는 Imputer를 사용하지 말자
# 3. 범주화 (LabelEncoding) 시 결측값도 하나의 범주로 처리된다.
# 4. labelencoding 시 column 별로 진행
# ?함수명 혹은 ?변수명 을 하면 해당 설명이 나온다

In [6]:
# 랜덤 시드 설정
torch.manual_seed(1)
if device == "cuda" :
    torch.cuda.manual_seed_all(1)

In [7]:
# csv 데이터 로드
train = pd.read_csv("2021-ai-midterm-p4/train.csv")
test = pd.read_csv("2021-ai-midterm-p4/test.csv")
submission = pd.read_csv("2021-ai-midterm-p4/submit_sample.csv")

# 데이터 확인
print(train)
print(train.info())
print(train.describe())

       index  enrollee_id      city  city_development_index  gender  \
0          0         8949  city_103                   0.920    Male   
1          1        29725   city_40                   0.776    Male   
2          3        33241  city_115                   0.789     NaN   
3          4          666  city_162                   0.767    Male   
4          5        21651  city_176                   0.764     NaN   
...      ...          ...       ...                     ...     ...   
15321  19152        29754  city_103                   0.920  Female   
15322  19153         7386  city_173                   0.878    Male   
15323  19154        31398  city_103                   0.920    Male   
15324  19156         5756   city_65                   0.802    Male   
15325  19157        23834   city_67                   0.855     NaN   

           relevent_experience enrolled_university education_level  \
0      Has relevent experience       no_enrollment        Graduate   
1      

In [8]:
# 데이터 전처리
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# dataframe 이용하여 결측값 제거
# dataframe col 순회
# for col in train.columns:
#     if train[col].isnull().sum() != 0 : # 결측값이 있다면
#         #최빈값으로 처리
#         train[col] = train[col].fillna(train[col].mode()[0]) 
# for col in test.columns:
#     if test[col].isnull().sum() != 0:
#         test[col] = test[col].fillna(test[col].mode()[0])
        
# object -> int
for col in test.columns:
    if test[col].dtype == "object":
        test[col] = test[col].astype(str)
        train[col] = train[col].astype(str)
        label = pd.concat([test[col],train[col]],axis=0)
        le.fit(label)
        test[col]=le.transform(test[col])
        train[col]= le.transform(train[col])
        
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   index                   15326 non-null  int64  
 1   enrollee_id             15326 non-null  int64  
 2   city                    15326 non-null  int32  
 3   city_development_index  15326 non-null  float64
 4   gender                  15326 non-null  int32  
 5   relevent_experience     15326 non-null  int32  
 6   enrolled_university     15326 non-null  int32  
 7   education_level         15326 non-null  int32  
 8   major_discipline        15326 non-null  int32  
 9   experience              15326 non-null  int32  
 10  company_size            15326 non-null  int32  
 11  company_type            15326 non-null  int32  
 12  last_new_job            15326 non-null  int32  
 13  training_hours          15326 non-null  int64  
 14  target                  15326 non-null

In [9]:
# 사용하지 않은 컬럼을 제거하여 Train, Test 데이터 설정
x_train = np.array(train.drop(["index","enrollee_id","target"],axis=1))
x_test = np.array(test.drop(["index","enrollee_id"],axis=1))
y_train = np.array(train["target"])

In [10]:
# 데이터 전처리
# 값의 범위가 크므로 정규화가 필요

from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [11]:
# 데이터 모양 확인
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)

(15326, 12)
(15326,)
(3832, 12)


In [12]:
# 데이터 Tensor에 올리기
x_train = torch.Tensor(x_train).to(device)
x_test = torch.Tensor(x_test).to(device)
y_train = torch.Tensor(y_train).to(device)

In [13]:
# 레이어 생성
layer1 = torch.nn.Linear(x_train.shape[1],512,bias=True).to(device)
layer2 = torch.nn.Linear(512,128,bias=True).to(device)
layer3 = torch.nn.Linear(128,1,bias=True).to(device)
relu = torch.nn.ReLU().to(device)
dropout = torch.nn.Dropout(0.3).to(device)
sigmoid = torch.nn.Sigmoid()

In [14]:
# 레이어 가중치 초기화
torch.nn.init.xavier_normal_(layer1.weight)
torch.nn.init.xavier_normal_(layer2.weight)
torch.nn.init.xavier_normal_(layer3.weight)


Parameter containing:
tensor([[-7.2337e-02, -1.2721e-01,  1.0374e-01,  4.0329e-02,  5.9353e-02,
          9.2824e-02, -7.2548e-02,  2.3038e-01,  2.5395e-02,  3.0111e-02,
          6.0890e-02, -1.0951e-01,  7.5195e-02, -7.1403e-02, -1.5186e-01,
         -2.3052e-01,  1.0646e-01,  3.0491e-02, -1.8354e-02,  1.7995e-01,
         -1.0617e-01,  1.1650e-01, -1.2367e-01, -8.5444e-02, -8.1011e-02,
          7.0291e-02,  7.6303e-02,  2.4786e-01, -8.1748e-03, -2.6638e-01,
         -1.6572e-02, -8.6298e-02,  1.0499e-01,  3.1148e-02,  2.4495e-01,
         -9.0411e-02, -1.5275e-01,  2.4809e-02, -1.8261e-01, -8.3660e-02,
         -1.4109e-01,  1.4893e-01, -2.7282e-03, -3.4879e-03, -1.4359e-02,
          1.8971e-04,  9.8188e-02,  3.6810e-02,  1.3839e-01,  1.1777e-01,
         -9.9582e-02,  9.4888e-02,  1.2752e-01, -2.1283e-01, -4.2352e-02,
         -3.8950e-02,  1.7021e-01, -9.4779e-02, -2.2908e-02, -9.3018e-02,
          8.0548e-03,  1.5736e-01,  5.2474e-02,  3.5708e-02,  3.3047e-02,
          4.8953

In [15]:
# 모델 생성
model = torch.nn.Sequential(layer1, relu,dropout,
                            layer2, relu,dropout,
                           layer3,sigmoid).to(device)

In [16]:
# 학습 파라미터 설정
lr = 1e-2
epochs = 200

In [17]:
# 손실 함수, 최적화 함수 정의
loss = torch.nn.BCELoss().to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)

In [18]:
# 학습
model.train()
for epoch in range(epochs+1):
    optim.zero_grad()
    h = model(x_train)
    cost = loss(h,y_train.unsqueeze(1))
    cost.backward()
    optim.step()
    if epoch % (epochs/10) == 0:
        predict = h > torch.Tensor([0.5]).to(device)
        predict = predict.int()
        predict = predict == y_train.unsqueeze(1)
        acc = sum(predict)/len(y_train.unsqueeze(1))
        print(epoch, cost.item(),acc.item())

0 0.7350389361381531 0.3586062788963318
20 0.5167916417121887 0.7530340552330017
40 0.4883447289466858 0.7632781267166138
60 0.47883015871047974 0.7664100527763367
80 0.4713878035545349 0.7731306552886963
100 0.4661242961883545 0.7701292037963867
120 0.4622177481651306 0.7785462737083435
140 0.4593464434146881 0.7809604406356812
160 0.45651745796203613 0.7820696830749512
180 0.454098641872406 0.7838966250419617
200 0.45318660140037537 0.7846143841743469


In [19]:
with torch.no_grad():
    model.eval()
    predict = model(x_test)
    predict = predict >= torch.Tensor([0.5]).to(device)
    predict = predict.float()
submission["target"] = predict.cpu().detach().numpy()
print(submission)

      index  target
0         2     1.0
1         9     0.0
2        10     1.0
3        11     0.0
4        15     0.0
...     ...     ...
3827  19129     0.0
3828  19132     0.0
3829  19135     0.0
3830  19149     0.0
3831  19155     0.0

[3832 rows x 2 columns]


In [20]:
submission.to_csv("submission.csv",index=False)