In [26]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [27]:
import torch
import sklearn
import random

# Device selection: run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Random seeds: the same fixed seed for PyTorch and Python's `random`,
# plus every CUDA device when running on the GPU.
SEED = 1
torch.manual_seed(SEED)
random.seed(SEED)
if device == "cuda":
    torch.cuda.manual_seed_all(SEED)

# Data loading: training set, test set and the sample submission file.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/2021-ai-midterm-p4/train.csv",
        "/kaggle/input/2021-ai-midterm-p4/test.csv",
        "/kaggle/input/2021-ai-midterm-p4/submit_sample.csv",
    )
)

# Quick look at the first rows of each frame.
for frame in (train, test, submission):
    print(frame.head())

# Data preprocessing.
#
# Drops the unused columns, imputes missing values, label-encodes the
# categorical features, standardizes the features and finally moves
# x_train / y_train / x_test onto `device` as torch tensors.
#
# All fitted statistics (imputer modes, label encodings, scaler mean/std) are
# learned from the training data only and then applied to the test data, so
# both sets go through exactly the same transformation.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Columns that are not used as model inputs.
unused_columns = ['index', 'enrollee_id', 'experience', 'company_size', 'last_new_job', 'city']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

# Split the label off before imputing. The label is the last remaining column
# of `train`; `test` has no label column. Separating it first lets a single
# imputer be fitted on the training features and reused for the test features.
train_target = train.iloc[:, [-1]]
train = train.iloc[:, :-1]
assert train.shape[1] == test.shape[1], (
    "train (without the label column) and test must have the same number of feature columns"
)

# Missing-value imputation with the most frequent value of each column.
# Fit on the training features only; the test set is only transformed.
# Arrays are passed positionally, so the resulting frames have integer column labels.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
train = pd.DataFrame(imputer.fit_transform(train.to_numpy()))
test = pd.DataFrame(imputer.transform(test.to_numpy()))

# Missing labels (if any) are filled with the most frequent label, as before.
target_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
y_train = target_imputer.fit_transform(train_target.to_numpy()).ravel()

# Label-encode the categorical feature columns (by position):
# 1 gender, 2 relevent_experience, 3 enrolled_university,
# 4 education_level, 5 major_discipline, 6 company_type.
# The encoder is re-fitted per column on train and then applied to test;
# `transform` raises if test contains a category never seen in train.
le = LabelEncoder()
categorical_columns = [1, 2, 3, 4, 5, 6]
for column in categorical_columns:
    train[column] = le.fit_transform(train[column])
    test[column] = le.transform(test[column])

# DataFrame -> array
x_train = np.array(train)
x_test = np.array(test)

# Feature scaling: standardize with statistics learned from the training set.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Encode the labels as consecutive integers starting at 0.
y_train = le.fit_transform(y_train)

# Move the data onto the selected device as tensors.
x_test = torch.Tensor(x_test).to(device)
x_train = torch.Tensor(x_train).to(device)
y_train = torch.Tensor(y_train).to(device)

# Check the tensor shapes.
print(x_train.shape, y_train.shape)


# Model definition and training.
#
# A small fully connected network (8 input features -> 60 -> 8 -> 1) with
# ReLU and dropout after each hidden layer. The final layer outputs a raw
# logit; apply torch.sigmoid to the model output to obtain a probability.

# Layers (created directly on the target device).
layer1 = torch.nn.Linear(8, 60, bias=True).to(device)
layer2 = torch.nn.Linear(60, 8, bias=True).to(device)
layer3 = torch.nn.Linear(8, 1, bias=True).to(device)
relu = torch.nn.ReLU()
dropout = torch.nn.Dropout()  # uses torch's default drop probability

# Weight initialization. xavier_normal_ modifies the weight in place, so its
# return value does not need to be moved or stored.
for layer in (layer1, layer2, layer3):
    torch.nn.init.xavier_normal_(layer.weight)

# Assemble the model.
model = torch.nn.Sequential(
    layer1, relu, dropout,
    layer2, relu, dropout,
    layer3,
).to(device)

# Training hyperparameters.
epochs = 2000
lr = 0.01

# Binary-classification loss. BCEWithLogitsLoss applies the sigmoid internally
# and is more numerically stable than a separate sigmoid followed by BCELoss,
# so it is fed the raw logits.
loss = torch.nn.BCEWithLogitsLoss()

# Optimizer.
optim = torch.optim.Adam(model.parameters(), lr=lr)

# Training: full-batch gradient descent over the whole training set.
model.train()
targets = y_train.unsqueeze(1)  # (N,) -> (N, 1) to match the model output; loop-invariant
for epoch in range(epochs + 1):  # +1 so that the final epoch (== epochs) is also logged
    optim.zero_grad()
    logits = model(x_train)
    cost = loss(logits, targets)
    cost.backward()
    optim.step()
    if epoch % 100 == 0:
        print(epoch, cost.item())

In [28]:
# Inference on the test set and submission file creation.
model.eval()  # disable dropout for prediction
with torch.no_grad():
    # The model outputs logits; sigmoid turns them into probabilities.
    probabilities = torch.sigmoid(model(x_test))
    # Threshold at 0.5 to get the predicted class.
    predict = probabilities >= 0.5
    print(predict.int())

# pandas needs a host-side 1-D array: move the tensor to the CPU (a CUDA tensor
# cannot be converted to NumPy) and flatten the (N, 1) model output to (N,).
submission['target'] = predict.float().cpu().numpy().ravel()
submission.to_csv("submission.csv", index=False)