In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found beneath it (prints nothing if the directory does not exist).
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [17]:
import torch
import sklearn
import random

# Device selection: use the GPU when CUDA is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reproducibility: seed every RNG the notebook has in scope from one constant
# instead of repeating the literal. NumPy's global RNG is seeded as well so
# that any NumPy-based randomness is repeatable across runs.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device == "cuda":
    torch.cuda.manual_seed_all(SEED)

# Load the three competition CSV files: training data, test data and the
# sample submission that is filled in at the end of the notebook.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/2021-ai-midterm-p5/train.csv",
        "/kaggle/input/2021-ai-midterm-p5/test.csv",
        "/kaggle/input/2021-ai-midterm-p5/submit_sample.csv",
    )
)

# Sanity check: print the first rows of each frame, in load order.
for frame in (train, test, submission):
    print(frame.head())

# Data preprocessing

# Columns that are not used as model inputs. They are removed from both the
# training and the test frame so the two keep the same feature layout.
unused_columns = [
    'Unnamed: 0',
    'index',
    'customerID',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'TotalCharges',
]
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)



# Missing-value handling (disabled; the imputer code below is kept for reference)
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# train = pd.DataFrame(imputer.fit_transform(train))
# test = pd.DataFrame(imputer.fit_transform(test))

# Convert the categorical columns to integer codes (label encoding).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# One encoder object is reused: for each column it is re-fitted on the
# training values and then applied to the test values, so the test frame is
# mapped with the label set learned from the training frame.
categorical_columns = (
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'InternetService',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
)
for column in categorical_columns:
    train[column] = le.fit_transform(train[column])
    test[column] = le.transform(test[column])

# Show the frames after encoding.
print("인코딩")
print(train)
print(test)

# DataFrame -> NumPy: features are every training column except the 'Churn'
# label; the test frame is used as-is.
y_train = np.array(train['Churn'])
x_train = np.array(train.drop(columns='Churn'))
x_test = np.array(test)

# Standardise the features. The scaler statistics come from the training set
# only and are then applied unchanged to the test set.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Encode the label column as integer class codes.
y_train = le.fit_transform(y_train)

# Wrap the arrays in float tensors and move them to the selected device.
x_train, x_test, y_train = (
    torch.Tensor(array).to(device) for array in (x_train, x_test, y_train)
)

# Check the resulting shapes.
print(x_train.shape, y_train.shape)


# Layers of a small fully connected network: inputs -> 22 -> 11 -> 1 output.
# The input width is taken from the feature matrix rather than hard-coded, so
# the model stays in sync if the preprocessing keeps a different column set.
n_features = x_train.shape[1]
layer1 = torch.nn.Linear(n_features, 22, bias=True).to(device)
layer2 = torch.nn.Linear(22, 11, bias=True).to(device)
layer3 = torch.nn.Linear(11, 1, bias=True).to(device)
relu = torch.nn.ReLU()
dropout = torch.nn.Dropout()  # constructed with the default drop probability

# Xavier (Glorot) normal weight initialisation. The init functions modify the
# weight tensor in place, so no device move is needed on their return value.
for layer in (layer1, layer2, layer3):
    torch.nn.init.xavier_normal_(layer.weight)

# Assemble the model. The same ReLU and Dropout modules are applied after each
# of the two hidden layers; the last layer outputs a raw (pre-sigmoid) score.
model = torch.nn.Sequential(
    layer1, relu, dropout,
    layer2, relu, dropout,
    layer3,
).to(device)

# Training hyper-parameters.
epochs = 2000
lr = 0.01

# Binary-classification loss. BCEWithLogitsLoss applies the sigmoid internally
# in a numerically stable way, so the model's raw outputs are passed straight
# in. Applying torch.sigmoid first and then BCELoss computes the same quantity
# but loses precision (and gradient) once the sigmoid saturates to 0 or 1.
loss = torch.nn.BCEWithLogitsLoss()

# Optimiser.
optim = torch.optim.Adam(model.parameters(), lr=lr)

# Full-batch training. Train mode keeps the dropout layers active.
model.train()
targets = y_train.unsqueeze(1)  # (N,) -> (N, 1) to match the model output shape
for epoch in range(epochs + 1):  # epochs + 1 so the final epoch is also logged
    optim.zero_grad()
    logits = model(x_train)
    cost = loss(logits, targets)
    cost.backward()
    optim.step()
    if epoch % 100 == 0:
        print(epoch, cost.item())

In [18]:
# Inference on the test set.
# Switch to evaluation mode first: training left the model in train mode, and
# with dropout still active the predictions would be computed on randomly
# zeroed activations (non-deterministic and degraded).
model.eval()
with torch.no_grad():
    predict = torch.sigmoid(model(x_test))
    predict = (predict >= 0.5).int()  # threshold probabilities into 0/1 labels

# Hand pandas a plain 1-D NumPy array: move the tensor to the CPU (required
# when it lives on the GPU) and flatten the (N, 1) output to (N,).
submission['Churn'] = predict.cpu().numpy().ravel()

submission.to_csv("submission.csv", index=False)