In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from torch.optim import Adam

# Colab only: mount Google Drive so the competition files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch to the dataset folder; the CSV files are read later through relative paths.
%cd /content/drive/MyDrive/nlp-getting-started


In [None]:
# Download the NLTK stop-word corpus (read later through stopwords.words('english')).
import nltk
nltk.download('stopwords')

In [None]:
# Download the WordNet corpus, which the WordNetLemmatizer used during text cleaning relies on.
import nltk
nltk.download('wordnet')

In [None]:
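# Uncomment the next line if TensorFlow is not installed (tf.keras is used for the Keras tokenizer below).
#!pip install tensorflow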

In [None]:
# Load the competition data (relative paths: the working directory must be the dataset folder).
SHUFFLE_SEED = 42  # fixed seed so the shuffle, and the tail hold-out taken from it later, is reproducible

pre_data_tr = pd.read_csv('train.csv')
# Shuffle the labelled rows once, deterministically.
train_data = pre_data_tr.sample(frac=1, random_state=SHUFFLE_SEED)

pre_data_te = pd.read_csv('test.csv')
# The test rows are only used for inference, so shuffling them gains nothing.
# Keep the original row order; copy so later column edits leave the raw frame untouched.
test_data = pre_data_te.copy()

sub_data = pd.read_csv('sample_submission.csv')

# Initialise the BERT tokenizer.
# NOTE(review): `tokenizer` is reassigned to a Keras tokenizer further down in this cell before
# this instance is ever used, and the BERT tokenizer is loaded again after that — this load
# looks redundant.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf

# English stop-word set and WordNet lemmatizer shared by the text-cleaning function below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text-cleaning helper
def cleaned(text, stopword_set=None, lemmatize=None):
    """Normalise raw text into a lowercase, single-space-separated token string.

    Steps, in order: lowercase, strip URLs, drop digits, blank out non-ASCII
    characters, drop punctuation, remove stop words, lemmatize each token.

    Args:
        text: Raw text. ``None`` / float NaN yield ``""``; any other value is
            passed through ``str()`` first.
        stopword_set: Container of words to drop. Defaults to the module-level
            ``stop_words``.
        lemmatize: Callable mapping one token to its lemma. Defaults to
            ``lemmatizer.lemmatize`` of the module-level ``lemmatizer``.

    Returns:
        The cleaned text, tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize

    # Missing values (None, or NaN which is the only float unequal to itself) become empty text.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()

    # Strip URLs first, while "://", "." and "/" are still present to delimit them.
    # The dot after "www" is escaped so it only matches a literal dot.
    text = re.sub(r'http\S+|www\.\S+', ' ', text)
    # Drop digits.
    text = re.sub(r"\d", "", text)
    # Replace non-ASCII characters with a space.
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    # Drop punctuation (anything that is neither a word character nor whitespace).
    text = re.sub(r'[^\w\s]', '', text)

    # split() treats newlines and tabs as separators, so words on adjacent lines
    # stay separate tokens instead of being glued together.
    tokens = [lemmatize(word) for word in text.split() if word not in stopword_set]
    return ' '.join(tokens)

# Clean the training text and replace missing keywords with empty strings
train_data['text'] = train_data['text'].apply(cleaned)
train_data['keyword'] = train_data['keyword'].fillna("")

# Carve the last rows off as a hold-out set; everything before them stays in train_data
n_holdout = 1500
val_data = train_data.tail(n_holdout)
train_data = train_data.iloc[:len(train_data) - n_holdout]

# Keras tokenizer construction
def define_tokenizer(train_sentences, val_sentences, test_sentences):
    """Fit a Keras ``Tokenizer`` on the three sentence collections combined.

    Args:
        train_sentences: Training sentences (pandas object accepted by ``pd.concat``).
        val_sentences: Validation sentences, same kind of object.
        test_sentences: Test sentences, same kind of object.

    Returns:
        The ``tf.keras.preprocessing.text.Tokenizer`` after ``fit_on_texts``.
    """
    all_sentences = pd.concat((train_sentences, val_sentences, test_sentences))
    keras_tokenizer = tf.keras.preprocessing.text.Tokenizer()
    keras_tokenizer.fit_on_texts(all_sentences)
    return keras_tokenizer

def encode(sentences, tokenizer):
    """Turn sentences into integer sequences and pad them at the end.

    Args:
        sentences: Texts to encode.
        tokenizer: Object providing ``texts_to_sequences``.

    Returns:
        The result of Keras ``pad_sequences`` with ``padding='post'``.
    """
    sequences = tokenizer.texts_to_sequences(sentences)
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

# Fit the Keras tokenizer on all three splits, then encode each split with it
tokenizer = define_tokenizer(train_data['text'], val_data['text'], test_data['text'])
encoded_train_sentences, encoded_val_sentences, encoded_test_sentences = (
    encode(frame['text'], tokenizer) for frame in (train_data, val_data, test_data)
)

# Show the tokenizer's preprocessing settings
tokenizer_config = tokenizer.get_config()
for label, key in (('Lower: ', 'lower'), ('Split: ', 'split'), ('Filters: ', 'filters')):
    print(label, tokenizer_config[key])


import torch
from sklearn.model_selection import train_test_split

# Label tensors for each split
train_labels = torch.tensor(train_data['target'].values)
val_labels = torch.tensor(val_data['target'].values)
# The test set has no labels; an all-zero placeholder keeps the tensor shapes aligned.
test_labels = torch.zeros(len(test_data), dtype=torch.long)

# Split the Keras-encoded training sentences into train / validation parts.
# train_test_split returns (X_train, X_val, y_train, y_val): the last two are LABELS,
# so they get label names instead of being stored as attention masks.
split_train_ids, split_val_ids, split_train_labels, split_val_labels = train_test_split(
    encoded_train_sentences, train_data['target'].values, test_size=0.3, random_state=42
)

train_inputs = torch.tensor(split_train_ids)
val_inputs = torch.tensor(split_val_ids)
split_train_labels = torch.tensor(split_train_labels)
split_val_labels = torch.tensor(split_val_labels)

# Real padding masks: pad_sequences pads with 0 by default and the Keras tokenizer
# never assigns index 0 to a word, so non-zero positions are actual tokens.
train_masks = (train_inputs != 0).long()
val_masks = (val_inputs != 0).long()

# Switch `tokenizer` back to the BERT tokenizer (the name was reused for the Keras tokenizer above).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): train_encodings / val_encodings are computed again in the next cell.
train_encodings = tokenizer(list(train_data['keyword'] + ' ' + train_data['text']), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_data['keyword'] + ' ' + val_data['text']), truncation=True, padding=True, max_length=128)

# 1. Fill missing values and force string dtype
test_data['keyword'] = test_data['keyword'].fillna("").astype(str)
test_data['text'] = test_data['text'].fillna("").astype(str)

# 2. Apply the same cleaning that the training text received, so the model sees
#    identically preprocessed text at training and inference time.
test_data['text'] = test_data['text'].apply(cleaned)

# 3. Join keyword and text, and convert to a plain list
test_texts = (test_data['keyword'] + ' ' + test_data['text']).tolist()

# 4. Tokenize with the BERT tokenizer
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)



In [None]:
from sklearn.model_selection import train_test_split

# Build the BERT training / validation sets.
# val_data is the hold-out carved off train_data earlier in the notebook, so the two
# frames are already disjoint. Using it for validation keeps every remaining labelled
# row available for training instead of splitting off a further 30% and never touching
# the hold-out.
train_texts = (train_data['keyword'] + ' ' + train_data['text']).tolist()
val_texts = (val_data['keyword'] + ' ' + val_data['text']).tolist()
train_labels = train_data['target'].values
val_labels = val_data['target'].values

# Tokenize
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# Convert to tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

val_inputs = torch.tensor(val_encodings['input_ids'])
val_masks = torch.tensor(val_encodings['attention_mask'])
val_labels = torch.tensor(val_labels)

# Each split's tensors must describe the same number of rows.
assert train_inputs.size(0) == train_masks.size(0) == train_labels.size(0)
assert val_inputs.size(0) == val_masks.size(0) == val_labels.size(0)


In [None]:
# Report the shape of each training tensor
for tensor_name, tensor in (
    ("train_inputs", train_inputs),
    ("train_masks", train_masks),
    ("train_labels", train_labels),
):
    print(f"{tensor_name} size: ", tensor.size())

In [None]:
# Wrap the tensors in datasets and serve them in mini-batches
BATCH_SIZE = 16

train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

# Only the training batches are drawn in random order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Seed torch so the freshly initialised classification head and the shuffled
# training batches are reproducible across runs.
torch.manual_seed(42)

# BERT sequence classifier with a 2-logit output head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and optimizer.
# The training loop reads `outputs.loss`, which the model computes itself when `labels`
# is passed. `criterion` is kept for manual use and matches the 2-logit head:
# CrossEntropyLoss takes raw logits and integer class labels, whereas BCELoss expects
# probabilities with the same shape as the targets.
criterion = torch.nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=2e-5)

# Training loop
def _train_one_epoch(net, loader, opt, dev):
    """Run one optimisation pass over ``loader`` and return the mean batch loss."""
    net.train()
    running_loss = 0
    for input_ids, attention_mask, labels in loader:
        input_ids = input_ids.to(dev)
        attention_mask = attention_mask.to(dev)
        labels = labels.to(dev)

        opt.zero_grad()
        # Passing labels makes the model return its own loss.
        loss = net(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        running_loss += loss.item()

        loss.backward()
        opt.step()
    return running_loss / len(loader)


def _validation_accuracy(net, loader, dev):
    """Return the fraction of examples in ``loader`` whose argmax prediction equals the label."""
    net.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(dev)
            attention_mask = attention_mask.to(dev)
            labels = labels.to(dev)

            logits = net(input_ids=input_ids, attention_mask=attention_mask).logits
            predicted = logits.argmax(dim=1)  # pick the class with the highest logit
            n_correct += (predicted == labels).sum().item()
            n_seen += labels.size(0)
    return n_correct / n_seen


num_epochs = 10
for epoch in range(num_epochs):
    avg_train_loss = _train_one_epoch(model, train_loader, optimizer, device)

    # Validation step
    val_accuracy = _validation_accuracy(model, val_loader, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_train_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Inference on the test set.
# The test set has no ground-truth labels, so no accuracy is reported here: comparing
# predictions with placeholder zeros would only measure how often class 0 is predicted.
test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])

test_dataset = TensorDataset(test_inputs, test_masks)
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()
test_pred = []
with torch.no_grad():
    for input_ids, attention_mask in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        # Pick the class with the highest logit and collect it as plain Python ints.
        test_pred.extend(logits.argmax(dim=1).cpu().tolist())

# Summarise the predicted class balance as a quick sanity check.
n_positive = sum(test_pred)
print(f"Predicted {len(test_pred)} test rows: {n_positive} as class 1, {len(test_pred) - n_positive} as class 0.")


# Build and write the submission file.
# Fail early with a clear message if predictions and test rows do not line up.
if len(test_pred) != len(test_data):
    raise ValueError(
        f"Got {len(test_pred)} predictions for {len(test_data)} test rows."
    )

# A fresh frame is built here rather than reassigning `sub_data`, so the
# sample-submission frame loaded earlier is left as it was.
submission = pd.DataFrame({
    'id': test_data['id'].values.astype('int64'),
    'target': [int(pred) for pred in test_pred],  # force plain integers
})
# Both columns are integers and ',' is the default separator, so no extra format options are needed.
submission.to_csv("submission.csv", index=False)
print("Submission file created successfully.")