<a href="https://colab.research.google.com/github/gmrwh92/Machine-Learning-Deep-Learning/blob/main/Distilbert_with_5000_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
from nltk.tokenize import word_tokenize
import re
from nltk.tag import pos_tag

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from sklearn.metrics import accuracy_score
from transformers import EarlyStoppingCallback

  warn(f"Failed to load image Python extension: {e}")


In [None]:
# Fetch the NLTK resources that the preprocessing cells rely on.
# WordNet data, used by WordNetLemmatizer.
nltk.download('wordnet')
# English stop-word list, read via stopwords.words('english').
nltk.download('stopwords')
# Tokenizer models for word_tokenize.
nltk.download('punkt')
# NOTE(review): presumably the tokenizer data newer NLTK releases look for — confirm.
nltk.download('punkt_tab')
# Tagger model for pos_tag.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gmrwh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gmrwh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gmrwh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gmrwh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\gmrwh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
## Load the dataset.
# Read only the first 5,000 rows instead of parsing and labelling the whole CSV
# and slicing it afterwards. The result is an independent frame (not a slice of
# a larger one), so adding columns to it later is safe.
df = pd.read_csv('IMDB Dataset.csv', nrows=5000)
# Encode the label: 'positive' -> 1, anything else -> 0 (vectorized).
df['sentiment'] = (df['sentiment'] == 'positive').astype('int64')
## Check the balance of the labels.
value_counts = df['sentiment'].value_counts()
print(value_counts)

0    2532
1    2468
Name: sentiment, dtype: int64


In [None]:
# Display the loaded frame (review text plus the 0/1 sentiment label).
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
4995,An interesting slasher film with multiple susp...,0
4996,i watched this series when it first came out i...,1
4997,Once again Jet Li brings his charismatic prese...,1
4998,"I rented this movie, after hearing Chris Gore ...",0


In [None]:
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The tag is matched by its leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Default when no prefix matches.
    return wordnet.NOUN

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Translation table for punctuation removal. Apostrophes are deleted (so
# "there's" still becomes "theres"); every other punctuation mark becomes a
# space so that the words on either side of it stay separate tokens.
_PUNCT_TABLE = str.maketrans({c: (None if c == "'" else ' ') for c in string.punctuation})


def data_preprocessing(text):
    """Normalise one review into a space-joined string of lemmas.

    Steps: lowercase, strip HTML tags, strip punctuation, tokenize, POS-tag,
    drop English stop words, and lemmatize each remaining word using the
    WordNet class derived from its POS tag.

    Args:
        text: Raw review text. Any non-string value is treated as empty.

    Returns:
        The cleaned review as a single string; '' for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Replace tags with a space rather than '': otherwise "...70's.<br /><br />I ..."
    # collapses into one token once punctuation is removed.
    text = re.sub('<.*?>', ' ', text)
    text = text.translate(_PUNCT_TABLE)

    tagged = pos_tag(word_tokenize(text))
    # NOTE(review): the stop-word list presumably contains negations such as
    # "not"; dropping them may hurt a sentiment model — confirm this is intended.
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged
        if word not in stop_words
    ]
    return ' '.join(lemmas)


df['cleaned_reviews'] = df['review'].apply(data_preprocessing)
df.head()

Unnamed: 0,review,sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,1,one reviewer mention watch 1 oz episode youll ...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,0,basically theres family little boy jake think ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# Generic pretrained DistilBERT base checkpoint. It is not tuned for sentiment
# analysis: when loaded for sequence classification the library warns that the
# classifier weights are newly initialized and that the model should be trained
# on a downstream task before its predictions are used.
model_name = "distilbert-base-uncased"  # pretrained base checkpoint (not sentiment-specific)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def predict_sentiment(text):
    """Classify one text with the current global `model` and `tokenizer`.

    The text is tokenized (truncated to at most 512 tokens), passed through the
    model without gradient tracking, and the index of the largest logit is
    returned as a plain Python int.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    # Index of the highest-scoring class.
    return logits.argmax(dim=-1).item()

# Score every cleaned review and store the prediction next to it.
df["pretrained"] = df["cleaned_reviews"].apply(predict_sentiment)

In [None]:
# Display the frame, now including the "pretrained" prediction column.
df

Unnamed: 0,review,sentiment,cleaned_reviews,pretrained
0,One of the other reviewers has mentioned that ...,1,one reviewer mention watch 1 oz episode youll ...,1
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...,1
2,I thought this was a wonderful way to spend ti...,1,think wonderful way spend time hot summer week...,1
3,Basically there's a family where a little boy ...,0,basically theres family little boy jake think ...,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...,1
...,...,...,...,...
4995,An interesting slasher film with multiple susp...,0,interesting slasher film multiple suspectsincl...,1
4996,i watched this series when it first came out i...,1,watch series first come 70si 14 year old watch...,1
4997,Once again Jet Li brings his charismatic prese...,1,jet li bring charismatic presence movie screen...,0
4998,"I rented this movie, after hearing Chris Gore ...",0,rent movie hear chris gore say something effec...,1


# 파인튜닝

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [None]:
from datasets import Dataset

In [None]:
# Build both columns as plain Python lists so they are handled the same way
# when the dataset is constructed (previously `labels` was left as a Series).
texts = df['cleaned_reviews'].tolist()  # cleaned review text
labels = df['sentiment'].tolist()  # sentiment labels (0 = negative, 1 = positive)

In [None]:
# Wrap the parallel text/label columns in a Dataset with "text" and "label" fields.
dataset = Dataset.from_dict(dict(text=texts, label=labels))

In [None]:
# Split ONCE and take both parts from the same result. Calling
# train_test_split() twice draws two independent random partitions, so rows in
# the evaluation set would also be present in the training set. The fixed seed
# makes the partition reproducible across runs.
split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split['train']
eval_dataset = split['test']

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples for training/evaluation.

    Args:
        examples: Mapping with a 'text' entry holding the texts to encode
            (a list of strings when called through `map(..., batched=True)`).

    Returns:
        The tokenizer's encoding of the batch, padded/truncated to 512 tokens.
    """
    # Pad to a fixed length instead of "longest item in this map batch".
    # With padding=True, each map batch may end up with a different length, and
    # the Trainer below is built without a padding collator, so mixed lengths
    # inside one training batch could not be stacked into a tensor.
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

In [None]:
# Tokenize both splits in batches (training split first, then evaluation split).
train_dataset, eval_dataset = (
    part.map(preprocess_function, batched=True)
    for part in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
import os
# Switch off Weights & Biases logging for the training run.
# NOTE(review): transformers reports this environment variable as deprecated
# (to be removed in v5) and recommends the `report_to` setting instead.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# 8. Training configuration
training_args = TrainingArguments(
    output_dir="./results",           # directory for results/checkpoints
    logging_dir="./logs",             # directory for log files
    logging_steps=10,                 # log every 10 steps
    evaluation_strategy="epoch",      # evaluate after every epoch
    save_strategy="epoch",            # save a checkpoint after every epoch
    load_best_model_at_end=True,      # reload the best checkpoint when training ends
    num_train_epochs=3,               # number of epochs
    per_device_train_batch_size=16,   # training batch size
    per_device_eval_batch_size=16,    # evaluation batch size
    learning_rate=2e-5,               # learning rate
    weight_decay=0.01,                # weight decay
)

# Stop early after 2 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# 9. Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping],
)

# 10. Train the model
trainer.train()

# 11. Save the model
model.save_pretrained("./saved_model")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.3707,0.25758
2,0.2476,0.158421
3,0.0827,0.144402


In [None]:
# 11. Save the model
# Write the model and the tokenizer into the same directory so that both can be
# reloaded from one path.
trained_model_dir = "./my_trained_model"
model.save_pretrained(trained_model_dir)  # model
tokenizer.save_pretrained(trained_model_dir)  # tokenizer


('./my_trained_model\\tokenizer_config.json',
 './my_trained_model\\special_tokens_map.json',
 './my_trained_model\\vocab.txt',
 './my_trained_model\\added_tokens.json')

In [None]:
# Reload the fine-tuned model and its tokenizer from the directory saved after training.
model_path = "./my_trained_model"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)


In [None]:
# `predict_sentiment` is already defined in an earlier cell with the same logic.
# It reads the module-level `tokenizer` and `model` each time it is called, so
# after the reload from "./my_trained_model" it scores with the fine-tuned
# weights. Redefining it here would only duplicate code.

# Score every cleaned review with the fine-tuned model.
df["fine_tuning"] = df["cleaned_reviews"].apply(predict_sentiment)
df

Unnamed: 0,review,sentiment,cleaned_reviews,pretrained,fine_tuning
0,One of the other reviewers has mentioned that ...,1,one reviewer mention watch 1 oz episode youll ...,1,1
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...,1,1
2,I thought this was a wonderful way to spend ti...,1,think wonderful way spend time hot summer week...,1,1
3,Basically there's a family where a little boy ...,0,basically theres family little boy jake think ...,1,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...,1,1
...,...,...,...,...,...
4995,An interesting slasher film with multiple susp...,0,interesting slasher film multiple suspectsincl...,1,0
4996,i watched this series when it first came out i...,1,watch series first come 70si 14 year old watch...,1,1
4997,Once again Jet Li brings his charismatic prese...,1,jet li bring charismatic presence movie screen...,0,1
4998,"I rented this movie, after hearing Chris Gore ...",0,rent movie hear chris gore say something effec...,1,0


In [None]:
# Accuracy of the base checkpoint before any fine-tuning.
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy = accuracy_score(df['sentiment'], df['pretrained'])
print(f"기본 distilbert-base-uncased 모델 정확도: {accuracy * 100:.2f}%")
# Accuracy after fine-tuning.
# NOTE(review): this is measured over every row of `df`, and the training split
# was drawn from these same rows, so the figure overstates performance on
# unseen reviews. Evaluate on rows excluded from training for a fair estimate.
accuracy1 = accuracy_score(df['sentiment'], df['fine_tuning'])
print(f"튜닝후 distilbert-base-uncased 모델 정확도: {accuracy1 * 100:.2f}%")

기본 distilbert-base-uncased 모델 정확도: 48.16%
튜닝후 distilbert-base-uncased 모델 정확도: 95.46%


In [None]:
# Give the label and prediction columns presentation-friendly names.
column_labels = {
    "sentiment": "original",
    "pretrained": "Distilbert",
    "fine_tuning": "Distilbert with tuning",
}
df = df.rename(columns=column_labels)

In [None]:
# Comparison table without the raw review text. The column is dropped by name
# rather than by position, so a different column order cannot silently remove
# the wrong column.
abc = df.drop(columns=['review'])
abc

Unnamed: 0,original,cleaned_reviews,Distilbert,Distilbert with tuning
0,1,one reviewer mention watch 1 oz episode youll ...,1,1
1,1,wonderful little production filming technique ...,1,1
2,1,think wonderful way spend time hot summer week...,1,1
3,0,basically theres family little boy jake think ...,1,0
4,1,petter matteis love time money visually stunni...,1,1
...,...,...,...,...
4995,0,interesting slasher film multiple suspectsincl...,1,0
4996,1,watch series first come 70si 14 year old watch...,1,1
4997,1,jet li bring charismatic presence movie screen...,0,1
4998,0,rent movie hear chris gore say something effec...,1,0


In [None]:
# Put the text first, followed by the true label and the two prediction columns.
new_order = ["cleaned_reviews", "original", "Distilbert", "Distilbert with tuning"]
abc = abc.loc[:, new_order]
abc

Unnamed: 0,cleaned_reviews,original,Distilbert,Distilbert with tuning
0,one reviewer mention watch 1 oz episode youll ...,1,1,1
1,wonderful little production filming technique ...,1,1,1
2,think wonderful way spend time hot summer week...,1,1,1
3,basically theres family little boy jake think ...,0,1,0
4,petter matteis love time money visually stunni...,1,1,1
...,...,...,...,...
4995,interesting slasher film multiple suspectsincl...,0,1,0
4996,watch series first come 70si 14 year old watch...,1,1,1
4997,jet li bring charismatic presence movie screen...,1,0,1
4998,rent movie hear chris gore say something effec...,0,1,0
