In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import os
import pandas as pd
from transformers import AutoConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import StepLR
from google.colab import drive
import numpy as np

In [None]:
# Install the `transformers` package into the notebook runtime.
# NOTE(review): the import cell above already imports from `transformers`, so on a fresh
# runtime this install would have to run before that cell — confirm the intended order.
!pip install transformers



In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode with
    benchmarking turned off.

    Args:
        seed: Seed value shared by all generators.
    """
    # Apply the same seed to each library's generator, in a fixed order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Environment variable and cuDNN switches.
    os.environ['PYTHONHASHSEED'] = str(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Fix the seed once for the whole notebook run.
set_seed(42)

In [None]:
# Mount Google Drive at /content/drive/; later cells read the CSV and the model checkpoint from there.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Load the new review data and normalise its column names:
# 'label' becomes 'sentiment' and 'review' becomes 'txt'.
predicted_reviews_df = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/skin_toner.csv'
).rename(columns={'label': 'sentiment', 'review': 'txt'})

In [None]:
# Build a single-line copy of each review in 'review': every newline and carriage return in
# 'txt' becomes one space. Both characters must be handled through the `.str` accessor —
# a bare `Series.replace('\r', ' ')` only swaps cells whose whole value is '\r' and leaves
# carriage returns embedded in longer text untouched.
predicted_reviews_df['review'] = predicted_reviews_df['txt'].str.replace(r'[\r\n]', ' ', regex=True)

In [None]:
# Load the tokenizer that belongs to the "monologg/koelectra-base-v3-discriminator" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

Downloading (…)okenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

In [None]:
# Build a sequence-classification model from the same checkpoint the tokenizer was loaded from.
model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=2)  # sentiment has two labels (positive/negative), so num_labels=2

Downloading pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class ReviewDataset(Dataset):
    """Torch dataset that serves tokenized reviews together with their sentiment label.

    Rows whose ``'txt'`` value tokenizes to ``max_len`` tokens or more are dropped
    when the dataset is built; the remaining rows are encoded on access.
    """

    def __init__(self, data, tokenizer, max_len):
        """Store the tokenizer settings and keep only the rows that are short enough.

        Args:
            data: DataFrame with a ``'txt'`` column (read here) and a ``'sentiment'``
                column (read in ``__getitem__``).
            tokenizer: Object providing ``tokenize`` and ``encode_plus``.
            max_len: Token limit used both for filtering and as the padded length.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len

        def is_short_enough(text):
            # Keep a row only when its token count is strictly below max_len.
            return len(self.tokenizer.tokenize(text)) < self.max_len

        keep_mask = data['txt'].apply(is_short_enough)
        self.data = data[keep_mask]

    def __len__(self):
        """Return the number of rows that passed the length filter."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the review at position ``idx``.

        Returns:
            dict with ``'ids'`` (flattened input ids), ``'mask'`` (flattened
            attention mask) and ``'targets'`` (the label as a ``torch.long`` tensor).
        """
        record = self.data.iloc[idx]

        encoded = self.tokenizer.encode_plus(
            record['txt'],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The encoder output carries a leading batch axis; collapse it to 1-D.
        input_ids = encoded['input_ids'].reshape(-1)
        attention_mask = encoded['attention_mask'].reshape(-1)
        target = torch.tensor(record['sentiment'], dtype=torch.long)

        return {'ids': input_ids, 'mask': attention_mask, 'targets': target}

In [None]:
# Prepare the loader: wrap the reviews in a dataset and batch them without shuffling.
predicted_dataset = ReviewDataset(data=predicted_reviews_df, tokenizer=tokenizer, max_len=512)
predicted_loader = DataLoader(dataset=predicted_dataset, batch_size=16, shuffle=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (621 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Load the fine-tuned weights and predict a sentiment class for every review in the loader.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# map_location lets a checkpoint that was saved from a GPU session load on a CPU-only runtime.
state_dict = torch.load(
    '/content/drive/My Drive/Colab Notebooks/L25_drop02_10epoch_sche0_4_best_model.pth',
    map_location=device,
)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

preds = []
with torch.no_grad():  # inference only, so no gradients are tracked
    for batch in predicted_loader:
        # Inputs must live on the same device as the model.
        input_ids = batch['ids'].to(device)
        attention_mask = batch['mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Index of the largest logit per row is the predicted class.
        preds.extend(torch.argmax(outputs.logits, dim=-1).tolist())


In [None]:
# NOTE(review): this re-creates `model` from the base checkpoint, replacing the instance that had
# the fine-tuned state dict loaded in the prediction cell above. No later visible cell reads
# `model`, so this looks like a leftover duplicate cell — confirm before relying on `model`.
model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=2)  # sentiment has two labels (positive/negative), so num_labels=2

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pair each prediction with the review it was computed for.
# `predicted_dataset.data` already holds exactly the rows that passed the token-length filter,
# in the order the unshuffled loader iterated them. Reusing it avoids re-tokenizing every
# review and keeps this cell from drifting out of sync with the dataset's own filter.
max_len = predicted_dataset.max_len
filtered_data = predicted_dataset.data

# Fail with a clear message if predictions and rows do not line up one-to-one.
if len(preds) != len(filtered_data):
    raise ValueError(
        f"Expected one prediction per filtered review, got {len(preds)} predictions "
        f"for {len(filtered_data)} reviews."
    )

results_df = pd.DataFrame({'name': filtered_data['name'], 'review': filtered_data['txt'], 'sentiment': preds})

In [None]:
# Select the positive (sentiment == 1) and the negative (sentiment == 0) reviews.
positive_reviews = results_df.loc[results_df['sentiment'].eq(1)]
negative_reviews = results_df.loc[results_df['sentiment'].eq(0)]


In [None]:
# Group the positive and the negative reviews by product name.
positive_grouped, negative_grouped = (
    reviews.groupby('name') for reviews in (positive_reviews, negative_reviews)
)

In [None]:
# Print every product's group: all positive groups first, then all negative ones.
# Saving each group to its own CSV file is currently disabled.
for heading, grouped in (
    ("Positive reviews:", positive_grouped),
    ("Negative reviews:", negative_grouped),
):
    for name, group in grouped:
        print(f"Product name: {name}")
        print(heading)
        print(group)
        # group.to_csv(...) would write this group to its own file (disabled).


Product name: [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© ÌÜ†ÎÑà 350ML Î¶¨ÌïÑ Í∏∞ÌöçÏÑ∏Ìä∏(350ML+350MLÎ¶¨ÌïÑ)
Positive reviews:
                                                 name  \
4   [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© ÌÜ†ÎÑà 350ML Î¶¨ÌïÑ Í∏∞ÌöçÏÑ∏Ìä∏(3...   
9   [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© ÌÜ†ÎÑà 350ML Î¶¨ÌïÑ Í∏∞ÌöçÏÑ∏Ìä∏(3...   
17  [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© ÌÜ†ÎÑà 350ML Î¶¨ÌïÑ Í∏∞ÌöçÏÑ∏Ìä∏(3...   
20  [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© ÌÜ†ÎÑà 350ML Î¶¨ÌïÑ Í∏∞ÌöçÏÑ∏Ìä∏(3...   
40  [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© ÌÜ†ÎÑà 350ML Î¶¨ÌïÑ Í∏∞ÌöçÏÑ∏Ìä∏(3...   
50  [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© ÌÜ†ÎÑà 350ML Î¶¨ÌïÑ Í∏∞ÌöçÏÑ∏Ìä∏(3...   
51  [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© ÌÜ†ÎÑà 350ML Î¶¨ÌïÑ Í∏∞ÌöçÏÑ∏Ìä∏(3...   
52  [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© 

In [None]:
# Initialise the TF-IDF vectorizer: English stop-word list, at most 5000 features.
# NOTE(review): the reviews appear to be Korean (a KoELECTRA tokenizer is used above), so the
# English stop-word list presumably filters very little — confirm whether that is intended.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [None]:
def extract_top_keywords_with_tfidf(grouped_reviews, n=10, tfidf_vectorizer=None):
    """Return the highest-scoring TF-IDF terms for every group of reviews.

    All reviews of a group are joined into one document, the vectorizer is fitted
    on the resulting documents, and each group's terms are ranked by TF-IDF weight.

    Args:
        grouped_reviews: Iterable of ``(name, group)`` pairs (e.g. a pandas GroupBy)
            where ``group['review']`` holds the review strings.
        n: Maximum number of keywords per group; values below 1 give empty lists.
        tfidf_vectorizer: Optional object providing ``fit_transform`` and
            ``get_feature_names_out``. Defaults to the notebook-level ``vectorizer``.

    Returns:
        dict mapping each group name to its keywords, best first. A list is shorter
        than ``n`` when the group has fewer terms with a non-zero weight.
    """
    tfidf = vectorizer if tfidf_vectorizer is None else tfidf_vectorizer

    # Combine each group's reviews into one document; a single pass keeps names and
    # documents aligned.
    names, documents = [], []
    for name, group in grouped_reviews:
        names.append(name)
        documents.append(' '.join(group['review'].tolist()))

    # No groups means there is nothing to fit the vectorizer on.
    if not documents:
        return {}

    # Compute the TF-IDF values.
    tfidf_matrix = tfidf.fit_transform(documents)
    # Loop-invariant: fetch the vocabulary once instead of once per keyword.
    feature_names = tfidf.get_feature_names_out()
    # Slicing with [:limit] (rather than [-n:]) makes n == 0 yield nothing instead of everything.
    limit = max(n, 0)

    top_keywords = {}
    for i, name in enumerate(names):
        row = tfidf_matrix.getrow(i).toarray()[0]
        ranked = row.argsort()[::-1][:limit]
        # Skip zero-weight terms so a group with few distinct terms is not padded
        # with vocabulary that carries no weight for it.
        top_keywords[name] = [feature_names[idx] for idx in ranked if row[idx] > 0]

    return top_keywords


In [None]:
# Extract the most important keywords per product:
# up to five from the positive reviews and up to three from the negative reviews.
positive_keywords = extract_top_keywords_with_tfidf(grouped_reviews=positive_grouped, n=5)
negative_keywords = extract_top_keywords_with_tfidf(grouped_reviews=negative_grouped, n=3)

In [None]:
# Print the keyword lists per product: the positive section first, then the negative one.
keyword_sections = (
    ("Top Keywords for Positive Reviews:", positive_keywords),
    ("\nTop Keywords for Negative Reviews:", negative_keywords),
)
for header, keywords_by_product in keyword_sections:
    print(header)
    for name, keywords in keywords_by_product.items():
        print(f"Product {name}: {', '.join(keywords)}")

Top Keywords for Positive Reviews:
Product [8Ïõî Ïò¨ÏòÅÌîΩ/Î¶¨ÌïÑÍ∏∞Ìöç] ÏïÑÎàÑÏïÑ Ïñ¥ÏÑ±Ï¥à 77 ÏàòÎî© ÌÜ†ÎÑà 350ML Î¶¨ÌïÑ Í∏∞ÌöçÏÑ∏Ìä∏(350ML+350MLÎ¶¨ÌïÑ): Í∞ôÏïÑÏöî, Ïñ¥ÏÑ±Ï¥à, ÏïÑÎàÑÏïÑ, ÌÜ†ÎÑà, ÏûàÎäî
Product [8ÏõîÏò¨ÏòÅÌîΩ/ÎåÄÏö©ÎüâÍ∏∞Ìöç] ÌÜ†Î¶¨Îì† Îã§Ïù¥Î∏åÏù∏ Ï†ÄÎ∂ÑÏûê ÌûàÏïåÎ£®Î°†ÏÇ∞ ÌÜ†ÎÑà 500ml Í∏∞Ìöç (+ÌôîÏû•ÏÜú 60Îß§ Ï¶ùÏ†ï): Îã§Ïù¥Î∏åÏù∏, ÌÜ†Î¶¨Îì†, ÌÜ†ÎÑà, ÎèÖÎèÑÌÜ†ÎÑà, Ï¢ãÏïÑÏöî
Product [Îã®ÎèÖÍ∏∞Ìöç] ÎÑòÎ≤ÑÏ¶àÏù∏ 3Î≤à Í≤∞Í¥ëÍ∞ÄÎìù ÏóêÏÑºÏä§ ÌÜ†ÎÑà 200ml (ÌôîÏû•ÏÜú Ï¶ùÏ†ï): ÎÑòÎ≤ÑÏ¶àÏù∏, ÏóêÏÑºÏä§, 3Î≤à, Ï†ÄÎäî, ÎÑàÎ¨¥
Product [Ïø®ÎßÅÏßÑÏ†ï] ÎÑòÎ≤ÑÏ¶àÏù∏ 1Î≤à ÏßÑÏ†ï ÎßëÍ≤åÎã¥ÏùÄ Ï≤≠Ï¥àÌÜ†ÎÑà 300ml Í∏∞Ìöç (+1Î≤à Ìå®Îìú 10Îß§ Ï¶ùÏ†ï): Î¨¥ÏÉâ, ÌÜ†ÎÑàÏó¨ÏÑú, Ïù¥Î≤§Ìä∏Î°ú, ÏüÅÏûÖÎãàÎã§, Îã¶ÏïÑÏ§Ñ
Product [ÌïúÏ†ïÍ∏∞Ìöç/ÎåÄÏö©Îüâ] ÎùºÏö¥ÎìúÎû© 1025 ÎèÖÎèÑ ÌÜ†ÎÑà 500ml+200ml Í∏∞Ìöç(+ÏÜåÎÇòÎ¨¥ ÌÅ¥Î†åÏ†Ä 10ml Ï¶ùÏ†ï): ÎèÖÎèÑ, Ï¢ãÏïÑÏöî, ÌÜ†ÎÑà, Í∞ôÏïÑÏöî, ÎßéÏù¥
Product [ÌïúÏ†ïÍ∏∞Ìöç] ÎùºÏö¥ÎìúÎû© 1025 ÎèÖÎèÑ ÌÜ†ÎÑà ÎåÄÏö©Îüâ Î¶¨ÌïÑÍ∏∞Ìöç (300ml+300ml Î¶¨ÌïÑ+ÏïΩÏΩ©ÌåêÌÖåÎÜÄÌÅ¨Î¶º1