<a href="https://colab.research.google.com/github/dhdbsrlw/2023F-ML-Topic-Modelling/blob/main/Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pipeline Overview

1. Data loader (Preprocessed)
2. Document embedding
- Sentence Transformer
3. Dimension Reduction
- UMAP
4. Clustering
- HDBSCAN
5. Topic extraction
- LDA, C-TF-IDF

In [None]:
!python --version


Python 3.10.12


# Import Library

In [None]:
!pip install pyLDAvis
!pip install numpy==1.23.5
!pip install pandas==1.5.3
!pip install hdbscan
!pip install python-box
!pip install joblib
!pip install -U sentence-transformers
!pip install -e .
!pip install umap-learn
!pip install datashader bokeh holoviews scikit-image and colorcet

Collecting numpy>=1.24.2 (from pyLDAvis)
  Using cached numpy-1.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Collecting pandas>=2.0.0 (from pyLDAvis)
  Using cached pandas-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Installing collected packages: numpy, pandas
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Successfully uninstalled numpy-1.23.5
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Uninstalling pandas-1.5.3:
      Successfully uninstalled pandas-1.5.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires kaleido, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvic

Collecting pandas==1.5.3
  Using cached pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.1
    Uninstalling pandas-2.1.1:
      Successfully uninstalled pandas-2.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires kaleido, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvicorn, which is not installed.
pyldavis 3.4.1 requires numpy>=1.24.2, but you have numpy 1.23.5 which is incompatible.
pyldavis 3.4.1 requires pandas>=2.0.0, but you have pandas 1.5.3 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-1.5.3
Collecting hdbscan
  Downloading hdbscan-0.8.33.tar.gz (5.

In [None]:
import numpy as np
import pandas as pd

# Sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV

# Sentence Transformer
from sentence_transformers.readers import InputExample
from sentence_transformers import SentenceTransformer, models
from tqdm import tqdm

# UMAP
import umap.umap_ as umap
import umap.plot

# Clustering
import hdbscan
from scipy.spatial import distance # To calculate distances
import scipy.cluster.hierarchy as sch
from joblib import Memory

# Visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# Topic modeling
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

from box import Box


# Config

In [None]:
# Experiment-wide settings, wrapped in a Box so nested values can be read as
# attributes (e.g. config.dim_reduction.n_components).
config = Box(
    dict(
        random_state=42,
        document_embedding=dict(
            # Lowered from 10000 after the dataset was shrunk. Depending on the
            # environment this can still run out of memory; reduce it if so.
            max_features=6000,
        ),
        dim_reduction=dict(n_components=50),
        clustering=dict(n_clusters=3),
        lda=dict(n_components=10, num_keywords=30),
        ctfidf=dict(num_topN=5, min_topic_size=10),
    )
)
config


  and should_run_async(code)


Box({'random_state': 42, 'document_embedding': {'max_features': 6000}, 'dim_reduction': {'n_components': 50}, 'clustering': {'n_clusters': 3}, 'lda': {'n_components': 10, 'num_keywords': 30}, 'ctfidf': {'num_topN': 5, 'min_topic_size': 10}})

# Load data

이 공유 드라이브에 대해 "정리->바로가기 추가"를 통해

내 드라이브 root에 공유 드라이브 바로가기를 만드셔야해요

제출할 때는 압축해서 낼 거니까 이 부분은 없애고 ./ 로 바꿔서 낼게요



In [None]:
# Colab only: mount Google Drive so the shared project folder is reachable
# under /content/drive.
# mount Google drive
from google.colab import drive
drive.mount('/content/drive')

# now you can see files
# NOTE(review): the echoed message names "/content/drive/My Drive/", while the
# listing below shows the team project folder — confirm which one is meant.
!echo -e "\nNumber of Google drive files in /content/drive/My Drive/:"
!ls -a "/content/drive/MyDrive/23-2_기계학습_팀플"
# by the way, you can run any linux command by putting a ! at the start of the line

# by default everything gets executed and saved in /content/
!echo -e "\nCurrent directory:"
!pwd


  and should_run_async(code)


### Set your workspace path

In [None]:
# Resolve where the dataset is read from and where the joblib cache lives.
workspace_path = '/content/drive/MyDrive/23-2_기계학습_팀플'  # Change this path!
# workspace_path = './';
filename = 'beauty.csv'
# filename = 'whole.csv'
print(f'Current Workspace: {workspace_path}')

try:
  import google.colab
except ImportError:
  # Not running on Colab: fall back to paths relative to the notebook.
  # cachedir must be defined here as well, because the clustering section
  # builds a joblib Memory from it.
  data_path = f'./dataset/{filename}'
  cachedir = './cache'
else:
  # Running on Colab: everything lives under the mounted Drive workspace.
  data_path = f'{workspace_path}/dataset/{filename}'
  cachedir = f'{workspace_path}/cache'



Current Workspace: /content/drive/MyDrive/23-2_기계학습_팀플


  and should_run_async(code)


In [None]:
data_path

  and should_run_async(code)


'/content/drive/MyDrive/23-2_기계학습_팀플/dataset/beauty.csv'

In [None]:
df = pd.read_csv(data_path)
df.head()

  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,review,category,rawReview
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...


# 결측치 제거

In [None]:
df[df["review"].isnull()]


  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,review,category,rawReview


In [None]:
# Report how many reviews are missing, then keep only the rows that have one
# and renumber the index from 0.
missing_reviews = df["review"].isna()
print(missing_reviews.sum())
df = df.loc[~missing_reviews].reset_index(drop=True)
df.head()


0


  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,review,category,rawReview
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...


In [None]:
# Two-way lookup between category names and integer ids, numbered in order of
# first appearance in the data.
_categories = df['category'].unique()
id2category = dict(enumerate(_categories))
category2id = {name: idx for idx, name in id2category.items()}


  and should_run_async(code)


In [None]:
# Encode each category name as its integer id, then show how many rows fall
# under each id.
df["category_id"] = df["category"].map(category2id)
df["category_id"].value_counts()


  and should_run_async(code)


0    30000
Name: category_id, dtype: int64

# Document embedding

## SentenceTransformer

In [None]:
# Download the pretrained sentence-embedding model.
# (The variable was renamed from `model` to `sentence_model`. - Yunjin)
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# encode() is documented to take a string or a list of strings, so pass a plain
# list rather than a pandas Series; that keeps the call independent of the
# frame's index labels.
sbert = sentence_model.encode(df["review"].tolist())

# Store one embedding vector per row.
df["sbert"] = list(sbert)
df.head()

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...,0,"[-0.0021139544, 0.031628005, 0.053724263, 0.05..."
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...,0,"[-0.084313385, -0.02356731, 0.07364636, 0.0112..."
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...,0,"[-0.046834804, -0.041300334, 0.03862773, 0.041..."
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...,0,"[-0.009914572, 0.012472473, 0.012321725, -0.00..."
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...,0,"[-0.12224284, -0.067056805, 0.024126083, -0.01..."


In [None]:
df["sbert"].iloc[0].shape

  and should_run_async(code)


(384,)

# Dimension Reduction

#### UMAP

In [None]:
config.dim_reduction.n_components #확인

  and should_run_async(code)


50

In [None]:
# Reduce the SBERT embeddings with densMAP-enabled UMAP. The seed and the target
# dimensionality both come from the shared config.
mapper = umap.UMAP(
    n_components=config.dim_reduction.n_components,
    random_state=config.random_state,
    densmap=True,
)
umap_emb = mapper.fit_transform(sbert)


  and should_run_async(code)
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:
umap_emb.shape

  and should_run_async(code)


(30000, 50)

In [None]:
df["dim_reduced"] = list(umap_emb)

  and should_run_async(code)


In [None]:
df["dim_reduced"].iloc[0].shape

  and should_run_async(code)


(50,)

# Clustering

### HDBSCAN

In [None]:
# Caching for faster experiments.
# NOTE(review): `memory` is not used by any cell visible here — confirm it is
# still needed.
memory = Memory(cachedir, verbose=0)
import warnings
# Suppresses every warning category from this point on.
warnings.filterwarnings('ignore')

  and should_run_async(code)


No tuning

In [None]:
# Turn the per-row reduced vectors back into a single matrix and cluster it
# with HDBSCAN (fixed, untuned parameters).
dim_reduced = np.asarray(list(df["dim_reduced"]))
clustering_model = hdbscan.HDBSCAN(
    min_cluster_size=40,
    min_samples=40,
    gen_min_span_tree=True,
)
clustered = clustering_model.fit_predict(dim_reduced)

In [None]:
df["clustered"] = clustered

In [None]:
# -1 is the noise label, not a cluster, so it is left out of the cluster count.
# The value_counts() table below still lists it so the amount of noise is visible.
cluster_labels = df["clustered"]
print("num clusters:", cluster_labels[cluster_labels != -1].nunique())
cluster_labels.value_counts()

num clusters: 4


 2    16485
 0     7617
-1     3697
 1     2201
Name: clustered, dtype: int64

In [None]:
clustering_model.relative_validity_

0.23545746339357992

In [None]:
df[df["clustered"] == -1].iloc[-1]["review"]

'ive want buy product hold back cu product mar kay cost double price find product half price grab right away great product 60 percent cant go wrong grab today theyre go'

In [None]:
# `corpus` was not defined anywhere, so it is defined here. - Yunjin
corpus = df["review"]
cluster_result = pd.DataFrame({"review": corpus, "clustered": pd.Series(clustered)})

# Iterate over the labels that actually occur (noise is -1) instead of over
# 0..n-1, which would skip the noise group and report a non-existent cluster.
for label, size in cluster_result["clustered"].value_counts().sort_index().items():
  name = "noise (-1)" if label == -1 else f"{label}th cluster"
  print(f'{name} size: {size}')

0th cluster size: 7617
1th cluster size: 2201
2th cluster size: 16485
3th cluster size: 0


# Topic Modeling (Using Gensim)

In [None]:
# Load previous results
# NOTE(review): this rebinds `df` to a CSV under experiments/ctfidf, replacing
# the frame built by the cells above. No cell visible here writes that file —
# confirm it exists before a fresh top-to-bottom run.
df = pd.read_csv(f'{workspace_path}/experiments/ctfidf/results_hdbscan_ctfidf_4.csv')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered
19407,51391,obagi expensive total program search amazon be...,beauty,Obagi can be very expensive when you are doing...,0,"[-0.028409513, 0.015774677, -0.008788007, -0.0...","[1.3318702, 7.96046, 4.08623, 9.220188, 3.5224...",-1
13254,81116,everything package work great color beautiful...,beauty,"everything in this package works great, all of...",0,"[-0.06007748, 0.019629898, 0.06926075, -0.1171...","[1.3808727, 6.7270055, 2.1803992, 10.334277, 3...",-1
13260,19765,use product gel year discover discontinue it b...,beauty,Have been using this product with the gel for ...,0,"[-0.060834613, 0.01850149, 0.034088254, -0.026...","[0.93919486, 8.481972, 4.290873, 9.717662, 2.6...",-1
13266,10169,use brush twice since receive detachable head ...,beauty,I've used this brush twice since I received it...,0,"[0.013687457, -0.025585113, 0.11701513, 0.0391...","[4.096278, 7.5901814, 4.9866414, 9.562265, 1.7...",-1
13271,99010,work great,beauty,Works Great!,0,"[-0.13016334, 0.053195927, 0.041615326, -0.030...","[3.0059762, 6.9702454, 4.14191, 10.027312, 3.7...",-1


In [None]:
# Whitespace-tokenise every preprocessed review into a list of words.
df["review_tokenized"] = [text.split() for text in df["review"]]
df.head()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized
19407,51391,obagi expensive total program search amazon be...,beauty,Obagi can be very expensive when you are doing...,0,"[-0.028409513, 0.015774677, -0.008788007, -0.0...","[1.3318702, 7.96046, 4.08623, 9.220188, 3.5224...",-1,"[obagi, expensive, total, program, search, ama..."
13254,81116,everything package work great color beautiful...,beauty,"everything in this package works great, all of...",0,"[-0.06007748, 0.019629898, 0.06926075, -0.1171...","[1.3808727, 6.7270055, 2.1803992, 10.334277, 3...",-1,"[everything, package, work, great, color, beau..."
13260,19765,use product gel year discover discontinue it b...,beauty,Have been using this product with the gel for ...,0,"[-0.060834613, 0.01850149, 0.034088254, -0.026...","[0.93919486, 8.481972, 4.290873, 9.717662, 2.6...",-1,"[use, product, gel, year, discover, discontinu..."
13266,10169,use brush twice since receive detachable head ...,beauty,I've used this brush twice since I received it...,0,"[0.013687457, -0.025585113, 0.11701513, 0.0391...","[4.096278, 7.5901814, 4.9866414, 9.562265, 1.7...",-1,"[use, brush, twice, since, receive, detachable..."
13271,99010,work great,beauty,Works Great!,0,"[-0.13016334, 0.053195927, 0.041615326, -0.030...","[3.0059762, 6.9702454, 4.14191, 10.027312, 3.7...",-1,"[work, great]"


## Topic Modeling by cluster

In [None]:
df["clustered"].value_counts()

 2    16485
 0     7617
-1     3697
 1     2201
Name: clustered, dtype: int64

In [None]:
# Order the rows by cluster label (ascending, so noise rows labelled -1 come
# first) and keep an independent copy.
df = df.sort_values("clustered", ascending=True).copy()
df

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized
19407,51391,obagi expensive total program search amazon be...,beauty,Obagi can be very expensive when you are doing...,0,"[-0.028409513, 0.015774677, -0.008788007, -0.0...","[1.3318702, 7.96046, 4.08623, 9.220188, 3.5224...",-1,"[obagi, expensive, total, program, search, ama..."
18895,14482,work greatand lot amounti love italso good pri...,beauty,Works greatAnd a lot of amountI love itAlso go...,0,"[-0.10554301, 0.0921953, 0.012891948, -0.03648...","[2.2972887, 6.8016005, 4.09537, 9.55347, 3.773...",-1,"[work, greatand, lot, amounti, love, italso, g..."
18899,82512,buy store price want tale trip buy online love...,beauty,I bought this before at the store for the same...,0,"[-0.02053465, 0.046217334, 0.02266982, 0.01285...","[2.8735895, 7.0995092, 3.3644834, 9.75102, 4.7...",-1,"[buy, store, price, want, tale, trip, buy, onl..."
4724,99370,awesome know marilyn monroe wear half lash co...,beauty,These are awesome. Did you know Marilyn Monro...,0,"[-0.061099887, -0.04258116, 0.024757992, -0.01...","[0.8221312, 7.1890125, 7.6376185, 9.655814, 0....",-1,"[awesome, know, marilyn, monroe, wear, half, l..."
18904,63310,great happy item pay it regrets would recomme...,beauty,"all great, I was happy with the item and what ...",0,"[-0.08752327, 0.12100833, 0.008093214, 0.02285...","[1.8787737, 6.6844463, 3.2785296, 9.282688, 5....",-1,"[great, happy, item, pay, it, regrets, would, ..."
...,...,...,...,...,...,...,...,...,...
16842,33368,perfume smell sooo good love lust one lust ...,beauty,This perfume smells sooo good!! of the love an...,0,"[-0.0830399, -0.04199629, 0.08293059, -0.00297...","[2.534813, 9.762105, 2.4143028, 9.722871, 5.57...",2,"[perfume, smell, sooo, good, love, lust, one, ..."
5649,99185,product high otc sulfur % take care follow di...,beauty,"This product has the highest OTC sulfur %, so ...",0,"[-0.0074007064, -0.0031446416, 0.015269704, 0....","[0.65279835, 10.071983, 4.8397555, 9.704564, 1...",2,"[product, high, otc, sulfur, %, take, care, fo..."
5646,21186,try firm skin basically good moisturizer nothi...,beauty,Tried this for firming my skin. It's basicall...,0,"[-0.0288812, 0.005959749, 0.09622886, 0.053392...","[0.41481248, 9.881819, 4.135339, 9.69625, 2.74...",2,"[try, firm, skin, basically, good, moisturizer..."
16759,30098,everytime use still amaze 15 yr find good tan ...,beauty,Everytime I use this I am still amazed that af...,0,"[0.0048249653, 0.025290476, 0.112990946, 0.029...","[0.91846144, 9.2537155, 3.414291, 9.787343, 2....",2,"[everytime, use, still, amaze, 15, yr, find, g..."


In [None]:
def topic_modeling_by_cluster(df, num_topics=None, passes=10):
    """Fit a gensim LDA model on one group of reviews and score it with c_v coherence.

    Args:
        df: DataFrame with a "review_tokenized" column holding one list of
            tokens per row.
        num_topics: Number of LDA topics. Defaults to config.lda.n_components.
        passes: Number of training passes over the corpus.

    Returns:
        A tuple (lda_model, corpus, dictionary, coherence_score), where
        `corpus` is the bag-of-words form of the reviews and `dictionary` is
        the gensim Dictionary built from them.
    """
    if num_topics is None:
        num_topics = config.lda.n_components

    tokenized_docs = df["review_tokenized"].tolist()
    dictionary = corpora.Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                passes=passes,
                                                random_state=config.random_state)

    # c_v coherence is computed from the tokenised texts, not the bag-of-words corpus.
    coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=tokenized_docs, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    return lda_model, corpus, dictionary, coherence_score

In [None]:
# Run LDA separately on every cluster and collect its keywords and coherence.
# Entries are appended in ascending label order, so position i holds cluster i's
# keywords only when the non-noise labels are exactly 0..k-1.
keywords_by_cluster = []
coherence_scores = []

# Loop-invariant setup, done once instead of on every iteration.
pyLDAvis.enable_notebook()
num_keywords = config.lda.num_keywords

for cluster_num in sorted(df["clustered"].unique()):
    # -1 is the noise label; it is not modelled as a cluster.
    if cluster_num == -1:
        continue
    cluster_df = df[df["clustered"] == cluster_num]
    print(f"Cluster {cluster_num}({len(cluster_df)} documents):")
    lda_model, corpus, dictionary, coherence_score = topic_modeling_by_cluster(cluster_df)

    for topic in lda_model.print_topics(num_words=10):
        print(topic)
    print()

    vis = gensimvis.prepare(lda_model, corpus, dictionary)

    # Keep the first `num_keywords` terms of pyLDAvis' topic_info table.
    # NOTE(review): presumably these are the overall most salient terms for the
    # cluster — confirm against the topic_info row ordering.
    keywords = vis.topic_info["Term"].tolist()[:num_keywords]
    print("keywords: ", keywords)
    print("c_v: ", coherence_score)
    keywords_by_cluster.append(keywords)
    coherence_scores.append(coherence_score)

Cluster 0(7617 documents):
(0, '0.182*"34" + 0.010*"tail" + 0.010*"nioxin" + 0.010*"pony" + 0.007*"scissors" + 0.007*"thickening" + 0.006*"vo5" + 0.005*"ur" + 0.005*"cure" + 0.005*"everytime"')
(1, '0.043*"product" + 0.026*"buy" + 0.020*"love" + 0.018*"hair" + 0.017*"price" + 0.016*"oil" + 0.014*"good" + 0.014*"one" + 0.013*"money" + 0.013*"purchase"')
(2, '0.069*"color" + 0.049*"hair" + 0.018*"brown" + 0.017*"dye" + 0.015*"red" + 0.015*"blonde" + 0.014*"dark" + 0.013*"use" + 0.012*"get" + 0.011*"light"')
(3, '0.028*"hair" + 0.013*"head" + 0.013*"pin" + 0.012*"wear" + 0.011*"wig" + 0.011*"look" + 0.010*"cap" + 0.010*"hold" + 0.009*"put" + 0.008*"use"')
(4, '0.101*"brush" + 0.042*"comb" + 0.036*"hair" + 0.017*"bristle" + 0.017*"tangle" + 0.013*"scalp" + 0.011*"great" + 0.009*"size" + 0.008*"use" + 0.008*"work"')
(5, '0.030*"towel" + 0.009*"deva" + 0.009*"dominican" + 0.007*"t" + 0.007*"drawer" + 0.007*"fluid" + 0.007*"eczema" + 0.007*"bambu" + 0.006*"workout" + 0.006*"dermatologist"')
(

In [None]:
print(len(keywords_by_cluster))
np.mean(coherence_scores)

3


0.4141600634310589

In [None]:
# One line per cluster: its first ten keywords, each followed by a space.
for kwc in keywords_by_cluster:
    print("".join(f"{kw} " for kw in kwc[:10]))

product hair color brush oil comb 34 buy love dryer 
color coat nail ~ dry top take use cuticle nails 
smell skin wash soap eye color scent body love brush 


In [None]:
len(keywords_by_cluster[1] * 10)

300

In [None]:
def _cluster_keywords_in_review(row):
    """Return the keywords of the row's cluster that occur in the row's review.

    Noise rows (cluster label -1) get an empty list; they are handled up front
    so that -1 is never used as a (negative) list index.
    """
    cluster_num = row["clustered"]
    if cluster_num == -1:
        return []
    tokens = set(row["review_tokenized"])  # set membership instead of a list scan per keyword
    return [word for word in keywords_by_cluster[cluster_num] if word in tokens]


df["keywords"] = df.apply(_cluster_keywords_in_review, axis=1)


In [None]:
df.sort_index()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized,keywords
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...,0,"[-0.0021139544, 0.031628005, 0.053724263, 0.05...","[0.44638398, 10.0415745, 4.4307847, 9.742665, ...",2,"[disregard, claim, hear, commercial, reduce, s...","[smell, skin, soap, look, use]"
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...,0,"[-0.084313385, -0.02356731, 0.07364636, 0.0112...","[8.751592, 6.6071258, 4.8410044, 9.5007305, 2....",0,"[price, pretty, good, find, nozzle, attachment...","[hair, brush, dryer, price, good, use, get]"
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...,0,"[-0.046834804, -0.041300334, 0.03862773, 0.041...","[1.8686926, 9.726857, 3.534628, 9.680706, 3.77...",2,"[look, like, harden, vasoline, come, tube, sme...","[smell, look, like, perfume]"
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...,0,"[-0.009914572, 0.012472473, 0.012321725, -0.00...","[1.5704919, 6.175264, 6.5739202, 9.528315, 1.2...",2,"[absolutely, love, thing, small, face, short, ...","[eye, love, look, face, like, use]"
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...,0,"[-0.12224284, -0.067056805, 0.024126083, -0.01...","[2.9159412, 6.8043437, 5.210582, 9.573075, 1.4...",2,"[love, set, brush, two, favorite, brush, set, ...","[love, brush, great]"
...,...,...,...,...,...,...,...,...,...,...
29995,75758,try clean clear neutrogena biore product s...,beauty,"After trying Clean & Clear, Neutrogena, and Bi...",0,"[-0.022597803, -0.013498354, 0.07781014, 0.009...","[0.78196883, 10.365755, 4.7498198, 9.650059, 2...",2,"[try, clean, clear, neutrogena, biore, product...","[skin, hand, oil, clean]"
29996,89376,may look nice 8217 fooled soft fact stiff i...,beauty,They may look nice but don&#8217;t be fooled. ...,0,"[-0.09848054, 0.055580847, 0.12894471, -0.0191...","[4.0435786, 7.1127825, 5.1558385, 9.176137, 1....",-1,"[may, look, nice, 8217, fooled, soft, fact, st...",[]
29997,47962,strong sticky send back maybe good short male ...,beauty,SO STRONG AND STICKY. I had to send it back. ...,0,"[-0.08914643, -0.0042910976, 0.08933469, 0.012...","[8.173485, 6.5082984, 4.3910947, 9.491093, 3.6...",0,"[strong, sticky, send, back, maybe, good, shor...","[hair, good]"
29998,21339,ive want buy product hold back cu product mar ...,beauty,ive been wanting to buy this product but I had...,0,"[-0.08350157, -0.012376309, 0.016681567, 0.000...","[2.0384207, 5.8280644, 3.2671359, 9.562258, 4....",-1,"[ive, want, buy, product, hold, back, cu, prod...",[]


In [None]:
# Write the per-review LDA results in original row order.
# NOTE(review): the destination is a fixed Drive path rather than one derived
# from workspace_path — confirm this is intended.
save_path = "/content/drive/MyDrive/results_hdbscan_lda.csv"
output_df = df.sort_index()
output_df.to_csv(save_path, columns=["review", "rawReview", "clustered", "keywords"])

# Topic Modeling (Using C-TF-IDF;  Class-based TF-IDF)

C-TF-IDF를 통해서 각 묶어진 그룹(Topic 또는 Class)에 대해 해당 Topic을 잘 표현하는 단어를 찾는다.

In [None]:
%%capture
!pip install bertopic

In [None]:
# Add the tokenised-review column (one list of whitespace-separated words per row).
df["review_tokenized"] = [text.split() for text in df["review"]]
df.head()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized,keywords
19407,51391,obagi expensive total program search amazon be...,beauty,Obagi can be very expensive when you are doing...,0,"[-0.028409513, 0.015774677, -0.008788007, -0.0...","[1.3318702, 7.96046, 4.08623, 9.220188, 3.5224...",-1,"[obagi, expensive, total, program, search, ama...",[]
18895,14482,work greatand lot amounti love italso good pri...,beauty,Works greatAnd a lot of amountI love itAlso go...,0,"[-0.10554301, 0.0921953, 0.012891948, -0.03648...","[2.2972887, 6.8016005, 4.09537, 9.55347, 3.773...",-1,"[work, greatand, lot, amounti, love, italso, g...",[]
18899,82512,buy store price want tale trip buy online love...,beauty,I bought this before at the store for the same...,0,"[-0.02053465, 0.046217334, 0.02266982, 0.01285...","[2.8735895, 7.0995092, 3.3644834, 9.75102, 4.7...",-1,"[buy, store, price, want, tale, trip, buy, onl...",[]
4724,99370,awesome know marilyn monroe wear half lash co...,beauty,These are awesome. Did you know Marilyn Monro...,0,"[-0.061099887, -0.04258116, 0.024757992, -0.01...","[0.8221312, 7.1890125, 7.6376185, 9.655814, 0....",-1,"[awesome, know, marilyn, monroe, wear, half, l...",[]
18904,63310,great happy item pay it regrets would recomme...,beauty,"all great, I was happy with the item and what ...",0,"[-0.08752327, 0.12100833, 0.008093214, 0.02285...","[1.8787737, 6.6844463, 3.2785296, 9.282688, 5....",-1,"[great, happy, item, pay, it, regrets, would, ...",[]


In [None]:
docs = df["review"].tolist()
# Used for the actual keyword extraction (the preprocessed text rather than the raw text, for better performance).
# More than plain keyword frequency is involved, so `review` (which keeps the sentence form) is used instead of `review_tokenized`.

docs[:3]

['obagi expensive total program search amazon best price product ',
 'work greatand lot amounti love italso good pricei wan na buy next time toogreat',
 'buy store price want tale trip buy online love item']

In [None]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from bertopic import BERTopic

### Hyperparameter Tuning (10/20 Last Update)

**버토픽의 하이퍼파라미터 목록**

`top_n_words`: 각 토픽 별로 추출하고자 하는 단어의 개수 (확률분포값 기준, 상위 N개의 단어), 10과 20사이가 적절하고 값이 30을 넘기지 않는 것이 좋음. \
`n_gram_range`: 토픽 분포(representation)을 생성하는 CountVectorizer 에 반영되는 기준 단어 단위, 토픽별 추출되는 키워드를 구성하는 단어의 개수 \
`min_topic_size`: **(중요)** 하나의 토픽(클러스터)에 속해야 하는 최소 문서의 개수, 이 값이 낮을수록 더 많은 토픽이 추출됨. (=제한조건이 낮으므로) / default 10  \
`nr_topics`: 토픽의 개수를 줄여서 결과적으로 남기고 싶은 토픽의 개수, 만약 "auto"로 설정한다면, 토픽의 개수를 HDBSCAN을 이용하여 자동적으로 특정 개수로 줄임. 너무 낮은 값으로 설정하면, 합쳐지지 말아야할 토픽들이 서로 합쳐져서 성능에 악영향을 미칠 수 있으니 주의. \


*`calculate_probabilities` 및 `low_memory` 하이퍼파라미터들은 일단 고려하지 않음.

각 하이퍼파라미터 세부설명 참고) \
 https://colab.research.google.com/drive/1ClTYut039t-LDtlcd-oQAdXWgcsSGTw9?usp=sharing#scrollTo=xLrIUdCGsgkf

### Extract Keywords

In [None]:
# Fit BERTopic on the reviews, read the c-TF-IDF keywords of every topic, and
# score the topics with c_v coherence.

# top_n_words / min_topic_size come from the shared config instead of literals.
# NOTE(review): an explicit hdbscan_model is supplied, so min_topic_size may not
# influence the clustering — confirm against the BERTopic documentation.
ctfidf_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=mapper,
    hdbscan_model=clustering_model,
    top_n_words=config.ctfidf.num_topN,
    min_topic_size=config.ctfidf.min_topic_size,
)
# A single fit_transform() both fits the model and returns the topic of every
# document, so no separate fit() call is needed.
topics, _ = ctfidf_model.fit_transform(docs)

# Preprocess Documents: concatenate all documents of a topic into one string.
documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = ctfidf_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and tokenizer from BERTopic
vectorizer = ctfidf_model.vectorizer_model
tokenizer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [tokenizer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Enumerate the topic ids that actually exist and skip the outlier topic (-1),
# rather than assuming an outlier topic is always present.
topic_ids = sorted(topic for topic in set(topics) if topic != -1)
topic_words = [[word for word, _score in ctfidf_model.get_topic(topic)]
               for topic in topic_ids]
print(f"topic_words: {topic_words}")

if not topic_words:
    raise ValueError("BERTopic produced no non-outlier topics; coherence cannot be computed.")

# Evaluate
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')

coherence_score = coherence_model.get_coherence()

# Print
print("\nTopic modeling is done.\n")


topic_words: [['skin', 'use', 'product', 'like', 'face'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use']]

Topic modeling is done.



In [None]:
# Print the top words of each topic and gather them into one keyword list.
# These topic ids refer to topics extracted over the whole document set, not to
# the per-cluster ids.
max_topics = 10  # report at most the first 10 topics
keywords = []

for topic_id, words in enumerate(topic_words[:max_topics]):
  # Join however many words the topic has instead of indexing a fixed five.
  joined = " + ".join(f"'{word}'" for word in words)
  print(f"({topic_id}, {joined})")
  keywords += words

# Remove duplicates while keeping first-seen order, so the list is deterministic.
keywords = list(dict.fromkeys(keywords))
print()
print("keywords: ", keywords) # keywords is a list
print("c_v: ", coherence_score)
print("\n")


(0, 'skin' + 'use' + 'product' + 'like' + 'face')
(1, 'hair' + 'use' + 'product' + 'shampoo' + 'like')
(2, 'nail' + 'polish' + 'coat' + 'color' + 'use')

keywords:  ['use', 'product', 'face', 'hair', 'nail', 'color', 'coat', 'polish', 'shampoo', 'like', 'skin']
c_v:  0.5601889274303593




In [None]:
# Extract keywords per cluster (ref: https://towardsdatascience.com/topics-per-class-using-bertopic-252314f2640)
# ref: https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html

# `docs` was built positionally from df, so the class list must be positional
# too. Indexing df["clustered"][i] would look rows up by index label, which
# pairs documents with the wrong cluster once df has been re-ordered.
classes = df["clustered"].tolist()

topics_per_class = ctfidf_model.topics_per_class(docs, classes=classes)

# NOTE(review): this holds one entry per row of topics_per_class, i.e. per
# (topic, class) pair, not one entry per cluster.
keywords_by_cluster = topics_per_class["Words"].tolist()


In [None]:
ctfidf_model.visualize_topics_per_class(topics_per_class, top_n_topics=5)

In [None]:
# Parse keywords_by_cluster: every entry is a ", "-separated string of words.
result = [item.split(', ') for item in keywords_by_cluster]

print(result)

# Replace keywords_by_cluster with the parsed lists.
keywords_by_cluster = result

[['product', 'use', 'get', 'one', 'work'], ['skin', 'use', 'product', 'like', 'face'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use'], ['product', 'use', 'get', 'like', 'one'], ['skin', 'use', 'product', 'like', 'face'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use'], ['product', 'use', 'get', 'like', 'one'], ['skin', 'use', 'product', 'like', 'face'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use'], ['product', 'use', 'get', 'work', 'good'], ['skin', 'use', 'product', 'like', 'get'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use']]


In [None]:
# Extract the keywords of every review (row) and store them in the 'keywords' column.

# topics_per_class has one row per (Topic, Class) pair, so it cannot be indexed
# by cluster label directly. For each class (cluster label, noise included) use
# the words of its most frequent topic, preferring real topics over the outlier
# topic (-1) whenever the class has any.
ranked = topics_per_class.sort_values("Frequency", ascending=False, kind="stable")
is_outlier_topic = ranked["Topic"] == -1
preferred = pd.concat([ranked[~is_outlier_topic], ranked[is_outlier_topic]])
dominant = preferred.drop_duplicates(subset="Class", keep="first")
keywords_for_class = {
    cls: words.split(", ")
    for cls, words in zip(dominant["Class"], dominant["Words"])
}


def _keywords_in_review(cluster_num, tokens):
    """Return the keywords of `cluster_num` that occur among `tokens` (empty if the cluster is unknown)."""
    present = set(tokens)
    return [kw for kw in keywords_for_class.get(cluster_num, []) if kw in present]


# Build the column in one positional pass instead of row-by-row df.at writes.
df["keywords"] = [
    _keywords_in_review(cluster_num, tokens)
    for cluster_num, tokens in zip(df["clustered"], df["review_tokenized"])
]

print(df.head())  # To verify the changes


       Unnamed: 0                                             review category  \
19407       51391  obagi expensive total program search amazon be...   beauty   
18895       14482  work greatand lot amounti love italso good pri...   beauty   
18899       82512  buy store price want tale trip buy online love...   beauty   
4724        99370  awesome know marilyn monroe wear half lash  co...   beauty   
18904       63310  great  happy item pay it regrets would recomme...   beauty   

                                               rawReview  category_id  \
19407  Obagi can be very expensive when you are doing...            0   
18895  Works greatAnd a lot of amountI love itAlso go...            0   
18899  I bought this before at the store for the same...            0   
4724   These are awesome.  Did you know Marilyn Monro...            0   
18904  all great, I was happy with the item and what ...            0   

                                                   sbert  \
19407  [-0.028

In [None]:
# Intermediate check: show the first rows, including the new 'keywords' column
df.head()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized,keywords
19407,51391,obagi expensive total program search amazon be...,beauty,Obagi can be very expensive when you are doing...,0,"[-0.028409513, 0.015774677, -0.008788007, -0.0...","[1.3318702, 7.96046, 4.08623, 9.220188, 3.5224...",-1,"[obagi, expensive, total, program, search, ama...",[]
18895,14482,work greatand lot amounti love italso good pri...,beauty,Works greatAnd a lot of amountI love itAlso go...,0,"[-0.10554301, 0.0921953, 0.012891948, -0.03648...","[2.2972887, 6.8016005, 4.09537, 9.55347, 3.773...",-1,"[work, greatand, lot, amounti, love, italso, g...",[]
18899,82512,buy store price want tale trip buy online love...,beauty,I bought this before at the store for the same...,0,"[-0.02053465, 0.046217334, 0.02266982, 0.01285...","[2.8735895, 7.0995092, 3.3644834, 9.75102, 4.7...",-1,"[buy, store, price, want, tale, trip, buy, onl...",[]
4724,99370,awesome know marilyn monroe wear half lash co...,beauty,These are awesome. Did you know Marilyn Monro...,0,"[-0.061099887, -0.04258116, 0.024757992, -0.01...","[0.8221312, 7.1890125, 7.6376185, 9.655814, 0....",-1,"[awesome, know, marilyn, monroe, wear, half, l...",[]
18904,63310,great happy item pay it regrets would recomme...,beauty,"all great, I was happy with the item and what ...",0,"[-0.08752327, 0.12100833, 0.008093214, 0.02285...","[1.8787737, 6.6844463, 3.2785296, 9.282688, 5....",-1,"[great, happy, item, pay, it, regrets, would, ...",[]


In [None]:
# For every review keep only those keywords of its cluster that occur in the
# review's tokens. Rows labelled -1 (no cluster) get an empty list.
#
# The -1 check runs once per row and BEFORE keywords_by_cluster is indexed, so a
# noise row never silently selects the last entry through Python's negative
# indexing, and the row-level condition is no longer re-evaluated for every word.
df["keywords"] = df.apply(
    lambda x: []
    if x["clustered"] == -1
    else [word for word in keywords_by_cluster[x["clustered"]] if word in x["review_tokenized"]],
    axis=1,
)

In [None]:
# Display the frame ordered by its index. The result is not assigned, so df
# itself keeps its current row order.
df.sort_index()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized,keywords
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...,0,"[-0.0021139544, 0.031628005, 0.053724263, 0.05...","[0.44638398, 10.0415745, 4.4307847, 9.742665, ...",2,"[disregard, claim, hear, commercial, reduce, s...","[use, product]"
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...,0,"[-0.084313385, -0.02356731, 0.07364636, 0.0112...","[8.751592, 6.6071258, 4.8410044, 9.5007305, 2....",0,"[price, pretty, good, find, nozzle, attachment...","[use, get]"
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...,0,"[-0.046834804, -0.041300334, 0.03862773, 0.041...","[1.8686926, 9.726857, 3.534628, 9.680706, 3.77...",2,"[look, like, harden, vasoline, come, tube, sme...",[like]
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...,0,"[-0.009914572, 0.012472473, 0.012321725, -0.00...","[1.5704919, 6.175264, 6.5739202, 9.528315, 1.2...",2,"[absolutely, love, thing, small, face, short, ...","[use, like]"
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...,0,"[-0.12224284, -0.067056805, 0.024126083, -0.01...","[2.9159412, 6.8043437, 5.210582, 9.573075, 1.4...",2,"[love, set, brush, two, favorite, brush, set, ...",[]
...,...,...,...,...,...,...,...,...,...,...
29995,75758,try clean clear neutrogena biore product s...,beauty,"After trying Clean & Clear, Neutrogena, and Bi...",0,"[-0.022597803, -0.013498354, 0.07781014, 0.009...","[0.78196883, 10.365755, 4.7498198, 9.650059, 2...",2,"[try, clean, clear, neutrogena, biore, product...",[product]
29996,89376,may look nice 8217 fooled soft fact stiff i...,beauty,They may look nice but don&#8217;t be fooled. ...,0,"[-0.09848054, 0.055580847, 0.12894471, -0.0191...","[4.0435786, 7.1127825, 5.1558385, 9.176137, 1....",-1,"[may, look, nice, 8217, fooled, soft, fact, st...",[]
29997,47962,strong sticky send back maybe good short male ...,beauty,SO STRONG AND STICKY. I had to send it back. ...,0,"[-0.08914643, -0.0042910976, 0.08933469, 0.012...","[8.173485, 6.5082984, 4.3910947, 9.491093, 3.6...",0,"[strong, sticky, send, back, maybe, good, shor...",[]
29998,21339,ive want buy product hold back cu product mar ...,beauty,ive been wanting to buy this product but I had...,0,"[-0.08350157, -0.012376309, 0.016681567, 0.000...","[2.0384207, 5.8280644, 3.2671359, 9.562258, 4....",-1,"[ive, want, buy, product, hold, back, cu, prod...",[]


In [None]:
# Save the DataFrame now that keywords have been extracted for every review.
# Rows are written in index order and only the review text, the raw review,
# the cluster label and the extracted keywords are exported.

save_path = "/content/results_hdbscan_ctfidf_5.csv"
output_df = df.sort_index()
output_df.loc[:, ["review", "rawReview", "clustered", "keywords"]].to_csv(save_path)

In [None]:
# End of notebook