In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer

from utils.mongodb_utils import MongoDBUtils

In [None]:
# Connection settings handed to the project's MongoDB helper.
db_info = dict(
    host='corpus',
    port='27017',
    db_name='active_learning',
    collection='query-2019-09-11_124402',
)

mongo = MongoDBUtils()

# Presumably returns every document of the collection as a list of dicts
# (inferred from the helper's name and how doc_list is used later) - verify
# against utils.mongodb_utils.
doc_list = mongo.get_all_documents(
    db_info=db_info,
    collection=db_info['collection'],
)

In [None]:
# Tabular view of the raw documents for a first look.
df = pd.DataFrame(data=doc_list)

# Row count, printed with thousands separators.
print(format(len(df), ','))
df.head()

In [None]:
# Keep only the documents whose 'review' field is a real string; anything else
# (e.g. missing values) cannot be cleaned or vectorised below.
cleans = list(filter(lambda doc: isinstance(doc['review'], str), doc_list))

In [None]:
def clean_text(text):
    """Return ``text`` with every '.', ',', ';', '~' and '+' character removed.

    All other characters, including whitespace, are left untouched.
    """
    # Same characters, same order as a chain of individual replace() calls.
    for unwanted in ('.', ',', ';', '~', '+'):
        text = text.replace(unwanted, '')

    return text

# Cleaned review text, positionally aligned with `cleans`.
texts = list(map(clean_text, (record['review'] for record in cleans)))

texts[:5]

In [None]:
# Character-level n-gram counts (2 to 5 characters). Per the CountVectorizer
# options used here, n-grams seen in fewer than 3 reviews are dropped and the
# vocabulary is capped at 500 features.
cv = CountVectorizer(
    analyzer='char',
    ngram_range=(2, 5),
    min_df=3,
    max_features=500,
)

# Sparse review-by-n-gram count matrix used by the clustering cells below.
X = cv.fit_transform(texts)

In [None]:
def display_wordcloud(data=None, backgroundcolor='black', width=800, height=600,
                      font_path='/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf',
                      max_words=20000):
    """Render a word cloud for ``data`` in a 15x10 inch matplotlib figure.

    Args:
        data: Text to visualise, as a single string.
        backgroundcolor: Background colour passed to ``WordCloud``.
        width: Width of the generated cloud image.
        height: Height of the generated cloud image.
        font_path: Font file used to draw the words. The default is the
            NanumBarunGothic font at its usual Linux install location;
            override it on machines where the font lives elsewhere.
        max_words: Upper bound on the number of words drawn.

    Raises:
        ValueError: If ``data`` is None, empty or whitespace only.
    """
    # Fail fast with a clear message instead of an error raised from deep
    # inside WordCloud.generate().
    if not data or not data.strip():
        raise ValueError('display_wordcloud() needs a non-empty text to draw')

    # Imported lazily: the wordcloud package is only required by this helper.
    from wordcloud import WordCloud, STOPWORDS

    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color=backgroundcolor,
        width=width,
        height=height,
        font_path=font_path,
        max_words=max_words,
    ).generate(data)

    fig, ax = plt.subplots(figsize=(15, 10))
    ax.imshow(cloud)
    ax.axis("off")
    plt.show()

In [None]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and only fall back to the old
# accessor on installs that predate it.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

display_wordcloud(' '.join(feature_names))

In [None]:
# Partition the reviews into 100 clusters with a single initialisation
# (n_init=1). The seed is pinned because, with only one initialisation, an
# unseeded run assigns different cluster ids on every execution, which makes
# the tables further down impossible to compare between runs.
kmeans = KMeans(n_clusters=100, n_init=1, random_state=42)

kmeans.fit(X)

In [None]:
def _plot_count_distribution(counts, ax, color=None):
    """Draw a 100-bin density histogram with KDE of ``counts`` on ``ax`` and mark the median."""
    import seaborn as sns

    # distplot is deprecated since seaborn 0.11 in favour of histplot; keep
    # the old call only as a fallback for installs that predate histplot.
    if hasattr(sns, 'histplot'):
        sns.histplot(counts, bins=100, kde=True, stat='density', color=color, ax=ax)
    else:
        sns.distplot(counts, bins=100, color=color, ax=ax)

    ax.axvline(counts.median(), linestyle='dashed')


def display(data=None):
    """Print and plot per-review word-count statistics.

    Prints the mean and median of the word count and of the unique-word count
    per review, then draws both distributions side by side with a dashed line
    at each median. Words are whitespace-separated tokens of ``str(text)``.

    Args:
        data: DataFrame with a 'text' column. When omitted, the notebook-level
            ``df`` is used, so that frame must already carry a 'text' column.

    Raises:
        KeyError: If the frame has no 'text' column.

    NOTE(review): this name shadows IPython's built-in ``display`` for every
    cell that runs after this definition.
    """
    if data is None:
        data = df  # original behaviour: read the notebook-level frame

    if 'text' not in data.columns:
        raise KeyError(
            "display() needs a 'text' column; build the clustered result "
            "frame first or pass a suitable frame via `data`"
        )

    # Split every review once and derive both statistics from the tokens.
    tokens = data['text'].apply(lambda x: str(x).split())

    # Number of words per review.
    num_words = tokens.apply(len)

    # Number of words per review after removing duplicates.
    num_uniq_words = tokens.apply(lambda words: len(set(words)))

    fig, axes = plt.subplots(ncols=2)
    fig.set_size_inches(18, 6)

    print('리뷰 별 단어 평균값 :', num_words.mean())
    print('리뷰 별 단어 중간값', num_words.median())

    _plot_count_distribution(num_words, axes[0])
    axes[0].set_title('리뷰 별 단어 수 분포')

    print('리뷰 별 고유 단어 평균값 :', num_uniq_words.mean())
    print('리뷰 별 고유 단어 중간값', num_uniq_words.median())

    _plot_count_distribution(num_uniq_words, axes[1], color='g')
    axes[1].set_title('리뷰 별 고유한 단어 수 분포')

In [None]:
# index:       cluster id -> cleaned texts assigned to that cluster
# result_list: one flat record per review (cluster id, original review text
#              and the 'select' / 'label' fields of the source document)
index = {}
result_list = []

for pos, cleaned in enumerate(texts):
    cls_id = kmeans.labels_[pos]
    source = cleans[pos]

    result_list.append(dict(
        cls_id=cls_id,
        text=source['review'],
        select=source['select'],
        label=source['label'],
    ))

    index.setdefault(cls_id, []).append(cleaned)

In [None]:
# NOTE: from here on `df` holds the clustered records (cls_id / text / select /
# label) and no longer the raw documents loaded at the top of the notebook.
df = pd.DataFrame(data=result_list)

df.head()

In [None]:
# All records ordered by cluster id, so members of a cluster sit together.
df.sort_values(by='cls_id')

In [None]:
# Number of records for every (select, label) combination.
(
    df.groupby(['select', 'label'])
    .size()
    .to_frame()
    .reset_index()
)

In [None]:
# None is the documented "show every row" value for display.max_rows; -1 is
# not a documented setting and current pandas releases reject negative values.
pd.set_option('display.max_rows', None)

# Every record selected via 'vote', ordered by cluster and label.
(
    df.loc[df['select'] == 'vote', ['cls_id', 'label', 'text', 'select']]
    .sort_values(by=['cls_id', 'label'])
)

In [None]:
# https://data-newbie.tistory.com/25

In [None]:
# Factorise the n-gram count matrix into 20 non-negative components.
# random_state is pinned so repeated runs produce the same factorisation.
# NMF itself is imported in the notebook's import cell.
nmf = NMF(n_components=20, random_state=42)

# Transformed data returned by fit_transform (one row per review).
mu = nmf.fit_transform(X)