<a href="https://colab.research.google.com/github/dr-song-summer-project/AI/blob/main/Keyword%20%EC%B6%94%EC%B6%9C/reviewType%EB%B3%84_%EC%A0%84%EC%B2%B4_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

리뷰 타입별 - 문장별 TFIDF (참고: 아래 코드는 TF-IDF가 아니라 TextRank/PageRank 점수로 핵심 문장과 키워드를 추출합니다)

In [None]:
!pip install konlpy
!pip install openpyxl



In [None]:
# Mount Google Drive at /content/drive; the data-loading cell further down
# reads its CSV from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


리뷰 타입 전체 군집 별 상위 키워드

In [None]:
from konlpy.tag import Komoran

komoran = Komoran()


def komoran_tokenize(sent):
    """
    Tokenize a sentence with Komoran and keep only selected morphemes.

    Arguments
    ---------
    sent : str
        Raw sentence
    Returns
    -------
    words : list of str
        Tokens as returned by ``komoran.pos(sent, join=True)`` (strings such
        as '연락/NNG'), restricted to those containing one of the markers
        '/NN', '/XR', '/VA' or '/VV' (presumably noun, root, adjective and
        verb tags - confirm against the Komoran tag set).
    """
    kept_markers = ('/NN', '/XR', '/VA', '/VV')
    return [
        token
        for token in komoran.pos(sent, join=True)
        if any(marker in token for marker in kept_markers)
    ]

In [None]:
import numpy as np
from sklearn.preprocessing import normalize

def pagerank(x, df=0.85, max_iter=30, bias=None):
    """
    Run a fixed number of PageRank power iterations on a graph.

    Arguments
    ---------
    x : scipy.sparse.csr_matrix
        shape = (n vertex, n vertex)
        Every column is L1-normalized before iterating.
    df : float
        Damping factor, 0 < df < 1
    max_iter : int
        Number of iterations. All of them are always executed; there is no
        convergence test.
    bias : numpy.ndarray or None
        If None, equal bias. Otherwise it is reshaped to a column and
        rescaled so that its entries sum to n vertex.
    Returns
    -------
    R : numpy.ndarray
        PageRank vector. shape = (n vertex, 1)
    Raises
    ------
    AssertionError
        If df is not in (0, 1) or bias does not hold one value per vertex.
    """

    assert 0 < df < 1

    # initialize
    A = normalize(x, axis=0, norm='l1')
    n_vertex = A.shape[0]
    R = np.ones(n_vertex).reshape(-1, 1)

    # check bias
    if bias is None:
        bias = (1 - df) * np.ones(n_vertex).reshape(-1, 1)
    else:
        bias = bias.reshape(-1, 1)
        # Validate the size before the bias is used in any arithmetic.
        assert bias.shape[0] == n_vertex
        bias = n_vertex * bias / bias.sum()
        bias = (1 - df) * bias

    # iteration
    for _ in range(max_iter):
        # `@` is a matrix product for sparse matrices, sparse arrays and dense
        # arrays alike; `*` is only a matrix product for the sparse *matrix*
        # classes and would silently broadcast elementwise otherwise.
        R = df * (A @ R) + bias

    return R

In [None]:
from collections import Counter
import math
import numpy as np
import scipy as sp
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances


def sent_graph(sents, tokenize=None, min_count=2, min_sim=0.3,
    similarity=None, vocab_to_idx=None, verbose=False):
    """
    Build a sentence-to-sentence similarity graph.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) return list of str
    min_count : int
        Minimum term frequency. Only used when the vocabulary is scanned
        here, i.e. when vocab_to_idx is None.
    min_sim : float
        Minimum similarity between sentences
    similarity : str or None
        'cosine' selects cosine similarity. Any other value, including None,
        'textrank' or a callable, selects the TextRank similarity; callables
        are NOT invoked by this function.
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, this function scan vocabulary first.
    verbose : Boolean
        If True, verbose mode on
    Returns
    -------
    sentence similarity graph : scipy sparse matrix
        shape = (n sents, n sents)
    """

    if vocab_to_idx is None:
        # Only the term -> column mapping is needed to vectorize sentences.
        _, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)

    x = vectorize_sents(sents, tokenize, vocab_to_idx)
    if similarity == 'cosine':
        return numpy_cosine_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    return numpy_textrank_similarity_matrix(x, min_sim, verbose, batch_size=1000)

def vectorize_sents(sents, tokenize, vocab_to_idx):
    """
    Turn raw sentences into a sparse bag-of-words count matrix.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns the tokens of one sentence
    vocab_to_idx : dict
        Token to column index mapper. Tokens that are missing from it
        (or mapped to -1) are ignored.
    Returns
    -------
    x : scipy.sparse.csr_matrix
        shape = (n sents, n vocabs), x[i, j] = count of term j in sentence i
    """
    sent_ids, term_ids, counts = [], [], []
    for sent_no, sent in enumerate(sents):
        for term, freq in Counter(tokenize(sent)).items():
            col = vocab_to_idx.get(term, -1)
            if col != -1:
                sent_ids.append(sent_no)
                term_ids.append(col)
                counts.append(freq)
    shape = (len(sents), len(vocab_to_idx))
    return csr_matrix((counts, (sent_ids, term_ids)), shape=shape)

def numpy_cosine_similarity_matrix(x, min_sim=0.3, verbose=True, batch_size=1000):
    """
    Compute a thresholded cosine similarity matrix between the rows of x.

    Rows are processed in batches of batch_size so that only one
    (batch, n rows) dense similarity block exists at a time.

    Arguments
    ---------
    x : scipy.sparse.csr_matrix
        Row vectors, shape = (n rows, n features)
    min_sim : float
        Similarities below this value are dropped
    verbose : Boolean
        If True, print progress
    batch_size : int
        Number of rows per batch
    Returns
    -------
    mat : scipy sparse matrix
        shape = (n rows, n rows), entries are 1 - cosine distance
    """
    n_rows = x.shape[0]
    n_batches = math.ceil(n_rows / batch_size)
    blocks = []
    for batch_no in range(n_batches):
        start = int(batch_no * batch_size)
        stop = min(n_rows, int((batch_no + 1) * batch_size))
        sims = 1 - pairwise_distances(x[start:stop], x, metric='cosine')
        keep_r, keep_c = np.nonzero(sims >= min_sim)
        block = csr_matrix(
            (sims[keep_r, keep_c], (keep_r, keep_c)),
            shape=(stop - start, n_rows))
        blocks.append(block)
        if verbose:
            print('\rcalculating cosine sentence similarity {} / {}'.format(start, n_rows), end='')
    stacked = sp.sparse.vstack(blocks)
    if verbose:
        print('\rcalculating cosine sentence similarity was done with {} sents'.format(n_rows))
    return stacked

def numpy_textrank_similarity_matrix(x, min_sim=0.3, verbose=True, min_length=1, batch_size=1000):
    """
    Compute a thresholded TextRank similarity matrix between the rows of x.

    sim(i, j) = (number of distinct terms shared by i and j)
                / (log(len_i) + log(len_j))
    where len is the row sum of x.

    Arguments
    ---------
    x : scipy.sparse.csr_matrix
        Term count matrix, shape = (n sents, n terms)
    min_sim : float
        Similarities below this value are dropped
    verbose : Boolean
        If True, print progress
    min_length : int
        Rows whose sum is <= min_length get their length replaced by 10000,
        which pushes their similarities towards zero
    batch_size : int
        Number of rows per batch
    Returns
    -------
    mat : scipy sparse matrix
        shape = (n sents, n sents)
    """
    n_rows, n_cols = x.shape

    # 0/1 matrix marking which terms occur in which sentence
    nz_r, nz_c = x.nonzero()
    presence = csr_matrix(
        (np.ones(nz_r.shape[0]), (nz_r, nz_c)), shape=(n_rows, n_cols))
    presence_t = presence.transpose()

    # log of sentence length, with too-short sentences inflated first
    lengths = np.asarray(x.sum(axis=1)).reshape(-1)
    lengths[lengths <= min_length] = 10000
    log_len = np.log(lengths)

    blocks = []
    n_batches = math.ceil(n_rows / batch_size)
    for batch_no in range(n_batches):
        start = int(batch_no * batch_size)
        stop = min(n_rows, int((batch_no + 1) * batch_size))

        # shared distinct terms between batch rows and all rows
        shared = presence[start:stop, :] @ presence_t

        # 1 / (log len_i + log len_j); infinite factors are zeroed
        scale = log_len[start:stop].reshape(-1, 1) + log_len.reshape(1, -1)
        scale = scale ** (-1)
        scale[scale == np.inf] = 0

        sim = shared.multiply(scale).tocsr()
        keep_r, keep_c = (sim >= min_sim).nonzero()
        values = np.asarray(sim[keep_r, keep_c]).reshape(-1)
        blocks.append(
            csr_matrix((values, (keep_r, keep_c)), shape=(stop - start, n_rows)))

        if verbose:
            print('\rcalculating textrank sentence similarity {} / {}'.format(start, n_rows), end='')

    stacked = sp.sparse.vstack(blocks)
    if verbose:
        print('\rcalculating textrank sentence similarity was done with {} sents'.format(n_rows))

    return stacked

def graph_with_python_sim(tokens, verbose, similarity, min_sim):
    """
    Build a sentence graph with a pure-Python pairwise similarity.

    Only pairs (i, j) with i < j are scored, so the result is strictly
    upper triangular.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentences
    verbose : Boolean
        If True, print progress every 1000 sentences
    similarity : callable or str
        'cosine' uses cosine_sent_sim, a callable is used as is,
        anything else uses textrank_sent_sim
    min_sim : float
        Pairs scoring below this value are dropped
    Returns
    -------
    graph : scipy.sparse.csr_matrix
        shape = (n sents, n sents)
    """
    if similarity == 'cosine':
        score_pair = cosine_sent_sim
    elif callable(similarity):
        score_pair = similarity
    else:
        score_pair = textrank_sent_sim

    n_sents = len(tokens)
    src, dst, weights = [], [], []
    for i, left in enumerate(tokens):
        if verbose and i % 1000 == 0:
            print('\rconstructing sentence graph {} / {} ...'.format(i, n_sents), end='')
        for j, right in enumerate(tokens):
            if j > i:
                score = score_pair(left, right)
                # `not <` (rather than `>=`) keeps NaN scores, like a
                # "skip when score < min_sim" rule does.
                if not score < min_sim:
                    src.append(i)
                    dst.append(j)
                    weights.append(score)
    if verbose:
        print('\rconstructing sentence graph was constructed from {} sents'.format(n_sents))
    return csr_matrix((weights, (src, dst)), shape=(n_sents, n_sents))

def textrank_sent_sim(s1, s2):
    """
    TextRank sentence similarity: shared distinct tokens over log lengths.

    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences
    Returns
    -------
    Sentence similarity : float
        Non-negative number. 0 when either sentence has at most one token.
    """
    len1, len2 = len(s1), len(s2)
    if min(len1, len2) <= 1:
        return 0
    shared = set(s1) & set(s2)
    return len(shared) / (math.log(len1) + math.log(len2))

def cosine_sent_sim(s1, s2):
    """
    Cosine similarity between the token count vectors of two sentences.

    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences
    Returns
    -------
    Sentence similarity : float
        Non-negative number. 0 when either sentence is empty.
    """
    if not (s1 and s2):
        return 0

    counts1, counts2 = Counter(s1), Counter(s2)
    dot = sum(freq * counts2.get(term, 0) for term, freq in counts1.items())
    length1 = math.sqrt(sum(freq ** 2 for freq in counts1.values()))
    length2 = math.sqrt(sum(freq ** 2 for freq in counts2.values()))
    return dot / (length1 * length2)

In [None]:
import numpy as np
class KeywordSummarizer:
    """
    Keyword extractor that ranks words with PageRank over a word graph.

    NOTE(review): ``word_graph`` is called by ``train_textrank`` but is not
    defined in the visible part of this file - confirm it is provided
    elsewhere before use.

    Arguments
    ---------
    sents : list of str
        Sentence list. If given, the model is trained immediately.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words will be used to construct sentence graph
    window : int
        Word cooccurrence window size. Default is -1.
        '-1' means there is cooccurrence between two words if the words occur in a sentence
    min_cooccurrence : int
        Minimum cooccurrence frequency of two words
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        window=-1, min_cooccurrence=2, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Build the word graph and store its PageRank vector in ``self.R``.

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
        Returns
        -------
        None
        """

        g, self.idx_to_vocab = word_graph(sents,
            self.tokenize, self.min_count,self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """
        Return the highest ranked words of the trained model.

        Arguments
        ---------
        topk : int
            Number of keywords selected from TextRank.
            Zero or a negative value yields an empty list.
        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank), best rank first
        Raises
        ------
        RuntimeError
            If the model has not been trained yet
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use summarize function')
        # Reverse first, then take a prefix: slicing with `[-topk:]` would
        # return *every* word for topk == 0 because -0 == 0.
        best_first = self.R.argsort()[::-1][:max(topk, 0)]
        return [(self.idx_to_vocab[idx], self.R[idx]) for idx in best_first]

    def summarize(self, sents, topk=30):
        """
        Train on sents and return the top keywords in one call.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of keywords selected from TextRank
        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """

        self.train_textrank(sents)
        return self.keywords(topk)

        

class KeysentenceSummarizer:
    """
    Key-sentence extractor that ranks sentences with PageRank over a
    sentence similarity graph.

    Arguments
    ---------
    sents : list of str
        Sentence list. If given, the model is trained immediately.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words will be used to construct sentence graph
    min_sim : float
        Minimum similarity between sentences in sentence graph
    similarity : str
        available similarity = ['cosine', 'textrank']
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        min_sim=0.3, similarity=None, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Build the sentence graph and store its PageRank vector in ``self.R``.

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)
        Returns
        -------
        None
        """
        g = sent_graph(sents, self.tokenize, self.min_count,
            self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
        """
        Train on sents and return the highest ranked sentences.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of key-sentences to be selected.
            Zero or a negative value yields an empty list.
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)
        Returns
        -------
        keysents : list of tuple or False
            Each tuple stands for (sentence index, rank, sentence), best rank
            first. False is returned (after printing sents) when a selected
            sentence cannot be looked up in sents.
        Raises
        ------
        ValueError
            If bias is neither None nor a numpy.ndarray of shape (n_sents,)
        Usage
        -----
            >>> from textrank import KeysentenceSummarizer
            >>> summarizer = KeysentenceSummarizer(tokenize = tokenizer, min_sim = 0.5)
            >>> keysents = summarizer.summarize(texts, topk=30)
        """
        n_sents = len(sents)
        if isinstance(bias, np.ndarray):
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))

        self.train_textrank(sents, bias)
        # Reverse first, then take a prefix: slicing with `[-topk:]` would
        # return *every* sentence for topk == 0 because -0 == 0.
        best_first = self.R.argsort()[::-1][:max(topk, 0)]
        try:
            keysents = [(idx, self.R[idx], sents[idx]) for idx in best_first]
        except Exception:
            # Best-effort fallback kept for callers that test for False; only
            # ordinary errors are caught so Ctrl-C / SystemExit still propagate.
            print(sents)
            keysents = False

        return keysents

In [None]:
from collections import Counter
from scipy.sparse import csr_matrix
import numpy as np


def scan_vocabulary(sents, tokenize=None, min_count=2):
    """
    Collect the vocabulary of a corpus, most frequent term first.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency; rarer terms are left out
    Returns
    -------
    idx_to_vocab : list of str
        Vocabulary list, sorted by descending frequency
        (ties keep first-seen order)
    vocab_to_idx : dict
        Vocabulary to index mapper.
    """
    term_freq = Counter(w for sent in sents for w in tokenize(sent))
    idx_to_vocab = [
        term for term, freq in term_freq.most_common() if freq >= min_count
    ]
    vocab_to_idx = dict(zip(idx_to_vocab, range(len(idx_to_vocab))))
    return idx_to_vocab, vocab_to_idx

def tokenize_sents(sents, tokenize):
    """
    Apply the tokenizer to every sentence.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str (word sequence)
    Returns
    -------
    tokenized sentence list : list of list of str
    """
    return list(map(tokenize, sents))

def vectorize(tokens, vocab_to_idx):
    """
    Turn tokenized sentences into a sparse bag-of-words count matrix.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper. Terms that are missing from it
        (or mapped to -1) are ignored.
    Returns
    -------
    sentence bow : scipy.sparse.csr_matrix
        shape = (n_sents, n_terms)
    """
    triplets = [
        (sent_no, vocab_to_idx[term], freq)
        for sent_no, sent_tokens in enumerate(tokens)
        for term, freq in Counter(sent_tokens).items()
        if vocab_to_idx.get(term, -1) != -1
    ]
    sent_ids = [sent_no for sent_no, _, _ in triplets]
    term_ids = [col for _, col, _ in triplets]
    counts = [freq for _, _, freq in triplets]
    shape = (len(tokens), len(vocab_to_idx))
    return csr_matrix((counts, (sent_ids, term_ids)), shape=shape)


In [None]:
import pandas as pd
import re

# Sample string containing an emoji, printed as a quick visual check.
text = '안녕하세요 반갑습니다🐶'
print(text)

# Matches runs of characters outside the Basic Multilingual Plane
# (U+10000 and above), which is where most emoji live.
only_BMP_pattern = re.compile("[\U00010000-\U0010FFFF]+", flags=re.UNICODE)

path = '/content/drive/My Drive/닥터송 여름 프로젝트/4. 대-스타 해결 2/데이터/unlabeled_data_excel.csv'
df = pd.read_csv(path)
data = df.to_numpy()

# Bucket index per review type; every other type falls into bucket 1.
# Column 5 of each row is read as the review type and column 3 as the text.
# NOTE(review): a missing text cell (NaN) would make the substitution raise
# TypeError - confirm the CSV has no empty texts.
bucket_of_type = {'interviewReview': 0, 'recruitReview': 2}
test = [[] for _ in range(3)]
for content in data:
    bucket = bucket_of_type.get(content[5], 1)
    test[bucket].append(only_BMP_pattern.sub(r'', content[3]))

print(test[0][0], test[1][0], test[2][0])


안녕하세요 반갑습니다🐶
안녕하세요 반갑습니다
수락하시고 전화 인터뷰 진행해주셨고요. 첫째 아이 돌봄 구인 글을 보고 지원을 하였는데, 전화 인터뷰 진행하실 때 둘째 아이 방학이 다가오는 데 그때도 괜찮냐고 물어보셨지만, 둘째 아이가 방학하는 주에는 선약 돌봄 집들이 있다고 말씀드리니, 첫째 아이와 둘째 아이를 같이 돌볼 수 있는 분을 찾으신다고 하셔서 아쉽게도 뵙지 못했습니다. 아이들과 새해에도 행복하세요. 연락이 오지 않아 뵙지 못하였습니다 돌보시다가 시간이 맞지 않으셨는지 곧 그만두신 다 하셨습니다. 일하시다 불만족스러운 신 부분을 나름 조율해 드리려 했으나, 다른 조건 조율 과정 없이 바로 그만두시겠다 하셔서 워킹맘 입장에서 매우 난처했습니다.


In [None]:
# Rank the sentences of bucket 2 (recruit reviews) and show the ten best.
summarizer = KeysentenceSummarizer(tokenize=komoran_tokenize, min_sim=0.5, verbose=True)

keysents = summarizer.summarize(test[2], topk=10)
for entry in keysents:
    sent_idx, rank, sent = entry
    print(f'{sent_idx} : {rank} :: {sent}')

calculating textrank sentence similarity was done with 3321 sents
trained TextRank. n sentences = 3321
1808 : 5.2517252145090225 :: 아이가 첫째 날에는 낯가림도 하고 자 다 깬지 얼마 안 돼서 잠투정이 있었던 건지 그날은 내내 기분이 좋다가도 금방 다시 엄마를 찾았는데, 둘째 날은 얼굴 보자마자 웃어주고 인사도 해줬습니다. 돌봐주는 내내 기분 좋아서 잘 웃고 떼를 쓰지도 않았습니다. 아기가 에너지가 넘치고 자기주장이 확실한 편이라 자기가 무얼 원하는지 분명하게 말합니다. 그래서 아이가 지 금 원하는 해줄 수 있어서 저는 마음이 조금 편했어요. 어머님도 굉장히 유쾌하셔서 그런지 아기도 굉장히 잘 웃고 애정을 나누는 방법을 알고 있는 것 같아요. 활동비는 제가 편하게 마지막 날에 받겠다고 했고 봉투에 감사하게 글까지 적어서 주셨어요. 활동도 미리 말씀해 주신 것 외에 따로 시키신 것도 없었고 시간도 정확하게 지켜주셨습니다~ 저는 사 정상 저녁에는 시간 내기가 어려워 계속 돌봐줄 수는 없었지만 좋은 분 만나셨으면 좋겠습니다
904 : 4.940581320667702 :: 집에 13개월 아가와 5살 남아가 있는 집입니다. 13개월 아가는 제가 케어하고 5살 남아의 하원 및 돌봄이 2주 동안(9일) 필요해서 신청드렸습니다. 그러나 4일 동안 아이를 봐주시는 중에 2일은 10분 이상 지각하셨고 아이가 화장실을 갈 때도 저에게 맡기시더군요. 그리고 중간에 돌봄을 그만둔 가장 큰 이유는 아이 저녁 때문입니다. 돌봄 시간이 4 시에서 7시였는데 아이들 저녁 이 애매해서 13개월 아이와 5살 아이를 같이 먹이고 처음에는 시터 분도 저녁을 같이 제공해드렸습니다. 그럼 5살 아이는 저녁을 먹여주시거나 케어해 주셔야 하는데 혼자 맛있게 40여 분 동안 드시더군요. 이틀 저녁을 드렸었는데 변하지를 않아서 나머지 이틀은 챙겨드리지 않았어요. 제가 혼자 아가 둘을 케어하기가 좀 힘들더라

In [None]:
# Print the top-20 keywords (word, rank) for each of the three buckets in `test`.
# NOTE(review): `keyword_extractor` is not defined in the visible part of this
# notebook - presumably a KeywordSummarizer instance created elsewhere; confirm
# before running.
for i in range(3):
  print(f'=================={i+1}번째====================')
  keywords = keyword_extractor.summarize(test[i], topk=20)
  for word, rank in keywords:
      print(word, rank)

하/VV 69.64618576560952
되/VV 35.1358039670359
좋/VA 32.1107361774536
연락/NNG 30.531011295014977
시간/NNG 22.929137049708736
분/NNB 22.565207247408395
같/VA 17.745956844072886
채용/NNG 17.58230697588734
있/VV 17.54476093750468
맞/VV 16.60933009293096
드리/VV 16.310709033545425
아이/NNG 16.000918197881266
시/NNB 15.678659905661148
아쉽/VA 15.597821633460688
터/NNB 15.529265569233008
만나/VV 15.500843622787828
것/NNB 15.19944150059622
없/VA 14.160044494308952
인터뷰/NNP 12.551800315079223
바라/VV 12.041957877267377
연락/NNG 68.53064480202643
하/VV 36.6042609177784
좋/VA 31.183454104570764
되/VV 26.798283734699638
수락/NNG 24.86154221027692
없/VA 18.703968590307664
오/VV 16.50493124601051
드리/VV 15.493513267783275
터/NNB 15.109818119243958
만나/VV 15.048063989529226
아쉽/VA 14.969826866741592
시간/NNG 14.499242669122124
분/NNB 14.071351207760484
시/NNB 13.113364448694018
바라/VV 12.57304038963731
다음/NNG 11.397023194240061
기회/NNG 10.648297177674523
있/VV 10.541441725771532
닿/VV 9.941231321785189
기다리/VV 9.505990095243167
하/VV 78.08706970710

In [None]:
# Collect the top-20 keywords of every bucket and write them to an Excel file.
# NOTE(review): despite the TF_IDF name, the stored score is whatever rank
# `keyword_extractor.summarize` returns; `keyword_extractor` itself is not
# defined in the visible part of this notebook.
tmp_word, tmp_label, tmp_score = [], [], []

for i in range(3):
    print(f'=================={i+1}번째====================')
    keywords = keyword_extractor.summarize(test[i], topk=20)
    for word, rank in keywords:
        tmp_label.append(i)
        tmp_word.append(word)
        tmp_score.append(rank)

TF_IDF = pd.DataFrame()
for column, values in (('word', tmp_word), ('score', tmp_score), ('label', tmp_label)):
    TF_IDF[column] = values

TF_IDF.to_excel('/content/TF_IDF.xlsx', index=False)

