### 필요한 라이브러리 불러오기

In [None]:
# Install konlpy with pip, then download konlpy's mecab.sh script from GitHub
# and run it with bash (-x traces every command it executes).
# NOTE(review): the script is piped straight from the master branch into bash;
# consider pinning a specific revision — confirm with the notebook owner.
!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

+ pip install konlpy
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
+ curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh
+ bash -x
+ mecab_dicdir=/usr/local/lib/mecab/dic/mecab-ko-dic
+ set -e
++ uname
+ os=Linux
+ [[ ! Linux == \L\i\n\u\x ]]
+ hash sudo
+ sudo=sudo
+ python=python3
+ hash pyenv
+ at_user_site=
++ check_python_site_location_is_writable
++ python3 -
+ [[ 1 == \0 ]]
+ hash automake
+ hash mecab
+ echo 'mecab-ko is already installed'
mecab-ko is already installed
+ [[ -d /usr/local/lib/mecab/dic/mecab-ko-dic ]]
+ echo 'mecab-ko-dic is already installed'
mecab-ko-dic is already installed
++ python3 -c 'import pkgutil; print(1 if pkgutil.find_loader("MeCab") else 0)'
+ [[ 1 == \1 ]]
+ echo 'mecab-python is already installed'
mecab-python is already installed
+ echo Done.
Done.


In [None]:
from konlpy.tag import Mecab
from itertools import permutations, combinations, product
import numpy as np

### Mecab 예시

In [None]:
# Create the Mecab tagger (reused by the cells below) and POS-tag a sample string.
mecab = Mecab()
mecab.pos('무엇의무엇에')

[('무엇', 'NP'), ('의', 'JKG'), ('무엇', 'NP'), ('에', 'JKB')]

### input words

In [None]:
# Input entries; an entry may contain several space-separated words.
# The commented-out assignments below are alternative sample inputs.
words = ['의열단', '조선 의용군', '활빈당', '조선 의용대']
# words = ['할', '스', '이', '투', '앤']
# words = ['갑', '을', '갑', '임', '일']
# words = ['세', '무', '사', '기', '일']
# words = ['조', '조', '한', '한', '한']

In [None]:
def characters_extract(words):
  """Return every way of picking one initial character per entry of ``words``.

  Each entry is split on whitespace, and the first character of each resulting
  token becomes one alternative for that entry. The result is the Cartesian
  product of the alternatives across all entries.

  Args:
    words: Iterable of strings; an entry may hold several whitespace-separated
      tokens.

  Returns:
    A list of tuples with one character per entry, in ``itertools.product``
    order.
  """
  # One list of candidate initials per input entry.
  initials_per_entry = [
    [token[0] for token in entry.split()]
    for entry in words
  ]
  return list(product(*initials_per_entry))

In [None]:
# Build the candidate character tuples for the sample input and display them.
chars = characters_extract(words)
chars

[('의', '조', '활', '조'),
 ('의', '조', '활', '의'),
 ('의', '의', '활', '조'),
 ('의', '의', '활', '의')]

### window_size 단위로 단어 추출

In [None]:
def words_extract(characters, max_window=4):
  """Build candidate words from ``characters`` and keep those Mecab accepts.

  Window size 1 uses ``characters`` as given (duplicates included). For every
  window size from 2 up to ``max_window`` the distinct permutations of
  ``characters`` of that length are joined into strings. A candidate is kept
  when ``mecab.pos`` analyses it as exactly one morpheme whose tag starts with
  ``N``, ``V`` or ``M``.

  Args:
    characters: Sequence of character strings to combine.
    max_window: Largest window size to try. Defaults to 4, the limit that was
      previously hard-coded.

  Returns:
    A list of ``max_window + 1`` lists; index ``i`` holds the accepted
    candidates of window size ``i`` (index 0 is always empty). The order of
    candidates for window sizes >= 2 is unspecified because they are
    de-duplicated through a set.
  """
  # windows[i] holds every candidate of window size i, accepted or not.
  windows = [[], characters]
  for size in range(2, max_window + 1):
    windows.append([''.join(p) for p in set(permutations(characters, size))])

  # is_noun[i] holds only the candidates of window size i that pass the filter.
  is_noun = [[] for _ in range(max_window + 1)]
  for size in range(1, max_window + 1):
    for candidate in windows[size]:
      tagged = mecab.pos(candidate)  # tag once and reuse the result
      # Keep single-morpheme results tagged N*, V* or M*
      # (nouns, verbs, adjectives, determiners, adverbs).
      if len(tagged) == 1 and tagged[0][1][0] in 'NVM':
        is_noun[size].append(candidate)

  return is_noun

In [None]:
# Run the word extraction on the first character tuple only and display it.
is_noun = words_extract(chars[0])
is_noun

[[], ['의', '조', '활', '조'], ['의조', '조조', '조의', '활의'], [], []]

### 품사태깅 예시

In [None]:
# 관형사 vs 명사
print(mecab.pos('한'))  # 한 -> 관형사
print(mecab.pos('한이'))
print(mecab.pos('한의'))
print(mecab.pos('한이맺히다')) # 한 -> 명사

# 오류
print(mecab.pos('스튜어디스스튜스투'))  # '스투'와 같이 말이 안 되는 단어를 명사로 분류

[('한', 'MM')]
[('한이', 'NNP')]
[('한의', 'NNG')]
[('한', 'NNG'), ('이', 'JKS'), ('맺히', 'VV'), ('다', 'EC')]
[('스튜어디스', 'NNG'), ('스튜', 'NNG'), ('스투', 'NNP')]


### 최종 output

In [None]:
def words_combination(is_noun, characters):
  """Find groups of candidate words that use up ``characters`` exactly.

  All per-window candidate lists are flattened into one pool. Every
  combination of 1 to ``len(characters)`` pool entries is joined into a
  string; the combination is kept when that string is as long as
  ``characters`` and contains each element of ``characters`` exactly as often
  as ``characters`` does.

  Args:
    is_noun: List of candidate-word lists (one list per window size).
    characters: Sequence of characters that must be covered.

  Returns:
    A list of distinct tuples of words, in unspecified order.
  """
  target_length = len(characters)

  # Flatten the per-window buckets into a single candidate pool.
  pool = [word for bucket in is_noun for word in bucket]

  matches = set()
  for size in range(1, target_length + 1):
    for group in combinations(pool, size):
      joined = ''.join(group)
      if len(joined) != target_length:
        continue
      # Every required character must occur as often as in ``characters``.
      if all(joined.count(ch) == characters.count(ch) for ch in characters):
        matches.add(group)

  return list(matches)

In [None]:
# Assemble the word combinations for the first character tuple and display them.
final_combination = words_combination(is_noun, chars[0])
final_combination

[('조', '조', '활의'),
 ('조', '활', '의조'),
 ('의', '조', '활', '조'),
 ('의', '활', '조조'),
 ('조', '활', '조의'),
 ('조조', '활의'),
 ('활', '조', '조의'),
 ('활', '조', '의조')]

### 최종 코드

In [None]:
input_words = ['키', '킥 의', '킼 의', '킫 의']

characters_permutation = characters_extract(input_words)

# Consume candidate character tuples in order until one of them yields at
# least one valid word combination.
output = []
while characters_permutation:
  characters = characters_permutation.pop(0)
  words_by_windows = words_extract(characters)
  output = words_combination(words_by_windows, characters)
  if output:
    break
  print(f'{characters} is impossible.')
else:
  # Reached only when the candidates ran out without a successful result.
  print('All impossible.')

output

('키', '킥', '킼', '킫') is impossible.
('키', '킥', '킼', '의') is impossible.
('키', '킥', '의', '킫') is impossible.


[('키', '킥', '의의'), ('의', '의', '키킥'), ('키', '킥', '의', '의'), ('키킥', '의의')]