# 표제어 추출

표제어(Lemmatization) 추출 : 서로 형태는 다르지만, root 단어를 가지고 비교해서, 전체적으로 단어의 개수를 줄이자.

am, are, is, was, were => be(표제어)

형태소 : stem (어간, 단어의 의미), affix(접사, 부가적 의미)
어간, 접사를 분리하는 작업

dog(독립형태소)
dogs = dog(어간) + s(접사)

WordNetLemmatizer : NLTK의 표제어 추출 도구

'''

In [1]:
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd

In [42]:
wnl = WordNetLemmatizer()

#               단어      품사
wnl.lemmatize('watched', 'v')# watched
wnl.lemmatize('has', 'v')    # have
wnl.lemmatize('was', 'v')    # be
wnl.lemmatize('gone', 'v')   # go

'go'

# 어간 추출

In [27]:
text = 'Python is an interpreted, high-level, general-purpose programming language.'

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

In [33]:
words = word_tokenize(text)
change=[]
for i in words:
    change.append((i,ps.stem(i)))

print(change)

[('Python', 'python'), ('is', 'is'), ('an', 'an'), ('interpreted', 'interpret'), (',', ','), ('high-level', 'high-level'), (',', ','), ('general-purpose', 'general-purpos'), ('programming', 'program'), ('language', 'languag'), ('.', '.')]


In [35]:
print(ps.stem('electricical'))  # electric
print(ps.stem('formalize'))  # formal

# 구글에 마틴포터 or 포터스태머 검색 (영문 스태밍)

electric
formal


In [41]:
print(ps.stem('going'))
print(ps.stem('gone'))

# 다른 stemming
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
print(ls.stem('going'))
print(ls.stem('gone'))
print(ls.stem('dies'))

go
gone
going
gon
die


# 불용어 (stopwords)

In [43]:
from nltk.corpus import stopwords

In [52]:
sw = stopwords.words('english')

ex = "Family is not an important thing. It's everything"

# 토큰화
wt = word_tokenize(ex)

for i in wt:
    if i in sw:
        wt.remove(i)

wt

['Family', 'not', 'important', 'thing', '.', 'It', "'s", 'everything']

In [49]:
# https://www.ranks.nl/stopwords/korean : 한글 불용어 사전

In [55]:
ex = '최근 코로나19로 인한 감염으로 인해 확진자 및 사망자가 증가하고 있습니다. 코로나 19를 이겨냅시다.'

# 불용어 사전 생성
stop_words = '인한 증가 최근 및'
stop_words = stop_words.split(' ')

# 단어 토큰화
wt = word_tokenize(ex)

print(wt)

# 불용어 제거
res = []
for w in wt:
    if w not in stop_words:
        res.append(w)
        
print(res)

['최근', '코로나19로', '인한', '감염으로', '인해', '확진자', '및', '사망자가', '증가하고', '있습니다', '.', '코로나', '19를', '이겨냅시다', '.']
['코로나19로', '감염으로', '인해', '확진자', '사망자가', '증가하고', '있습니다', '.', '코로나', '19를', '이겨냅시다', '.']


In [59]:
from nltk.tokenize import *

# 토큰화 예제

In [72]:
text = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects."

# 문장단위 토큰화
text = sent_tokenize(text)  # 3개의 문장
text

# 단어단위 토큰화
# 모든 단어를 소문자, 불용어 제거, 길이가 2이하인 단어 제거
res = []

for i in text:
    r = []
    words = word_tokenize(i)
    for w in words:
        word = w.lower()
        if (word not in sw) & (len(word)>2):
            r.append(word)
    res.append(r)
print(res)

[['python', 'interpreted', 'high-level', 'general-purpose', 'programming', 'language'], ['created', 'guido', 'van', 'rossum', 'first', 'released', '1991', 'python', 'design', 'philosophy', 'emphasizes', 'code', 'readability', 'notable', 'use', 'significant', 'whitespace'], ['language', 'constructs', 'object-oriented', 'approach', 'aim', 'help', 'programmers', 'write', 'clear', 'logical', 'code', 'small', 'large-scale', 'projects']]
['language', 'constructs', 'object-oriented', 'approach', 'aim', 'help', 'programmers', 'write', 'clear', 'logical', 'code', 'small', 'large-scale', 'projects']


In [116]:
text = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects."

# 문장단위 토큰화
text = sent_tokenize(text)  # 3개의 문장
text

# 단어단위 토큰화
# 모든 단어를 소문자, 불용어 제거, 길이가 2이하인 단어 제거
res = []
voc = {}
sentences = []

for i in text:
    res = []
    words = word_tokenize(i)
    for w in words:
        word = w.lower()
        # 어근 제거
        # word = ps.stem(word)
        if (word not in sw) & (len(word)>2):
            res.append(word)
            # 딕셔너리의 단어 출현 빈도 추가
            if word not in voc:
                voc[word] = 0
            voc[word] += 1
    sentences.append(res)

# print(res)
# print(voc)
print(sentences)

[['python', 'interpreted', 'high-level', 'general-purpose', 'programming', 'language'], ['created', 'guido', 'van', 'rossum', 'first', 'released', '1991', 'python', 'design', 'philosophy', 'emphasizes', 'code', 'readability', 'notable', 'use', 'significant', 'whitespace'], ['language', 'constructs', 'object-oriented', 'approach', 'aim', 'help', 'programmers', 'write', 'clear', 'logical', 'code', 'small', 'large-scale', 'projects']]


In [86]:
sorted(voc.items(), key= lambda x:x[0])  # key 기준 오름차순 정렬
vs = sorted(voc.items(), key= lambda x:x[1], reverse=True)  # value 기준 내림차순 정렬

In [90]:
wi = {}
idx = 0 

# 2번이상 언급된 단어 딕셔너리 생성
for w, f in vs:
    if f > 1 :
        idx+=1
        wi[w] = idx # 인덱스 부여
print(wi)

{'python': 1, 'language': 2, 'code': 3}


In [109]:
vocSize = 2  # 가장 많이 언급 된 2개의 단어만 추출
# 인덱스가 3번 이상인 단어는 제거(1번, 2번만 남김)

# 단어의 인덱스가 vocSize를 초과하는 단어 추출
wordFreq = [w for w, i in wi.items() if i > vocSize]
wordFreq

# 초과하는 단어 제거
for w in wordFreq:
    # 바로 제거됨
    del wi[w]
    
wordFreq

[]

# OOV (out of voc) : 단어집합에 없는 단어

In [117]:
'''
영수 : 철수야 안녕?  (입력 데이터, X)
철수 : 응 너도 안녕. (출력 데이터, Y)
              
              수치화
철수야 안녕? => 모델 => 응 너도 안녕.

철수 안녕   => 모델  => 응 너 안녕
'''

'\n영수 : 철수야 안녕?  (입력 데이터, X)\n철수 : 응 너도 안녕. (출력 데이터, Y)\n\n철수야 안녕? => 모델 => 응 너도 안녕.\n'

In [125]:
# 원핫인코딩
from konlpy.tag import Okt

okt = Okt()

In [130]:
# 형태소로 나누기
tok = okt.morphs('나는 자연어처리를 학습한다.')
tok
# 원핫벡터 : 단어 집합을 벡터로 표현하는 방식

['나', '는', '자연어', '처리', '를', '학습', '한다', '.']

In [136]:
w2i = {}

for v in tok:
#     print(v)
    if v not in w2i.keys():
        w2i[v] = len(w2i)
print(w2i)

{'나': 0, '는': 1, '자연어': 2, '처리': 3, '를': 4, '학습': 5, '한다': 6, '.': 7}


In [139]:
# 자연어 -> 원핫 -> 0010000
def ohe(w, w2i):
    
    # 리스트 초기화
    ohv = [0] * len(w2i)
    
    # 해당 단어의 인덱스 추출
    idx = w2i[w]
    
    # 리스트의 인덱스 위치를 1로 변경
    ohv[idx] = 1
    
    return ohv

ohe('자연어', w2i)

[0, 0, 1, 0, 0, 0, 0, 0]

In [166]:
# 케라스 원핫인코딩

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

tok = Tokenizer()

text = '데이터 분석은 판다스 최고야 판다스 곰이야'

In [182]:
tok.fit_on_texts([text])

# 공백으로 구분된 단어별 인덱스 딕셔너리가 생성
tok.word_index  # 단어집합 (VOC)
# {'판다스': 1, '데이터': 2, '분석은': 3, '최고야': 4, '곰이야': 5}

sample = '판다스 분석은 동물원에서 한다'

# 단어집합에 있는 단어들의 인덱스
enc = tok.texts_to_sequences([sample])[0]  # [1, 3]

# voc에 있는 단어들 원핫 인코딩
to_categorical(enc)


array([[0., 1., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)

# BPE : byte pair encoding (단어분리) => 기계번역
학습과정에서 사용되지 않은 단어가 테스트 과정에서 입력되면 -> OOV 문제발생 => 모델이 제대로 동작 x

run-length 기법  (비트맵 이미지)
: aaaabbbaaaa => a4b3a5

허프만 트리를 이용한 압축  (jpeg)
: a=101, b=10, c=1101 ... 

BPE 압축 알고리즘 기법 => 단어분리에 응용
: AAABDAAABAC => xDxAC
연속적인 글자 쌍(2글자)을 구성했을 때 가장 많이 등장한 쌍
1) AA가 가장 많이 등장 => z로 치환
 zABDzABAC
 
2) AB가 가장 많이 등장 => y로 치환
 zyDzyAC

3) zy가 가장 많이 등장 => x로 치환
 xDxAC
 
 
 BPE : 단어분리 알고리즘 
  => 글자단위
  
 1) 단어집합 (VOC)
  => 'low, lower, newest, widest'
 ex) lowesr 입력 => OOV에 문제
            | 해결
            V
2) BPE 알고리즘
{low:5, lower:2, newest:6, widest:3}
  1. 알파벳단위로 분리 l, o, w, e, r, n, w, s, t, i, d
      (N-gram) lo:7, ow:7, we:8, .. es:9 .. 
  2. 가장 빈번 (es)                                                 es를 한 글자로 처리
      {low:5, lower:2, newest:6, widest:3} => {low:5, lower:2, new(es)t:6, wid(es)t:3}
  3. 빈도수 높은 쌍 다시 찾기 ((es)t)가 9쌍으로 최빈
      {low:5, lower:2, new(es)t:6, wid(es)t:3} => {low:5, lower:2, new(est):6, wid(est):3}
  4. 빈도수 높은 쌍 다시 찾기 (lo)가 7쌍으로 최빈
      {low:5, lower:2, new(est):6, wid(est):3} => {(lo)w:5, (lo)wer:2, new(est):6, wid(est):3}
  5. 빈도수 높은 쌍 다시 찾기 ((lo)w)가 7쌍으로 최빈
      (lo)w:5, (lo)wer:2, new(est):6, wid(est):3} => (low):5, (low)er:2, new(est):6, wid(est):3}
                                    유일한 쌍이 나올 때 까지 반복
  n회 반복
  l, o, w, e, r, n, w, s, t, i, d, es, est, lo, low, ne, new, newest, wi, wid, widest
                     ^
                     |  참조
  1. 테스트 과정에서 'lowest' 입력
  2. 글자단위로 분할 l, o, w, e, s, t
  3. low / est 로 분할하여 확인하면 OOV가 아니다 판단

# 언어 모델? 언어를 모델링(단어 순서에 대한 확률)
 => 통계 : 베이즈이론 (조건부확률) p(이전단어|다음단어)
 => 인공신경망 : 
                <  <  ngram  >  >
    ex) CBOW 나는 오늘 (    ) 타고 집에 갑니다.
    
                    >  >  ngram  <  <
    ex) Skipgram 나는 오늘 (    ) 타고 집에 갑니다.
    
 = > 기계번역
             O                   X
     p(나는 전철을 탔다) > p(나는 전철을 태운다) 
 

# 토픽모델링 => LSA

In [184]:
from sklearn.datasets import fetch_20newsgroups

In [192]:
dataser = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers','fotters','quotes'))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [198]:
dataset = dataser
documents = dataset.data
len(documents)  # 11314건의 뉴스기사

type(documents)  # list
documents[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [202]:
dataset.target_names  # 20개의 뉴스의 카테고리

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [200]:
documents[1]  # 필요없는 문자가 상당히 많음 => 정규표현식으로 제거

'\n\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can\'t pity you, Jim.  And I\'m sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won\'t be bummin\' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don\'t forget your Flintstone\'s Chewables!  :) \n--\nBake Timmons, III\n\n-- "...there\'s nothing higher, stronger, more wholesome and more useful in life\nthan some good memory..." -- Alyosha in Brothers Karamazov (Dostoevsky)\n'

topic1 토픽별 가장 관련성이 높은 단어를 10개씩 출력
~ 
topic20 
으로 구분

In [229]:
newsDf = pd.DataFrame({'documents':documents})
newsDf

Unnamed: 0,documents
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\n\nYeah, do you expect people to ..."
2,\n Although I realize that principle is not ...
3,\n Notwithstanding all the legitimate fuss ...
4,"Well, I will have to change the scoring on my ..."
...,...
11309,"Danny Rubenstein, an Israeli journalist, will ..."
11310,\n\n\nAn apt description of the content of jus...
11311,\nI agree. Home runs off Clemens are always m...
11312,I used HP DeskJet with Orange Micros Grappler ...


In [231]:
newsDf['clean_doc'] = newsDf['documents'].str.replace('[^a-zA-Z ]',' ')
newsDf

Unnamed: 0,documents,clean_doc
0,Well i'm not sure about the story nad it did s...,Well i m not sure about the story nad it did s...
1,"\n\n\n\n\n\n\n\nYeah, do you expect people to ...",Yeah do you expect people to read the...
2,\n Although I realize that principle is not ...,Although I realize that principle is not o...
3,\n Notwithstanding all the legitimate fuss ...,Notwithstanding all the legitimate fuss a...
4,"Well, I will have to change the scoring on my ...",Well I will have to change the scoring on my ...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",Danny Rubenstein an Israeli journalist will ...
11310,\n\n\nAn apt description of the content of jus...,An apt description of the content of just a...
11311,\nI agree. Home runs off Clemens are always m...,I agree Home runs off Clemens are always me...
11312,I used HP DeskJet with Orange Micros Grappler ...,I used HP DeskJet with Orange Micros Grappler ...


In [242]:
# 3글자 이하 제거
newsDf['clean_doc'] = newsDf['clean_doc'].apply(lambda x:' '.join([w for w in x.split(' ') if len(w)>3]))

In [245]:
# 대문자를 소문자로
newsDf['clean_doc'] = newsDf['clean_doc'].apply(lambda x:x.lower())

In [253]:
# 토큰화 (공백을 기준으로 토큰화)
tokenizedDoc = newsDf['clean_doc'].apply(lambda x:x.split())

# 불용어 제거
sw = stopwords.words('english')

tokenizedDoc = tokenizedDoc.apply(lambda x : [item for item in x if item not in sw])

In [265]:
# TF-IDF 매트릭스 구성
# tfidf는 토큰화가 되지 않은 텍스트 데이터로 구성

# 역토큰화
# tokenizedDoc.apply(lambda x : ' '.join(x))  # 내 코드

deTokenizedDoc = []

for i in range(len(newsDf)) :
    temp = ' '.join(tokenizedDoc[i])
    deTokenizedDoc.append(temp)

newsDf['clean_doc'] = deTokenizedDoc
newsDf

Unnamed: 0,documents,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure story seem biased disagree statement...
1,"\n\n\n\n\n\n\n\nYeah, do you expect people to ...",yeah expect people read actually accept hard a...
2,\n Although I realize that principle is not ...,although realize principle strongest points wo...
3,\n Notwithstanding all the legitimate fuss ...,notwithstanding legitimate fuss proposal much ...
4,"Well, I will have to change the scoring on my ...",well change scoring playoff pool unfortunately...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist speaking t...
11310,\n\n\nAn apt description of the content of jus...,description content ronroth posts date least e...
11311,\nI agree. Home runs off Clemens are always m...,agree home runs clemens always memorable kinda...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet orange micros grappler system upd...


In [267]:
# tfidf행렬 구성

from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer(stop_words='english',
               max_features=1000)  # 1,000개의 단어만 사용해서 구성하겠다

In [269]:
res = vector.fit_transform(newsDf['clean_doc'])
res.shape  # (11314, 1000)  // 11314개의 문서 1000개의 단어 행렬

(11314, 1000)

In [297]:
# svd (full, truncated) : 특이값 분해
# 행렬  = U * s * VT

# 절단된 SVD -> 차원 축소
# 토픽 숫자 : n_components

from sklearn.decomposition import TruncatedSVD
svdModel = TruncatedSVD(n_components=20)

In [300]:
# 모델 학습
svdModel.fit(res)

np.shape(svdModel.components_)  # VT = 20, 1000  // 20개의 토픽, 1000개의 단어

(20, 1000)

In [306]:
terms = vector.get_feature_names()  # 1000개의 단어 리스트
terms

['ability',
 'able',
 'accept',
 'access',
 'according',
 'account',
 'action',
 'actions',
 'actually',
 'added',
 'addition',
 'address',
 'administration',
 'advance',
 'advice',
 'agencies',
 'agree',
 'algorithm',
 'allow',
 'allowed',
 'allows',
 'amendment',
 'america',
 'american',
 'americans',
 'analysis',
 'andrew',
 'angeles',
 'anonymous',
 'answer',
 'answers',
 'anti',
 'anybody',
 'apparently',
 'appear',
 'appears',
 'apple',
 'application',
 'applications',
 'apply',
 'appreciate',
 'appreciated',
 'approach',
 'appropriate',
 'april',
 'arab',
 'archive',
 'area',
 'areas',
 'argument',
 'arguments',
 'armenia',
 'armenian',
 'armenians',
 'arms',
 'army',
 'article',
 'articles',
 'asked',
 'asking',
 'assume',
 'atheism',
 'atheists',
 'attack',
 'attempt',
 'author',
 'authority',
 'available',
 'average',
 'avoid',
 'away',
 'base',
 'baseball',
 'based',
 'basic',
 'basically',
 'basis',
 'begin',
 'beginning',
 'belief',
 'beliefs',
 'believe',
 'best',
 'bette

In [315]:
def getTopic(c, fName, n = 10):
    
    for i, t in enumerate(c):
        print('토픽 %d : '%(i+1), [(fName[i], t[i].round(3)) for i in t.argsort()[:-n-1:-1]], end='\n\n')
    
    
getTopic(svdModel.components_, terms)

토픽 1 :  [('like', 0.205), ('know', 0.188), ('people', 0.184), ('think', 0.168), ('good', 0.143), ('time', 0.139), ('thanks', 0.121), ('make', 0.104), ('right', 0.103), ('want', 0.1)]

토픽 2 :  [('thanks', 0.338), ('windows', 0.275), ('mail', 0.177), ('card', 0.171), ('drive', 0.156), ('file', 0.134), ('advance', 0.131), ('email', 0.121), ('software', 0.112), ('program', 0.106)]

토픽 3 :  [('game', 0.381), ('team', 0.324), ('year', 0.273), ('games', 0.245), ('season', 0.187), ('hockey', 0.172), ('players', 0.166), ('play', 0.156), ('good', 0.132), ('league', 0.121)]

토픽 4 :  [('drive', 0.513), ('scsi', 0.203), ('disk', 0.156), ('hard', 0.155), ('card', 0.145), ('drives', 0.141), ('problem', 0.12), ('controller', 0.104), ('apple', 0.101), ('floppy', 0.1)]

토픽 5 :  [('thanks', 0.371), ('drive', 0.358), ('know', 0.265), ('scsi', 0.136), ('advance', 0.123), ('jesus', 0.117), ('people', 0.111), ('mail', 0.104), ('drives', 0.103), ('hard', 0.092)]

토픽 6 :  [('windows', 0.365), ('know', 0.226), 

In [316]:
# BPE 알고리즘 구현
'''
low : 5
lower : 2
newest : 6
widest : 3
'''
#   l, o, w, e, r, n, s, t, i, d, es, est, lo, low, ne, new, newest, wi, wid, widest

'\nlow : 5\nlower : 2\nnewest : 6\nwidest : 3\n'

In [38]:
import re

data = {'low' : 5,
'lower' : 2,
'newest' : 6,
'widest' : 3}

# 첫 VOC
voc = list(set(''.join([term for term, time in data.items()])))

data2 = {}
for term, time in data.items():
    temp = []
    
    for j in term:
        temp.append(j)
    temp = ' '.join(temp)
    data2[str(temp)] = time
    
print(data2)
print(voc)

{'l o w': 5, 'l o w e r': 2, 'n e w e s t': 6, 'w i d e s t': 3}
['l', 'r', 's', 'o', 'w', 'n', 'd', 'e', 't', 'i']


In [39]:
' '.join(['l','r'])

'l r'

In [42]:

max_cnt = 0
max_pair = ''

for i in voc:
    for j in voc:
        if i != j:
            pair = ' '.join([i,j])
            cnt = 0
            print(pair)
            for check in [x for x in data2]:
                temp_cnt = 0
                if re.search(pair, check):
                    temp_cnt += 1
                    temp_cnt *= data2[check]
                    cnt += temp_cnt

#             if cnt > max_cnt:
#                 max_cnt = cnt
#                 max_pair = pair
            
#         change = max_pair.replace(' ','')
#         print(change)
#         voc.append(change)

#         # key값의 공백 제거
#         for i in [x for x in data2]:
#             temp_key = i.replace(max_pair, change)
#     #                 print(temp_key)
#             data2[temp_key] = data2.pop(i)



voc

l r
l s
l o
l w
l n
l d
l e
l t
l i
r l
r s
r o
r w
r n
r d
r e
r t
r i
s l
s r
s o
s w
s n
s d
s e
s t
s i
o l
o r
o s
o w
o n
o d
o e
o t
o i
w l
w r
w s
w o
w n
w d
w e
w t
w i
n l
n r
n s
n o
n w
n d
n e
n t
n i
d l
d r
d s
d o
d w
d n
d e
d t
d i
e l
e r
e s
e o
e w
e n
e d
e t
e i
t l
t r
t s
t o
t w
t n
t d
t e
t i
i l
i r
i s
i o
i w
i n
i d
i e
i t


['l', 'r', 's', 'o', 'w', 'n', 'd', 'e', 't', 'i']

In [490]:
voc

['d', 'o', 'l', 't', 'e', 'n', 's', 'i', 'w', 'r']