In [1]:
import math
import re

## 1. document 읽어들이기

In [2]:
# Locations of the three input documents.
doc1_path = '단어중요도구하기/문서1.txt'
doc2_path = '단어중요도구하기/문서2.txt'
doc3_path = '단어중요도구하기/문서3.txt'

# Load each document as a list of lines; line endings are kept as read.
with open(doc1_path, mode='r', encoding='utf8') as temp:
    doc1 = list(temp)

with open(doc2_path, mode='r', encoding='utf8') as temp:
    doc2 = list(temp)

with open(doc3_path, mode='r', encoding='utf8') as temp:
    doc3 = list(temp)

## 2. document의 word bag 만들기

In [3]:
def remove_special_char(text):
    """Remove punctuation and other special characters from a piece of text.

    Leading and trailing whitespace is stripped first; after that, only
    spaces, ASCII letters and ASCII digits are kept, in their original
    order. Every other character is dropped without leaving a replacement,
    so words joined by punctuation are merged.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Collect the allowed characters one by one and glue them back together.
    return ''.join(re.findall('[ a-zA-Z0-9]', text.strip()))

In [4]:
def doc_to_word_bag(doc):
    """Build a whitespace-delimited word bag from a document.

    Each line is cleaned with ``remove_special_char`` and then split into
    tokens. Tokens are kept in order of appearance, duplicates included, and
    are case-sensitive (no case folding is applied).

    Args:
        doc: An iterable of text lines.

    Returns:
        A list of non-empty word strings.
    """
    word_bag = []
    for line in doc:
        # split() with no separator collapses runs of spaces and yields
        # nothing for an empty line, so no '' tokens enter the bag
        # (split(' ') would emit one for every extra space or blank line).
        word_bag.extend(remove_special_char(line).split())
    return word_bag

In [5]:
# Tokenize the three documents, in order, into their word bags.
word_bag1, word_bag2, word_bag3 = map(doc_to_word_bag, (doc1, doc2, doc3))

## 3. document 내 단어의 Term Frequency 구하기

In [6]:
def count_term_freq(word_bag):
    """Count how many times each word occurs in a word bag.

    Args:
        word_bag: An iterable of words (hashable values).

    Returns:
        A dict mapping each distinct word to its occurrence count, with keys
        in order of first appearance.
    """
    counts = {}
    for token in word_bag:
        # A word not seen yet starts from 0, so its first occurrence becomes 1.
        counts[token] = counts.get(token, 0) + 1
    return counts

In [7]:
# Term frequencies for the three documents, in order.
term_freq1, term_freq2, term_freq3 = map(
    count_term_freq, (word_bag1, word_bag2, word_bag3)
)

## 4. document 내 단어의 Document Frequency 구하기 

In [8]:
def count_doc_freq(doc_index, word_bag_list):
    """Compute the document frequency of every word in one document.

    The document frequency of a word is the number of word bags in
    ``word_bag_list`` that contain it at least once.

    Args:
        doc_index: Index into ``word_bag_list`` of the document whose words
            are examined (e.g. 0 for the first document).
        word_bag_list: A sequence holding the word bag of every document.

    Returns:
        A dict mapping each distinct word of the selected document to its
        document frequency, with keys in order of first appearance.

    Raises:
        IndexError: If ``doc_index`` is outside ``word_bag_list``.
    """
    # Build one set per document up front so each membership test is O(1)
    # instead of a linear scan of a list for every token.
    vocab_per_doc = [set(bag) for bag in word_bag_list]

    doc_freq = {}
    for word in word_bag_list[doc_index]:
        if word in doc_freq:
            # Repeated tokens would produce the same count; skip the rework.
            continue
        doc_freq[word] = sum(1 for vocab in vocab_per_doc if word in vocab)
    return doc_freq

In [9]:
# Collect the word bags of all documents; list position is the document index
# passed to count_doc_freq.
word_bag_list = [word_bag1, word_bag2, word_bag3]

In [10]:
# Document frequencies for the words of documents 0, 1 and 2, in order.
doc_freq1, doc_freq2, doc_freq3 = (
    count_doc_freq(idx, word_bag_list) for idx in (0, 1, 2)
)

## 5. document 내 단어의 weight 구하기 : TF-IDF 

In [11]:
def get_weights(term_freq, doc_freq, num_docs=3):
    """Compute TF-IDF weights for the words of one document.

    The weight of a word is ``tf * ln(num_docs / df)``, where ``tf`` is its
    count in the document and ``df`` is the number of documents containing it.

    Args:
        term_freq: Dict mapping each word to its count in the document.
        doc_freq: Dict mapping each word to its document frequency. It must
            contain every key of ``term_freq``.
        num_docs: Total number of documents in the corpus. Defaults to 3,
            the corpus size used in this notebook.

    Returns:
        A list of ``(word, weight)`` tuples sorted by weight in descending
        order; words with equal weight keep their ``term_freq`` order.

    Raises:
        KeyError: If a word in ``term_freq`` is missing from ``doc_freq``.
        ZeroDivisionError: If a document frequency is 0.
    """
    term_weight = {
        word: freq * math.log(num_docs / doc_freq[word])
        for word, freq in term_freq.items()
    }
    # Sort by weight, highest first (the sort is stable for ties).
    return sorted(term_weight.items(), key=lambda item: item[1], reverse=True)

In [12]:
# TF-IDF weights per document, pairing each term-frequency table with the
# matching document-frequency table.
term_weight1, term_weight2, term_weight3 = map(
    get_weights,
    (term_freq1, term_freq2, term_freq3),
    (doc_freq1, doc_freq2, doc_freq3),
)

## 6. document별 weight 상위 5개 단어들

In [13]:
# Document 1: (word, weight) pairs sorted by weight, highest first
term_weight1

[('Seoul', 6.591673732008658),
 ('Chungju', 3.295836866004329),
 ('campus', 3.295836866004329),
 ('Campus', 3.295836866004329),
 ('located', 2.1972245773362196),
 ('The', 2.1972245773362196),
 ('13', 2.1972245773362196),
 ('undergraduate', 2.1972245773362196),
 ('colleges', 2.1972245773362196),
 ('graduate', 2.1972245773362196),
 ('schools', 2.1972245773362196),
 ('students', 2.1972245773362196),
 ('attending', 2.1972245773362196),
 ('in', 1.6218604324326575),
 ('Konkuk', 1.0986122886681098),
 ('University', 1.0986122886681098),
 ('one', 1.0986122886681098),
 ('comprehensive', 1.0986122886681098),
 ('private', 1.0986122886681098),
 ('universities', 1.0986122886681098),
 ('southeastern', 1.0986122886681098),
 ('part', 1.0986122886681098),
 ('near', 1.0986122886681098),
 ('Han', 1.0986122886681098),
 ('River', 1.0986122886681098),
 ('served', 1.0986122886681098),
 ('metro', 1.0986122886681098),
 ('station', 1.0986122886681098),
 ('same', 1.0986122886681098),
 ('name', 1.0986122886681098)

### 문서 1. 
- Seoul
- Chungju
- campus
- Campus
- located  

In [14]:
# Document 2: (word, weight) pairs sorted by weight, highest first
term_weight2

[('engineering', 8.788898309344878),
 ('software', 4.394449154672439),
 ('Computer', 3.295836866004329),
 ('computer', 3.295836866004329),
 ('electronic', 3.295836866004329),
 ('design', 3.295836866004329),
 ('hardware', 2.1972245773362196),
 ('engineers', 2.1972245773362196),
 ('only', 2.1972245773362196),
 ('how', 2.1972245773362196),
 ('CpE', 1.0986122886681098),
 ('branch', 1.0986122886681098),
 ('integrates', 1.0986122886681098),
 ('several', 1.0986122886681098),
 ('fields', 1.0986122886681098),
 ('science', 1.0986122886681098),
 ('required', 1.0986122886681098),
 ('develop', 1.0986122886681098),
 ('usually', 1.0986122886681098),
 ('have', 1.0986122886681098),
 ('training', 1.0986122886681098),
 ('electrical', 1.0986122886681098),
 ('hardwaresoftware', 1.0986122886681098),
 ('integration', 1.0986122886681098),
 ('instead', 1.0986122886681098),
 ('involved', 1.0986122886681098),
 ('many', 1.0986122886681098),
 ('aspects', 1.0986122886681098),
 ('computing', 1.0986122886681098),
 ('

### 문서 2.
- engineering
- software
- Computer
- computer
- electronic

In [15]:
# Document 3: (word, weight) pairs sorted by weight, highest first
term_weight3

[('intelligence', 5.493061443340549),
 ('its', 3.295836866004329),
 ('AI', 2.1972245773362196),
 ('machines', 2.1972245773362196),
 ('humans', 2.1972245773362196),
 ('as', 2.1972245773362196),
 ('that', 1.6218604324326575),
 ('Artificial', 1.0986122886681098),
 ('sometimes', 1.0986122886681098),
 ('called', 1.0986122886681098),
 ('machine', 1.0986122886681098),
 ('demonstrated', 1.0986122886681098),
 ('unlike', 1.0986122886681098),
 ('natural', 1.0986122886681098),
 ('displayed', 1.0986122886681098),
 ('animals', 1.0986122886681098),
 ('Leading', 1.0986122886681098),
 ('textbooks', 1.0986122886681098),
 ('define', 1.0986122886681098),
 ('study', 1.0986122886681098),
 ('intelligent', 1.0986122886681098),
 ('agents', 1.0986122886681098),
 ('any', 1.0986122886681098),
 ('device', 1.0986122886681098),
 ('perceives', 1.0986122886681098),
 ('environment', 1.0986122886681098),
 ('takes', 1.0986122886681098),
 ('actions', 1.0986122886681098),
 ('maximize', 1.0986122886681098),
 ('chance', 1.09

### 문서 3. 
- intelligence
- its
- AI
- machines
- humans