In [1]:
import os

In [2]:
# Change the working directory so the relative "news_data" path used by the
# driver cell resolves against the data folder.
# NOTE(review): this is an absolute, machine-specific Windows path — it must be
# adjusted before the notebook can run on another machine.
os.chdir(r"C:\ML\Python for ML\data")

#List the names of the files in a folder
def get_file_list(dir_name):
    """Return the names of the entries in ``dir_name`` in sorted order.

    ``os.listdir`` yields entries in arbitrary, platform-dependent order.
    Sorting makes every downstream step that depends on document position
    (for example "the first N documents") reproducible across machines.

    Args:
        dir_name: path of the directory to list.

    Returns:
        A list of entry names (not full paths), sorted lexicographically.

    Raises:
        OSError: if ``dir_name`` cannot be listed.
    """
    return sorted(os.listdir(dir_name))

#Read file contents and assign a category label to each file
def get_contents(file_list):
    """Read every file in ``file_list`` and derive its class label from its name.

    The label comes from the integer prefix of the file's base name (the part
    before the first ``_``): prefixes 1-3 map to "0" (baseball articles) and
    prefixes 4-6 map to "1" (soccer articles).

    Files are decoded as cp949. A file that fails to decode is reported on
    stdout and skipped entirely, so the two returned lists always have the
    same length and stay index-aligned.

    Args:
        file_list: iterable of file paths.

    Returns:
        A tuple ``(X_text, y_class)``: the list of file contents and the
        parallel list of labels.

    Raises:
        ValueError: if a file-name prefix is not an integer.
        KeyError: if a prefix is not one of 1-6.
        OSError: if a file cannot be opened.
    """
    y_class = []
    X_text = []

    # Baseball articles = "0", soccer articles = "1"
    class_dict = {
        1: "0", 2: "0", 3: "0", 4: "1", 5: "1", 6: "1"}

    for file_name in file_list:
        try:
            # The context manager guarantees the handle is closed, even when
            # decoding fails part-way through the read.
            with open(file_name, "r", encoding="cp949") as f:
                # basename works for any directory depth, not only for
                # paths of the form "<dir>/<file>".
                category = int(os.path.basename(file_name).split("_")[0])
                label = class_dict[category]
                text = f.read()
        except UnicodeDecodeError as e:
            print(e)
            print(file_name)
            continue
        # Append both values only after the read succeeded, so a decode
        # failure can never leave a label without its matching text.
        y_class.append(label)
        X_text.append(text)
    return X_text, y_class

#Strip meaningless punctuation and similar characters
def get_cleaned_word(word):
    """Lower-case ``word`` and remove every non-word character from it.

    "Non-word" is whatever the regex class ``\\W`` matches, so letters,
    digits and underscores are kept. The result may be an empty string when
    ``word`` contains no word characters at all.

    Args:
        word: the token to normalise.

    Returns:
        The normalised token.
    """
    import re
    # Raw string: "\W" in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r"\W", "", word.lower())

In [3]:
#Build the corpus dictionary
def get_corpus_dict(text):
    """Map every distinct cleaned word in ``text`` to a unique integer index.

    Each document is split on whitespace and every token is normalised with
    ``get_cleaned_word``. Indices run from 0 to ``len(result) - 1`` and are
    assigned in order of first appearance, so the mapping is identical on
    every run (iterating a ``set`` of strings is not).

    A token that cleans to the empty string is kept as the key ``""``, so any
    token from ``text`` can later be looked up in the result.

    Args:
        text: iterable of document strings.

    Returns:
        An ``OrderedDict`` mapping cleaned word -> index.
    """
    from collections import OrderedDict

    corpus_dict = OrderedDict()
    for sentence in text:
        for word in sentence.split():
            cleaned = get_cleaned_word(word)
            if cleaned not in corpus_dict:
                # The next free index is the current size of the dictionary.
                corpus_dict[cleaned] = len(corpus_dict)
    return corpus_dict

In [4]:
# Vectorisation
def get_count_vector(text,corpus):
    """Turn each document in ``text`` into a list of word counts.

    Every document is split on whitespace, each token is normalised with
    ``get_cleaned_word`` and looked up in ``corpus`` to find its column.

    Args:
        text: iterable of document strings.
        corpus: mapping of cleaned word -> column index.

    Returns:
        One list per document, each of length ``len(corpus)``, where position
        ``k`` holds how many tokens of that document map to index ``k``.

    Raises:
        KeyError: if a cleaned token is not a key of ``corpus``.
    """
    vocabulary_size = len(corpus)
    X_vector = []
    for sentence in text:
        counts = [0] * vocabulary_size
        for token in sentence.split():
            counts[corpus[get_cleaned_word(token)]] += 1
        X_vector.append(counts)
    return X_vector
                

In [5]:
import math

In [18]:
#Cosine similarity
def get_cosine_similarity(v1,v2):
    """Return the cosine similarity (v1 . v2) / (||v1|| * ||v2||).

    Iterates over the indices of ``v1``, so ``v2`` must be at least as long
    as ``v1``; any extra trailing elements of ``v2`` are ignored.

    Args:
        v1: sequence of numbers.
        v2: sequence of numbers.

    Returns:
        The cosine similarity, or 0.0 when either vector has zero magnitude
        (for example an empty document), where the ratio is undefined.

    Raises:
        IndexError: if ``v2`` is shorter than ``v1``.
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]
        y = v2[i]
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y
    norm_product = math.sqrt(sumxx * sumyy)
    if norm_product == 0:
        # A zero vector has no direction; report "no similarity" instead of
        # raising ZeroDivisionError.
        return 0.0
    return sumxy / norm_product

# Cosine similarity scoring
def get_similarity_score(X_vector, source):
    """Score every vector in ``X_vector`` against the one at index ``source``.

    Args:
        X_vector: sequence of vectors.
        source: index of the reference vector within ``X_vector``.

    Returns:
        A list with one ``get_cosine_similarity`` result per vector, in the
        same order as ``X_vector``. The reference vector is scored against
        itself too.
    """
    reference = X_vector[source]
    return [get_cosine_similarity(reference, candidate) for candidate in X_vector]

# Select the most similar news items
def get_top_n_similarity_news(similarity_score,n):
    """Return up to ``n`` ``[index, score]`` pairs ranked by descending score.

    Pairs are ordered by score first and, for equal scores, by descending
    index. The highest-ranked pair is always dropped before the ``n`` pairs
    are taken.

    NOTE(review): the dropped pair is presumably meant to be the source
    document matching itself; that only holds when no other document ties
    with it at a higher index — confirm against the caller.

    Args:
        similarity_score: iterable of scores, one per document.
        n: maximum number of pairs to return.

    Returns:
        A list of ``[index, score]`` lists.
    """
    ranked = sorted((score, index) for index, score in enumerate(similarity_score))
    ranked.reverse()
    # Skip rank 0, then keep the next n entries.
    return [[index, score] for score, index in ranked[1:n + 1]]

# Accuracy measurement
def get_accuracy(similarity_list, y_class, source_news):
    """Return the fraction of entries whose class equals the source's class.

    Args:
        similarity_list: sequence of items whose first element is a document
            index into ``y_class``.
        y_class: sequence of class labels, one per document.
        source_news: index of the source document.

    Returns:
        Number of matching entries divided by ``len(similarity_list)``.

    Raises:
        ZeroDivisionError: if ``similarity_list`` is empty.
    """
    expected_label = y_class[source_news]
    matches = 0
    for neighbour in similarity_list:
        # A boolean adds as 0 or 1.
        matches += (expected_label == y_class[neighbour[0]])
    return matches / len(similarity_list)


In [21]:
if __name__ == "__main__":
    # Evaluation settings, named once so the loop bound and the averaging
    # denominator cannot drift apart.
    N_SOURCES = 30  # at most this many documents are used as the query
    TOP_N = 3       # neighbours retrieved for each query document

    dir_name = "news_data"
    file_list = [os.path.join(dir_name, file_name)
                 for file_name in get_file_list(dir_name)]

    X_text, y_class = get_contents(file_list)

    corpus = get_corpus_dict(X_text)
    print("Number of words : {0}".format(len(corpus)))
    X_vector = get_count_vector(X_text, corpus)

    result = []

    # Never index past the documents that were actually loaded.
    for source_number in range(min(N_SOURCES, len(X_vector))):
        similarity_score = get_similarity_score(X_vector, source_number)
        similarity_news = get_top_n_similarity_news(similarity_score, TOP_N)
        accuracy_score = get_accuracy(similarity_news, y_class, source_number)
        result.append(accuracy_score)

    # Average over the sources that were really evaluated.
    if result:
        print(sum(result) / len(result))
    else:
        print("No documents were loaded from {0}".format(dir_name))
        
    
    
    

Number of words : 2211
0.711111111111111
