## Linear algebra codes

### Vector의 계산

In [1]:
u = [2,2]
v = [2,3]
z = [3,5]

result = [sum(t) for t in zip(u,v,z)]
print(result)

[7, 10]


### Vector의 계산: Scalar-Vector product

In [3]:
u = [1,2,3]
v = [4,4,4]
alpha = 2

result = [alpha*sum(t) for t in zip(u,v)]
print(result)

[10, 12, 14]


### Matrix representation of python  
- Matrix 역시 python으로 표시하는 다양한 방법이 존재  

In [4]:
matrix_a = [[3,6],[4,5]]
matrix_b = [(3,6),(4,5)]
matrix_c = {(0,0):3, (0,1):6, (1,0):4, (1,1):5}

### Matrix의 계산: Matrix addition

In [1]:
matrix_a = [[3,6],[4,5]]
matrix_b = [[5,8],[6,7]]
result = [[sum(row) for row in zip(*t)] for t in zip(matrix_a,matrix_b)]
print(result)

[[8, 14], [10, 12]]


### Matrix의 계산: Scalar-Matrix Product

In [2]:
matrix_a = [[3,6],[4,5]]
alpha = 4
result = [[alpha * element for element in t] for t in matrix_a]
print(result)

[[12, 24], [16, 20]]


### Matrix의 계산: Matrix Transpose

In [3]:
matrix_a = [[1,2,3],[4,5,6]]
result = [[element for element in t] for t in zip(*matrix_a)]
print(result)

[[1, 4], [2, 5], [3, 6]]


### Matrix의 계산: Matrix Product

In [6]:
matrix_a = [[1,1,2],[2,1,1]]
matrix_b = [[1,1],[2,1],[1,3]]
result = [[sum(a*b for a,b in zip(row_a,column_b)) \
          for column_b in zip(*matrix_b)] for row_a in matrix_a]
print(result)

[[5, 8], [5, 6]]


## Case Study - News Categorization - 1

- 문자를 Vector로 - One-hot Encoding  
    - 하나의 단어를 Vector의 Index로 인식, 단어 존재시 1 없으면 0  
- Bag of words  
    - 단어별로 인덱스를 부여해서, 한 문장의 단어의 개수를 Vector로 표현  
- 유사성  
- Euclidian distance  
    - 피타고라스 정리, 두 점 사이의 직선의 거리  
- Cosine distance  
    - 두 점 사이의 각도  
    - cosine distance를 더 많이 사용, Count < Direction

- Data set
    - 축구와 야구 선수들의 영문 기사를 분류

- Process  
    - 파일을 불러오기  
    - 파일을 읽어서 단어사전(corpus) 만들기  
    - 단어별로 Index 만들기  
    - 만들어진 인덱스로 문서별로 Bag of words vetor 생성  
    - 비교하고자 하는 문서 비교하기  
    - 얼마나 맞는지 측정하기

### 파일 불러오기

In [2]:
import os

def get_file_list(dir_name):
    return os.listdir(dir_name)

if __name__ == "__main__":
    dir_name = "news_data"
    file_list = get_file_list(dir_name)
    file_list = [os.path.join(dir_name, file_name) for file_name in file_list]

In [7]:
get_file_list("news_data")

['1_Dae-Ho Lee walk-off homer gives Mariners 4-2 win over Rangers.txt',
 '1_Korean First Baseman Dae-Ho Lee Becomes Free Agent, Interested In MLB Deal.txt',
 '1_Lee Dae-ho Announces MLB Aspirations.txt',
 '1_Lee Dae-ho to Start Spring Training in Arizona.txt',
 '1_Lee Dae-ho wins MVP in Japan Series.txt',
 "1_Mariners' Lee Dae-ho belts a walk-off homer.txt",
 '1_Mariners’ Lee Dae-ho gets 1st two-hit game, double.txt',
 '1_MLB Team Interested In Dae-Ho Lee.txt',
 "1_Seattle Mariners' Newest Signing Dae-Ho Lee Could Become Fan Favorite.txt",
 '1_SoftBank Hawks Hope to Renew Contract with Lee Dae-ho.txt',
 '2_Dodgers left with questions after latest Hyun-Jin Ryu setback.txt',
 '2_Dodgers left-hander Hyun-Jin Ryu expects to be ready for start of season.txt',
 '2_Dodgers unsure when Hyun-Jin Ryu will throw another bullpen session.txt',
 '2_Dodgers will take it slowly with pitcher Hyun-Jin Ryu, whose health could be a key to their season.txt',
 '2_Hyun-Jin Ryu downplays long break between bu

In [10]:
print(file_list)



### 파일별로 내용읽기

In [29]:
def get_contents(file_list):
    y_class = []
    X_text = []
    class_dict = {1:"0", 2:"0", 3:"0", 4:"0", 5:"1", 6:"1", 7:"1", 8:"1"}
    
    for file_name in file_list:
        try:
            f = open(file_name, "r", encoding="cp949")
            category = int(file_name.split(os.sep)[1].split("_")[0])
            y_class.append(class_dict[category])
            X_text.append(f.read())
            f.close()
        except UnicodeDecodeError as e:
            print(e)
            print(file_name)
    return X_text, y_class
            

In [30]:
X_text, y_class = get_contents(file_list)
print(X_text)



In [15]:
print(y_class)

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']


### Corpus 만들기 + 단어별 index 생성하기

In [24]:
def get_cleaned_text(text):
    import re
    text = re.sub('\W+','',text.lower())
    return text

def get_corpus_dict(text):
    text = [sentence.split() for sentence in text]
    cleaned_words = [get_cleaned_text(word) for words in text for word in words]
    
    from collections import OrderedDict
    corpus_dict = OrderedDict()
    for i, v in enumerate(set(cleaned_words)):
        corpus_dict[v] = i
    return corpus_dict

In [26]:
get_cleaned_text("I'm")

'im'

In [33]:
corpus = get_corpus_dict(X_text)
print("Number of words: {0}".format(len(corpus)))

Number of words: 4024


In [34]:
print(corpus)



### 문서별로 Bag of words vector 생성

In [38]:
def get_count_vector(text, corpus):
    text = [sentence.split() for sentence in text]
    word_number_list = [[corpus[get_cleaned_text(word)] for word in words] for words in text]
    X_vector = [[0 for _ in range(len(corpus))] for x in range(len(text))]  #80 X 4024 0인 matrix 생성
    
    for i, text in enumerate(word_number_list):
        for word_number in text:
            X_vector[i][word_number] += 1
    return X_vector

In [40]:
X_vector = get_count_vector(X_text, corpus)
print(X_vector[0])

[0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### 비교하기

In [41]:
import math
def get_cosine_similarity(v1,v2):
    "compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||}"
    sumxx,sumxy,sumyy = 0,0,0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/math.sqrt(sumxx*sumyy)

### 비교결과 정리하기

In [43]:
def get_similarity_score(X_vector, source):
    source_vector = X_vector[source]
    similarity_list = []
    for target_vector in X_vector:
        similarity_list.append(
            get_cosine_similarity(source_vector, target_vector))
    return similarity_list

In [42]:
def get_top_n_similarity_news(similarity_score, n):
    import operator
    x = {i:v for i, v in enumerate(similarity_score)}
    sorted_x = sorted(x.items(), key=operator.itemgetter(1))

    return list(reversed(sorted_x))[1:n+1]

In [45]:
def get_accuracy(similarity_list, y_class, source_news):
    source_class = y_class[source_news]

    return sum([source_class == y_class[i[0]] for i in similarity_list]) / len(similarity_list)


In [48]:
source_number = 10
result = []

In [51]:
for i in range(80):
        source_number = i

        similarity_score = get_similarity_score(X_vector, source_number)
        similarity_news = get_top_n_similarity_news(similarity_score, 10)
        accuracy_score = get_accuracy(similarity_news, y_class, source_number)
        result.append(accuracy_score)
print(sum(result) / 80)

0.6950000000000001


## Case Study - News Categorization - 2  
- News Categorization using sklearn  
    - python 과 scikit-learn을 함께 사용하여 News Categorization을 수행하는 코드

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
vectorizer = CountVectorizer()
corpus = [
    'This is the first document.',
    'This is the second second document',
    'And the third one.',
    'Is this the first document?'
]

In [61]:
X = vectorizer.fit_transform(corpus)

In [62]:
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [63]:
vectorizer.get_feature_names()

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']